In [80]:
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer
from scipy.special import expit
import matplotlib.pyplot as plt

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
r = requests.get(url, allow_redirects=True)
open('iris.txt','wb').write(r.content)

4551

In [4]:
header = ['sepal_length','sepal_width','petal_length','petal_width','names']
df = pd.read_csv('iris.txt',names = header,index_col =False)
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,names
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [5]:
X = df.iloc[:,:-1]
X.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [6]:
y = (df.iloc[:,-1:])
y.head(3)

Unnamed: 0,names
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa


In [104]:
X_train, X_test, y_train, y_test = train_test_split( X,y, test_size = 0.2, random_state = 0)
lr = LogisticRegression( solver= 'lbfgs', max_iter=100)

lr.fit(X_train, np.ravel(y_train))
y_pred = lr.predict(X_test)
accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1.0

##### Now lets look into the warning and and accuracy_score and think about what happened.

##### Problem: The non-convergence meaning that the model failed to make the result consistent across the iterations. resulting in issue with 'local-optimal'

##### Solution: To solve the issue, sklearn was smart enough to suggest to either increase the number of iterations or scale the data. While both are meant to resolve the issues, scaling the data is the aspect of parameter tuning but increasing max_iteration is the aspect of hyperparameter tuning and the option available to some aspect might be limited (eg. could rescaling dataset with variable such as money or time affects its meaning toward the whole research?).

##### The following analysis rescale the data and feed in the data to the same model setting to see if the issue of convergence resolves.

In [94]:
df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,names
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


##### now when scaling with dataset, there is a very important aspect to aware of. Never scale the entire dataset included the test data. This is a way of leaking information about the test dataset into the train dataset. Instead, scale the train data and transform the test data based on the train data scales.

In [106]:
# Data Transformation
norm = Normalizer()
X_train_scaled = norm.fit_transform(X_train)
X_test_scaled = norm.transform(X_test)

In [107]:
X_train_scaled.shape
y_train.shape

(120, 1)

In [112]:
#Rerunning the model
lr1 = LogisticRegression(max_iter=100)
lr1.fit(X_train_scaled,np.ravel(y_train))
y_pred1 = lr1.predict(X_test_scaled)
accuracy_score(y_test,y_pred1)

0.5666666666666667

#### Now that is better as the model converged within the 100 iterations, meaning that scaling the data not only resolved the issue of convergence, but saves resources. lets plot a log likelihood to see what happened throughout the iterations of both model. With some minor improvement in the hyperparameter tuning, including a L2 regularizing of C=10 will increase the quality of the model significantly by preventing overfitting.

In [181]:
#Rerunning the model 2 with C=10
lr1 = LogisticRegression(max_iter=100, C=10)
lr1.fit(X_train_scaled,np.ravel(y_train))
y_pred1 = lr1.predict(X_test_scaled)
accuracy_score(y_test,y_pred1)

0.9333333333333333

In [183]:
#Rerunning the model 3 with C=100
lr2 = LogisticRegression(max_iter=100, C=100)
lr2.fit(X_train_scaled,np.ravel(y_train))
y_pred2 = lr2.predict(X_test_scaled)
accuracy_score(y_test,y_pred2)

1.0