## Logistic Regression Exercise

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
#1) Load iris datasets from 'iris-data-clean.csv'
df = pd.read_csv('iris-data-clean.csv', header =0)
df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
140,6.7,3.0,5.2,2.3,Virginica
141,6.3,2.5,5.0,1.9,Virginica
142,6.5,3.0,5.2,2.0,Virginica
143,6.2,3.4,5.4,2.3,Virginica


In [9]:
#Replace the values in the columns 'Class' as follows:
#"Setosa" = 0
#"Virginica" = 1
#"Versicolor" = 2
df.replace('Setosa', 0, inplace=True) 
#inplace=True - updates the df directly. 
#if inplace = false, a copy is returned and you need to save it somewhere
df.replace('Virginica', 1, inplace=True)
df.replace('Versicolor', 2, inplace=True)
df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
140,6.7,3.0,5.2,2.3,1
141,6.3,2.5,5.0,1.9,1
142,6.5,3.0,5.2,2.0,1
143,6.2,3.4,5.4,2.3,1


In [10]:
#2) Using Logistic Regression, classify the outcome (Column : 'Class') based on the 
#labels (Columns :'sepal length /cm', 'sepal width /cm', 'petal length /cm', 'petal width /cm')

#NOTE: classification means trying to predict which class an entity belongs to based on its features
#This qn does multiclass (aka multinomial) classification (i.e. 3 or more classes of outputs to choose from)

logReg = LogisticRegression(solver = 'lbfgs') #instantiate solver method called 'lbfgs' so you can use it later

x = df[['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']]
y = df['class'] #Classification : class = 0, 1 or 2

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)
#if you specify random state, the train data and test data will always be the same for the same set (no. that you set; can be any no.)
x_train.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
121,6.2,2.8,4.8,1.8
106,6.4,2.7,5.3,1.9
123,6.4,2.8,5.6,2.1
109,5.8,2.8,5.1,2.4
59,5.6,2.9,3.6,1.3


In [23]:
logReg.fit(x_train, y_train)

LogisticRegression()

In [21]:
#2a) Provide some values to predict the outcome
print(logReg.predict([[6.1,2.9,4.8,1.7]])) #testing using values 6.1,2.9,4.8,1.7

[2]


In [13]:
#2a) Provide some values to predict the outcome
print(logReg.predict([[6,3,5,2]])) #testing using values 6,3,5,2

[1]


In [22]:
#2b) Validate the model - print the confusion matrix and the accuracy score
from sklearn.metrics import accuracy_score

y_pred = logReg.predict(x_test)
#print(y_test)
print(y_pred)

[0 0 0 2 2 1 2 0 1 0 0 0 1 2 1 0 0 1 1 0 1 1 1 2 2 0 2 2 1 1 1 0 0 2 0 2 0]


In [15]:
#accuracy score
accuracy_score(y_test, y_pred)

0.972972972972973

In [16]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix (y_test, y_pred, labels = [2,1,0])
#'labels' = classification labels used

array([[10,  1,  0],
       [ 0, 11,  0],
       [ 0,  0, 15]], dtype=int64)

In [17]:
# probability of getting 0 vs 1 vs 2 for each row of X values
logReg.predict_proba(x_test)

array([[9.59754663e-01, 1.32083210e-07, 4.02452053e-02],
       [9.56590904e-01, 3.33257970e-07, 4.34087630e-02],
       [9.82520406e-01, 4.43837120e-08, 1.74795496e-02],
       [2.00106281e-02, 3.43072476e-02, 9.45682124e-01],
       [7.15973660e-03, 2.01334551e-01, 7.91505712e-01],
       [6.79164971e-05, 8.97229962e-01, 1.02702121e-01],
       [1.94379930e-02, 2.51974766e-02, 9.55364530e-01],
       [9.62263597e-01, 1.42348344e-07, 3.77362608e-02],
       [1.16958073e-04, 8.48959243e-01, 1.50923799e-01],
       [9.48220965e-01, 2.98082237e-07, 5.17787373e-02],
       [9.59017209e-01, 2.34867479e-07, 4.09825565e-02],
       [9.66777041e-01, 1.16469665e-07, 3.32228423e-02],
       [1.22672706e-04, 7.79854649e-01, 2.20022678e-01],
       [7.09244442e-03, 7.58098783e-02, 9.17097677e-01],
       [2.04917899e-04, 8.48832242e-01, 1.50962840e-01],
       [9.81958926e-01, 7.84681528e-08, 1.80409954e-02],
       [9.72642962e-01, 1.87191417e-07, 2.73568512e-02],
       [1.95199215e-05, 9.61736