### Multiclass Classification Problem

In [1]:
import os
%matplotlib inlineaa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

### Iris dataset

In [116]:
# Load the iris measurements as a DataFrame through seaborn. `sns` is already
# imported in the first cell, so it is not re-imported here.
# NOTE: seaborn's loader may need network access the first time it is used.
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [117]:
# Per-column count of missing values.
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [118]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [119]:
# Number of rows per species label (class-balance check).
df['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [121]:
from sklearn.preprocessing import LabelEncoder

# Replace the string species labels with integer codes. The fitted encoder is
# kept so that codes can be mapped back with `encoder.inverse_transform`.
# NOTE(review): this overwrites `species` in place; re-running the cell would
# refit the encoder on the integer codes rather than on the original names.
encoder = LabelEncoder()
encoder.fit(df['species'])
df['species'] = encoder.transform(df['species'])

In [122]:
# Preview the frame after the species column has been label-encoded.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [123]:
# Row counts per class again, now keyed by the encoded species values.
df['species'].value_counts()

0    50
1    50
2    50
Name: species, dtype: int64

In [124]:
# List the column names available in the frame.
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [125]:
# Restrict the frame to two measurement columns plus the target column.
df = df.loc[:, ['sepal_length', 'petal_length', 'species']]

In [126]:
# Preview the reduced frame.
df.head()

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,0
1,4.9,1.4,0
2,4.7,1.3,0
3,4.6,1.5,0
4,5.0,1.4,0


In [127]:
# Features are every column except the last; the target is the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [128]:
# Preview the feature columns.
x.head()

Unnamed: 0,sepal_length,petal_length
0,5.1,1.4
1,4.9,1.4
2,4.7,1.3
3,4.6,1.5
4,5.0,1.4


In [129]:
# Preview the target column.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int32

In [130]:
# Hold out 20% of the rows as a test set; `random_state` pins the shuffle so
# the split is reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

### Building a logistic regression model for multiclass classification
#### One-vs-Rest (OvR), also called One-vs-All (OvA)

In [132]:
# One-vs-rest logistic regression.
# `multi_class` accepts one of {'auto', 'ovr', 'multinomial'}; 'ovr' is used here.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` -- confirm
# against the installed version before upgrading.
logit_ovr = LogisticRegression(multi_class='ovr').fit(x_train, y_train)
logit_ovr

LogisticRegression(multi_class='ovr')

In [133]:
# Predicted class labels for both splits, reused by the evaluation cells.
y_pred_ovr_train, y_pred_ovr_test = (
    logit_ovr.predict(x_train),
    logit_ovr.predict(x_test),
)

In [134]:
# Confusion matrices for the one-vs-rest model: train split first, then the
# test split, separated by a blank line.
print(
    confusion_matrix(y_train, y_pred_ovr_train),
    confusion_matrix(y_test, y_pred_ovr_test),
    sep='\n\n',
)

[[39  0  0]
 [ 0 29  8]
 [ 0  1 43]]

[[11  0  0]
 [ 1  9  3]
 [ 0  0  6]]


In [135]:
# Precision / recall / F1 per class for the one-vs-rest model: train split
# first, then the test split.
print(
    classification_report(y_train, y_pred_ovr_train),
    classification_report(y_test, y_pred_ovr_test),
    sep='\n\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       0.97      0.78      0.87        37
           2       0.84      0.98      0.91        44

    accuracy                           0.93       120
   macro avg       0.94      0.92      0.92       120
weighted avg       0.93      0.93      0.92       120


              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.69      0.82        13
           2       0.67      1.00      0.80         6

    accuracy                           0.87        30
   macro avg       0.86      0.90      0.86        30
weighted avg       0.90      0.87      0.87        30



In [136]:
# Overall accuracy of the one-vs-rest model: train split first, then test.
print(
    accuracy_score(y_train, y_pred_ovr_train),
    accuracy_score(y_test, y_pred_ovr_test),
    sep='\n\n',
)

0.925

0.8666666666666667


In [137]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the one-vs-rest model, computed
# separately on the training split and on the test split.
# NOTE(review): `cross_val_score` refits a clone of the estimator on each
# fold, so the "test" numbers come from models trained on parts of the small
# test split, not from `logit_ovr` as fitted above. Consider cross-validating
# on the training data only and scoring the fitted model once on the test set.
training_accuracy = cross_val_score(logit_ovr, x_train, y_train, cv=10)
test_accuracy = cross_val_score(logit_ovr, x_test, y_test, cv=10)

# Each summary line is followed by one blank line, except the last.
print("Train Accuracy", training_accuracy, end="\n\n")
print("Train Mean Accuracy", training_accuracy.mean(), end="\n\n")
print("Train Max Accuracy", training_accuracy.max(), end="\n\n")
print("Test Accuracy", test_accuracy, end="\n\n")
print("Test Mean Accuracy", test_accuracy.mean(), end="\n\n")
print("Test Max Accuracy", test_accuracy.max())

Train Accuracy [0.91666667 1.         1.         0.91666667 1.         0.75
 1.         0.83333333 0.83333333 0.91666667]

Train Mean Accuracy 0.9166666666666666

Train Max Accuracy 1.0

Test Accuracy [1.         1.         1.         1.         1.         0.66666667
 1.         1.         1.         0.66666667]

Test Mean Accuracy 0.9333333333333332

Test Max Accuracy 1.0


### Multinomial (softmax) logistic regression

In [138]:
# Multinomial logistic regression.
# `multi_class` accepts one of {'auto', 'ovr', 'multinomial'}; 'multinomial'
# is used here.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` -- confirm
# against the installed version before upgrading.
logit_multi = LogisticRegression(multi_class='multinomial').fit(x_train, y_train)
logit_multi

LogisticRegression(multi_class='multinomial')

In [139]:
# Predicted class labels for both splits, reused by the evaluation cells.
y_pred_multi_train, y_pred_multi_test = (
    logit_multi.predict(x_train),
    logit_multi.predict(x_test),
)

In [140]:
# Confusion matrices for the multinomial model: train split first, then the
# test split, separated by a blank line.
print(
    confusion_matrix(y_train, y_pred_multi_train),
    confusion_matrix(y_test, y_pred_multi_test),
    sep='\n\n',
)

[[39  0  0]
 [ 0 34  3]
 [ 0  2 42]]

[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]


In [141]:
# Precision / recall / F1 per class for the multinomial model: train split
# first, then the test split.
print(
    classification_report(y_train, y_pred_multi_train),
    classification_report(y_test, y_pred_multi_test),
    sep='\n\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       0.94      0.92      0.93        37
           2       0.93      0.95      0.94        44

    accuracy                           0.96       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.96      0.96      0.96       120


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.92      0.96        13
           2       0.86      1.00      0.92         6

    accuracy                           0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30

