# Classification

In [118]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report, \
                            accuracy_score, f1_score

In [140]:
# Define function for evaluation
def _add_axis_titles(frame):
    """Nest the row labels under 'True' and the column labels under 'Predicted'."""
    frame = pd.concat([frame], keys=['True'], names=[''], axis = 0)
    return pd.concat([frame], keys=['Predicted'], names=[''], axis = 1)


def evaluate(y_te, pred, labels, return_data = False):
    """Print evaluation metrics for a classifier and optionally return them.

    Displays the absolute confusion matrix, the relative confusion matrix
    (each cell as a percentage of all samples), the scikit-learn
    classification report, the overall accuracy, and the share of the most
    frequent true class (the accuracy a majority-class baseline would reach).

    Parameters
    ----------
    y_te : array-like
        True class labels.
    pred : array-like
        Predicted class labels, in the same order as ``y_te``.
    labels : sequence
        Class labels, in the order in which they should appear as rows and
        columns of the confusion matrices.
    return_data : bool, default False
        If truthy, return the computed objects instead of only printing them.

    Returns
    -------
    dict or None
        When ``return_data`` is truthy, a dict with the keys ``'cm_rel'``,
        ``'cm_abs'``, ``'class_report'``, ``'acc'`` and ``'f1'``; otherwise
        ``None``. ``'f1'`` is the F1 score of ``labels[0]`` when exactly two
        labels are given, and the macro-averaged F1 over ``labels`` otherwise.
    """
    labels = list(labels)

    # `labels` must be passed by keyword: current scikit-learn releases
    # declare it keyword-only and reject a third positional argument.
    cm = confusion_matrix(y_te, pred, labels = labels)

    print("\nConfusion matrix (absolute):")
    cm_abs = _add_axis_titles(pd.DataFrame(data = cm, index = labels, columns = labels))
    display(cm_abs)

    print("\nConfusion matrix (relative):")
    # Normalise by the grand total, so all cells together sum to 100 %.
    cm_rel = _add_axis_titles(pd.DataFrame(data = cm / cm.sum(), index = labels, columns = labels))
    cm_rel = round(cm_rel * 100, 2)
    display(cm_rel)

    print("\nClassification report:\n")
    class_report = classification_report(y_true = y_te, y_pred = pred)
    print(class_report)

    acc = round(accuracy_score(y_true = y_te, y_pred = pred), 4)
    print("Overall accuracy: {}".format(acc))

    # Wrap in a Series so plain lists / arrays work as well as pandas Series.
    y_true = pd.Series(y_te)
    majority_class = y_true.value_counts().idxmax()
    majority_share = round(np.mean(y_true == majority_class), 4)
    print("\nMajority class accuracy ({}): {}".format(majority_class, majority_share))

    if return_data:
        # Computed only on request so the printed report path is unaffected.
        if len(labels) == 2:
            f1 = f1_score(y_true = y_te, y_pred = pred, pos_label = labels[0])
        else:
            f1 = f1_score(y_true = y_te, y_pred = pred, labels = labels, average = 'macro')
        f1 = round(f1, 4)
        return({'cm_rel': cm_rel, 'cm_abs': cm_abs, 'class_report': class_report, 'acc': acc, 'f1': f1})

In [141]:
# Import data: load the "tips" example dataset via seaborn
df = sns.load_dataset("tips")
# Preview the first rows
display(df.head())
# Number of rows per value of the "smoker" column (shown as the cell output)
df.groupby("smoker").size()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


smoker
Yes     93
No     151
dtype: int64

In [142]:
# One hot encoding: replace the categorical columns by indicator columns
df = pd.get_dummies(df)

# X, y split
# Target is the "smoker_Yes" indicator. It is cast to int explicitly because
# newer pandas versions return boolean dummy columns, while the evaluation
# calls further down use the integer class labels [1, 0].
y = df["smoker_Yes"].astype(int)
# Drop both smoker indicators so the target is not part of the features
X = df.drop(columns = ["smoker_Yes", "smoker_No"])

# Train, test split: hold out 20 % of the rows, fixed seed for reproducibility
from sklearn import model_selection
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X, y, test_size=0.2,
                                                          random_state=12345)

In [143]:
# TODO: Oversampling (not implemented yet)


### Logistic Regression

In [152]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Single-step pipeline: logistic regression with default settings.
# A StandardScaler step is deliberately left out (scaling not necessary).
steps = [
    ('LogisticRegression', LogisticRegression()),
]

# Fit on the training split, predict the test split, print the metrics.
model = Pipeline(steps = steps).fit(X_tr, y_tr)
pred = model.predict(X_te)
evaluate(y_te, pred, [1, 0])


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,9.0,10.0
True,0.0,7.0,23.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,18.37,20.41
True,0.0,14.29,46.94



Classification report:

              precision    recall  f1-score   support

           0       0.70      0.77      0.73        30
           1       0.56      0.47      0.51        19

    accuracy                           0.65        49
   macro avg       0.63      0.62      0.62        49
weighted avg       0.64      0.65      0.65        49

Overall accuracy: 0.6531

Majority class accuracy (0): 0.6122


### LDA

In [146]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Pipeline with linear discriminant analysis as its only step.
# No StandardScaler step is used (scaling not necessary).
steps = []
steps.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))

# Train on the training split and evaluate the test-split predictions.
model = Pipeline(steps)
pred = model.fit(X_tr, y_tr).predict(X_te)
evaluate(y_te, pred, labels = [1, 0])


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,8.0,11.0
True,0.0,7.0,23.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,16.33,22.45
True,0.0,14.29,46.94



Classification report:

              precision    recall  f1-score   support

           0       0.68      0.77      0.72        30
           1       0.53      0.42      0.47        19

    accuracy                           0.63        49
   macro avg       0.60      0.59      0.59        49
weighted avg       0.62      0.63      0.62        49

Overall accuracy: 0.6327

Majority class accuracy (0): 0.6122


### SVM


In [151]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features first (scaling important), then fit a support
# vector classifier with default settings.
steps = []
steps.append(('StandardScaler', StandardScaler()))
steps.append(('SVC', SVC()))

# Train on the training split and evaluate the test-split predictions.
model = Pipeline(steps = steps)
pred = model.fit(X_tr, y_tr).predict(X_te)
evaluate(y_te, pred, [1, 0], return_data = False)


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,7.0,12.0
True,0.0,6.0,24.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,14.29,24.49
True,0.0,12.24,48.98



Classification report:

              precision    recall  f1-score   support

           0       0.67      0.80      0.73        30
           1       0.54      0.37      0.44        19

    accuracy                           0.63        49
   macro avg       0.60      0.58      0.58        49
weighted avg       0.62      0.63      0.61        49

Overall accuracy: 0.6327

Majority class accuracy (0): 0.6122


### Naive Bayes

In [155]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardise the features, then Gaussian naive Bayes.
steps = [
    ('StandardScaler', StandardScaler()),
    ('GaussianNB', GaussianNB()),
]

# Fit on the training split, predict the test split, print the metrics.
model = Pipeline(steps = steps).fit(X_tr, y_tr)
pred = model.predict(X_te)
evaluate(y_te, pred, labels = [1, 0])


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,7.0,12.0
True,0.0,5.0,25.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,14.29,24.49
True,0.0,10.2,51.02



Classification report:

              precision    recall  f1-score   support

           0       0.68      0.83      0.75        30
           1       0.58      0.37      0.45        19

    accuracy                           0.65        49
   macro avg       0.63      0.60      0.60        49
weighted avg       0.64      0.65      0.63        49

Overall accuracy: 0.6531

Majority class accuracy (0): 0.6122


### KNN

In [159]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then classify by the 3 nearest neighbours.
steps = []
steps.append(('StandardScaler', StandardScaler()))
steps.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors = 3)))

# Train on the training split and evaluate the test-split predictions.
model = Pipeline(steps)
pred = model.fit(X_tr, y_tr).predict(X_te)
evaluate(y_te, pred, [1, 0])


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,10.0,9.0
True,0.0,8.0,22.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,20.41,18.37
True,0.0,16.33,44.9



Classification report:

              precision    recall  f1-score   support

           0       0.71      0.73      0.72        30
           1       0.56      0.53      0.54        19

    accuracy                           0.65        49
   macro avg       0.63      0.63      0.63        49
weighted avg       0.65      0.65      0.65        49

Overall accuracy: 0.6531

Majority class accuracy (0): 0.6122


### Decision Tree / Random Forest

In [160]:
from sklearn.ensemble import RandomForestClassifier
# StandardScaler is used below, so import it here instead of relying on an
# earlier cell having been run (the unused KNeighborsClassifier import that
# stood here was a copy-paste leftover).
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardise the features, then a random forest.
# The forest is seeded so that re-running the cell reproduces its results.
steps = [('StandardScaler', StandardScaler()),
         ('RandomForestClassifier', RandomForestClassifier(random_state=12345))]

# Fit on the training split, predict the test split, print the metrics.
model = Pipeline(steps)
model.fit(X_tr, y_tr)
pred = model.predict(X_te)
evaluate(y_te, pred, labels = [1, 0], return_data = False)


Confusion matrix (absolute):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,9.0,10.0
True,0.0,7.0,23.0



Confusion matrix (relative):


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
,,,
True,1.0,18.37,20.41
True,0.0,14.29,46.94



Classification report:

              precision    recall  f1-score   support

           0       0.70      0.77      0.73        30
           1       0.56      0.47      0.51        19

    accuracy                           0.65        49
   macro avg       0.63      0.62      0.62        49
weighted avg       0.64      0.65      0.65        49

Overall accuracy: 0.6531

Majority class accuracy (0): 0.6122
