In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, RocCurveDisplay, roc_auc_score

In [5]:
seed = 42
# https://archive.ics.uci.edu/dataset/267/banknote+authentication
df = pd.read_csv('..\\..\\data\\data_banknote_authentication.csv')

ycols = 'class'
xcols = [name for name in df.columns if name not in ycols]

# Split data into test, split and validation sets
Y = df['class']
features = [name for name in df.columns if name not in ['class']]
X = df[features]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

all_names =[]
all_perf = {'F1':[],
            'AUC':[]}

In [6]:
def get_predictions(model):
    model.fit(X_train, Y_train)
    prediction = model.predict(X_test)
    out_df = pd.DataFrame(columns=['pred'], data=prediction)
    out_df['prob'] = model.predict_proba(X_test[:,1])
    return out_df


def train_and_display(preds, name):
    f1 = f1_score(Y_test, preds['pred'])
    auc = roc_auc_score(Y_test, preds['prob'])

    print("\n", name, ": ")
    print(classification_report(preds['pred'], Y_test))
    print("F1 = ", f1)
    print("AUC = ", auc)

    print()
    all_names.append(name)
    all_perf['F1'].append(f1)
    all_perf['AUC'].append(auc)

    # RocCurveDisplay.from_predictions(val[ycols], preds['prob'])
    # plt.grid()
    # plt.axis('equal')
    # plt.show()

def final_test(model, name):
    test_preds = model.predict(test[xcols])

    print("\n----------TESTING DATA---------\n", name, ": ")
    f1 = f1_score([i == 0 for i in Y_test.values], [i == 0 for i in test_preds])
    auc = roc_auc_score(Y_test, test_preds)
    print("F1 = ", f1)
    print("AUC = ", auc)

# Prepare base pipeline with preprocessing
numeric_transformer = Pipeline([
    # ('imputer', SimpleImputer(strategy='mean')) - no NA values so no need
    ('scaler', StandardScaler())
])
# No need for ColumnTransformer as all data is numeric

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline([
    ('pre', numeric_transformer),
    ('model', RandomForestClassifier(n_estimators=100,
                                     min_samples_split=20,
                                     min_samples_leaf=5,
                                     n_jobs=1))
])
name = 'Random forest classifier'
train_and_display(rf, name)

KeyError: 'pred'

In [None]:
from sklearn.linear_model import LogisticRegression
lr = Pipeline([
    ('pre', numeric_transformer),
    ('model', LogisticRegression(random_state=seed, intercept_scaling=0.5, max_iter=10000))
])
name = 'Logistic regression'
train_and_display(lr, name)

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = Pipeline([
    ('pre', numeric_transformer),
    ('model', DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=seed))
])
name = "Decision tree classifier"
train_and_display(dtc, name)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':['rbf', 'poly', 'linear'],
            'C':[0.1, 1, 10], 'gamma':[1, 0.1, 0.001]}
name = "SVM found with GS"
gs_svm = Pipeline([
    ('pre', numeric_transformer),
    ('SVM', GridSearchCV(SVC(), parameters))
])
train_and_display(gs_svm, name)

In [None]:
final_test(gs_svm, name)

In [None]:
x = np.arange(len(all_names))
width = 0.25
multiplier = 0.5

fig, ax = plt.subplots(layout = 'constrained')

for attribute, measurement in all_perf.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label = attribute)
    ax.bar_label(rects, padding = 3)
    multiplier +=1

ax.set_ylabel('Score')
ax.set_title('Model accuracy')
ax.set_xticks(x + width, all_names)
ax.legend(loc='upper left', ncols=2)
ax.set_ylim(0, 1)

plt.show()