In [1]:

import pandas as pd
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix


In [3]:
# Load the pre-extracted feature matrix and the class labels as NumPy arrays.
X = pd.read_csv("../dataset/extracted_features.csv").values
# Flatten the labels once, right after loading, so that y, y_train and y_test
# are all 1-D. Previously only y_train was flattened and y_test stayed an
# (n, 1) column vector that every metric call had to coerce implicitly.
# Assumes labels.csv holds a single label column (the original
# y_train.ravel() relied on the same assumption).
y = pd.read_csv("../dataset/labels.csv").values.ravel()
# images = pd.read_csv("../dataset/raw_images.csv").values

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

# Sample counts: whole dataset, training split, test split.
print(len(X))
print(len(X_train))
print(len(X_test))

# Every scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split leak into preprocessing.
scaler_standard = StandardScaler().fit(X_train)
X_train_standard = scaler_standard.transform(X_train)
X_test_standard = scaler_standard.transform(X_test)

scaler_robust = RobustScaler().fit(X_train)
X_train_robust = scaler_robust.transform(X_train)
X_test_robust = scaler_robust.transform(X_test)

scaler_minmax = MinMaxScaler().fit(X_train)
X_train_minmax = scaler_minmax.transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)

# PCA is fitted on the standardized training features and keeps 56 components.
# NOTE(review): the reason for 56 is not visible in this notebook - confirm.
pca = PCA(n_components=56, random_state=42)
X_train_pca = pca.fit_transform(X_train_standard)
X_test_pca = pca.transform(X_test_standard)


2219
1775
444


## knn

In [4]:

# k-nearest-neighbours baseline (k=5), fitted on the unscaled feature matrix.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Accuracy on the training split, then on the held-out split.
train_acc_knn = knn.score(X_train, y_train)
test_acc_knn = knn.score(X_test, y_test)
print(train_acc_knn)
print(test_acc_knn)

# One-vs-rest ROC AUC computed from the predicted class probabilities.
proba_knn = knn.predict_proba(X_test)
print(roc_auc_score(y_test, proba_knn, multi_class='ovr'))

0.9092957746478874
0.8536036036036037
0.9744479873193376


In [6]:
# Per-class score for k-NN: the confusion-matrix diagonal (correct predictions
# per class) divided by the row sums of the matrix.
matrix_knn = confusion_matrix(y_test, y_pred_knn)
correct_per_class = matrix_knn.diagonal()
samples_per_class = matrix_knn.sum(axis=1)
print(correct_per_class / samples_per_class)

[0.96923077 0.8        0.69230769 0.87096774 0.92857143 0.8245614
 0.92307692 0.90625    0.63333333]


## logreg

In [9]:
# Logistic regression on the min-max scaled features.
logreg = LogisticRegression(C=4.817948717948719)
logreg.fit(X_train_minmax, y_train)
y_pred_logreg = logreg.predict(X_test_minmax)

# Accuracy on the training split, then on the held-out split.
train_acc_logreg = logreg.score(X_train_minmax, y_train)
test_acc_logreg = logreg.score(X_test_minmax, y_test)
print(train_acc_logreg)
print(test_acc_logreg)

# One-vs-rest ROC AUC computed from the predicted class probabilities.
proba_logreg = logreg.predict_proba(X_test_minmax)
print(roc_auc_score(y_test, proba_logreg, multi_class='ovr'))


0.9425352112676056
0.9234234234234234
0.9928545380844199


## Linear SVC

In [18]:
# Linear SVM on the min-max scaled features. LinearSVC does not provide
# predict_proba, so it is wrapped in CalibratedClassifierCV to obtain the
# class probabilities needed for ROC AUC.
# random_state is fixed (same seed as the split, PCA, random forest and MLP
# in this notebook) so a re-run reproduces the numbers.
lsvc = LinearSVC(C=1.6326530612244898, random_state=42)
clf = CalibratedClassifierCV(lsvc).fit(X_train_minmax, y_train)
y_pred_lsvc = clf.predict(X_test_minmax)

# Accuracy on the training split, then on the held-out split.
print(clf.score(X_train_minmax, y_train))
print(clf.score(X_test_minmax, y_test))
# One-vs-rest ROC AUC; left as the last expression so it is shown as the cell result.
roc_auc_score(y_test, clf.predict_proba(X_test_minmax), multi_class='ovr')


0.9363380281690141
0.918918918918919


0.9878363185799935

## multinomial naive bayes

In [11]:

# Multinomial naive Bayes, fitted on the unscaled feature matrix.
mnb = MultinomialNB(alpha=0.15789473684210525)
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

# Accuracy on the training split, then on the held-out split.
train_acc_mnb = mnb.score(X_train, y_train)
test_acc_mnb = mnb.score(X_test, y_test)
print(train_acc_mnb)
print(test_acc_mnb)

# One-vs-rest ROC AUC; left as the last expression so it is shown as the cell result.
proba_mnb = mnb.predict_proba(X_test)
roc_auc_score(y_test, proba_mnb, multi_class='ovr')



0.9014084507042254
0.8986486486486487


0.987718357451597

# forests

## random forest

In [12]:
# Random forest with 500 trees, fitted on the unscaled feature matrix.
rf = RandomForestClassifier(
    n_estimators=500,
    max_features=5,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Accuracy on the training split, then on the held-out split.
train_acc_rf = rf.score(X_train, y_train)
test_acc_rf = rf.score(X_test, y_test)
print(train_acc_rf)
print(test_acc_rf)

# One-vs-rest ROC AUC; left as the last expression so it is shown as the cell result.
proba_rf = rf.predict_proba(X_test)
roc_auc_score(y_test, proba_rf, multi_class='ovr')



0.9915492957746479
0.8918918918918919


0.9866071692017238

## gradient boost

In [13]:
# Gradient boosting with 500 depth-1 trees (decision stumps), fitted on the
# robust-scaled features.
# random_state is fixed (same seed as the split, PCA, random forest and MLP
# in this notebook) so a re-run reproduces the numbers.
gbrt = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=1,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_robust, y_train)
y_pred_gbrt = gbrt.predict(X_test_robust)

# Accuracy on the training split, then on the held-out split.
print(gbrt.score(X_train_robust, y_train))
print(gbrt.score(X_test_robust, y_test))

# One-vs-rest ROC AUC computed from the predicted class probabilities.
print(roc_auc_score(y_test, gbrt.predict_proba(X_test_robust), multi_class='ovr'))


0.9819718309859155
0.8873873873873874
0.9893634599635737


## ksvm

In [15]:
# Kernel SVM with a degree-1 polynomial kernel, fitted on the unscaled features.
# probability=True makes SVC fit an additional probability model that involves
# random data shuffling; without a seed the probabilities (and therefore the
# ROC AUC below) can change between runs. random_state is fixed to the same
# seed used elsewhere in this notebook.
ksvm = SVC(C=5, degree=1, kernel='poly', probability=True, random_state=42).fit(X_train, y_train)
y_pred_ksvm = ksvm.predict(X_test)

# Accuracy on the training split, then on the held-out split.
print(ksvm.score(X_train, y_train))
print(ksvm.score(X_test, y_test))
# One-vs-rest ROC AUC computed from the predicted class probabilities.
print(roc_auc_score(y_test, ksvm.predict_proba(X_test), multi_class='ovr'))


0.9414084507042253
0.9009009009009009
0.9935144809138793


## MLP

In [16]:
# Multi-layer perceptron with hidden_layer_sizes=256 and ReLU activation,
# fitted on the min-max scaled features.
mlp = MLPClassifier(
    hidden_layer_sizes=256,
    activation='relu',
    alpha=0.05,
    random_state=42,
)
mlp.fit(X_train_minmax, y_train)
y_pred_mlp = mlp.predict(X_test_minmax)

# Accuracy on the training split, then on the held-out split.
train_acc_mlp = mlp.score(X_train_minmax, y_train)
test_acc_mlp = mlp.score(X_test_minmax, y_test)
print(train_acc_mlp)
print(test_acc_mlp)

# One-vs-rest ROC AUC computed from the predicted class probabilities.
proba_mlp = mlp.predict_proba(X_test_minmax)
print(roc_auc_score(y_test, proba_mlp, multi_class='ovr'))


0.9847887323943662
0.9436936936936937
0.9969471380675198




In [21]:
# Test-set predictions of every fitted model, keyed by the column name used in
# the summary table. Insertion order defines the column order.
predictions_by_model = {
    'knn': y_pred_knn,
    'logreg': y_pred_logreg,
    'lsvc': y_pred_lsvc,
    'mnb': y_pred_mnb,
    'rf': y_pred_rf,
    'gbrt': y_pred_gbrt,
    'ksvm': y_pred_ksvm,
    'mlp': y_pred_mlp,
}

# One confusion matrix per model, all against the same held-out labels.
matrices_by_model = {
    name: confusion_matrix(y_test, preds)
    for name, preds in predictions_by_model.items()
}

# Per-class score for each model: the confusion-matrix diagonal (correct
# predictions per class) divided by the row sums of the matrix.
scores_data = {
    name: matrix.diagonal() / matrix.sum(axis=1)
    for name, matrix in matrices_by_model.items()
}

# Keep the individual per-model names defined by the original cell so any
# other cell that refers to them keeps working.
(matrix_knn, matrix_logreg, matrix_lsvc, matrix_mnb,
 matrix_rf, matrix_gbrt, matrix_ksvm, matrix_mlp) = matrices_by_model.values()
(class_scores_knn, class_scores_logreg, class_scores_lsvc, class_scores_mnb,
 class_scores_rf, class_scores_gbrt, class_scores_ksvm, class_scores_mlp) = scores_data.values()

# Rows are classes (in confusion-matrix order), columns are models.
scores_df = pd.DataFrame(data=scores_data)
scores_df

Unnamed: 0,knn,logreg,lsvc,mnb,rf,gbrt,ksvm,mlp
0,0.969231,0.984615,1.0,0.984615,0.969231,0.938462,0.969231,0.984615
1,0.8,0.9,0.95,0.95,0.8,0.85,0.85,0.95
2,0.692308,0.884615,0.884615,0.884615,0.807692,0.923077,0.846154,0.923077
3,0.870968,0.967742,0.935484,0.967742,0.903226,0.935484,0.935484,1.0
4,0.928571,0.964286,0.964286,0.910714,0.910714,0.928571,0.964286,0.964286
5,0.824561,0.964912,0.964912,0.982456,0.947368,0.877193,0.964912,0.964912
6,0.923077,0.923077,0.938462,0.923077,0.923077,0.907692,0.923077,0.953846
7,0.90625,0.9375,0.921875,0.953125,0.875,0.859375,0.875,0.921875
8,0.633333,0.766667,0.716667,0.583333,0.783333,0.783333,0.733333,0.85
