In [53]:

import pandas as pd
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC


In [9]:
# Load the pre-extracted feature matrix and the label table as NumPy arrays.
X = pd.read_csv("../dataset/extracted_features.csv").values
y = pd.read_csv("../dataset/labels.csv").values

# Stratified 80/20 hold-out split with a fixed seed, so every model below is
# evaluated on the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

# `DataFrame.values` is always 2-D, so the labels come out as a column
# (presumably a single label column -- confirm against labels.csv).
# Flatten BOTH splits to 1-D, not just the training one, so estimators and
# metrics receive the same label shape. This is done after the split so the
# stratified partition itself is unchanged.
y_train = y_train.ravel()
y_test = y_test.ravel()

# Each scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split leak into preprocessing.
scaler_standard = StandardScaler().fit(X_train)
X_train_standard = scaler_standard.transform(X_train)
X_test_standard = scaler_standard.transform(X_test)

scaler_robust = RobustScaler().fit(X_train)
X_train_robust = scaler_robust.transform(X_train)
X_test_robust = scaler_robust.transform(X_test)

scaler_minmax = MinMaxScaler().fit(X_train)
X_train_minmax = scaler_minmax.transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)

# PCA keeping 56 components, fitted on the standardized training features
# and applied to the standardized test features.
pca = PCA(n_components=56, random_state=42)
X_train_pca = pca.fit_transform(X_train_standard)
X_test_pca = pca.transform(X_test_standard)


2219
1775
444


## knn

In [57]:
# k-nearest neighbours (k=5), fitted on the raw, unscaled features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print(knn.score(X_train, y_train))  # accuracy on the training split
print(knn.score(X_test, y_test))    # accuracy on the held-out split
# One-vs-rest ROC AUC, computed from the predicted class probabilities.
print(roc_auc_score(y_test, knn.predict_proba(X_test), multi_class='ovr'))

0.9092957746478874
0.8536036036036037
0.9744479873193376


## logreg

In [36]:
# Logistic regression trained on the min-max scaled features.
logreg = LogisticRegression(C=4.817948717948719)
logreg.fit(X_train_minmax, y_train)

# Accuracy on both splits, then class probabilities for the held-out split.
logreg_train_acc = logreg.score(X_train_minmax, y_train)
logreg_test_acc = logreg.score(X_test_minmax, y_test)
logreg_test_proba = logreg.predict_proba(X_test_minmax)

print(logreg_train_acc)
print(logreg_test_acc)
# One-vs-rest ROC AUC on the held-out split.
print(roc_auc_score(y_test, logreg_test_proba, multi_class='ovr'))


0.9425352112676056
0.9234234234234234
0.9928545380844199
0.996147361635756


## Linear SVC

In [49]:
# LinearSVC does not expose predict_proba, so it is wrapped in
# CalibratedClassifierCV to obtain the class probabilities ROC AUC needs.
# random_state pins the solver's internal data shuffling so the scores are
# reproducible, matching the seed used by the other seeded models.
lsvc = LinearSVC(C=1.6326530612244898, random_state=42)
clf = CalibratedClassifierCV(lsvc).fit(X_train_minmax, y_train)
print(clf.score(X_train_minmax, y_train))  # accuracy on the training split
print(clf.score(X_test_minmax, y_test))    # accuracy on the held-out split
# One-vs-rest ROC AUC on the held-out split (displayed as the cell output).
roc_auc_score(y_test, clf.predict_proba(X_test_minmax), multi_class='ovr')


0.9363380281690141
0.918918918918919


0.9878363185799935

## multinomial naive bayes

In [32]:

# Multinomial naive Bayes trained on the raw, unscaled features.
mnb = MultinomialNB(alpha=0.15789473684210525)
mnb.fit(X_train, y_train)

# Accuracy on both splits, then class probabilities for the held-out split.
mnb_train_acc = mnb.score(X_train, y_train)
mnb_test_acc = mnb.score(X_test, y_test)
mnb_test_proba = mnb.predict_proba(X_test)

print(mnb_train_acc)
print(mnb_test_acc)
# One-vs-rest ROC AUC on the held-out split (displayed as the cell output).
roc_auc_score(y_test, mnb_test_proba, multi_class='ovr')



0.9014084507042254
0.8986486486486487


0.987718357451597

# forests

## random forest

In [33]:
# Random forest of 500 trees, 5 candidate features per split, fixed seed;
# trained on the raw, unscaled features.
rf = RandomForestClassifier(
    max_features=5,
    n_estimators=500,
    random_state=42,
)
rf.fit(X_train, y_train)

# Accuracy on both splits, then class probabilities for the held-out split.
rf_train_acc = rf.score(X_train, y_train)
rf_test_acc = rf.score(X_test, y_test)
rf_test_proba = rf.predict_proba(X_test)

print(rf_train_acc)
print(rf_test_acc)
# One-vs-rest ROC AUC on the held-out split (displayed as the cell output).
roc_auc_score(y_test, rf_test_proba, multi_class='ovr')



0.9915492957746479
0.8918918918918919


0.9866071692017238

## gradient boost

In [38]:
# Gradient boosting with 500 depth-1 trees (decision stumps), trained on the
# robust-scaled features. random_state is fixed so the fitted ensemble, and
# therefore the scores below, are reproducible across runs.
gbrt = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=1,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_robust, y_train)
print(gbrt.score(X_train_robust, y_train))  # accuracy on the training split
print(gbrt.score(X_test_robust, y_test))    # accuracy on the held-out split

# One-vs-rest ROC AUC on the held-out split.
print(roc_auc_score(y_test, gbrt.predict_proba(X_test_robust), multi_class='ovr'))


0.9819718309859155
0.8873873873873874
0.9893634599635737


## ksvm

In [40]:
# Kernel SVM (polynomial kernel of degree 1) trained on the raw, unscaled
# features. probability=True enables predict_proba; those estimates come from
# an internal cross-validation that shuffles the data, so random_state is
# fixed to make the reported ROC AUC reproducible.
ksvm = SVC(C=5, degree=1, kernel='poly', probability=True, random_state=42).fit(X_train, y_train)
print(ksvm.score(X_train, y_train))  # accuracy on the training split
print(ksvm.score(X_test, y_test))    # accuracy on the held-out split
# One-vs-rest ROC AUC on the held-out split.
print(roc_auc_score(y_test, ksvm.predict_proba(X_test), multi_class='ovr'))


0.9414084507042253
0.9009009009009009
0.9935097687128214


## MLP

In [42]:
# Multi-layer perceptron with ReLU activation, L2 penalty alpha=0.05 and a
# fixed seed, trained on the min-max scaled features.
mlp = MLPClassifier(
    random_state=42,
    activation='relu',
    hidden_layer_sizes=256,
    alpha=0.05,
)
mlp.fit(X_train_minmax, y_train)

# Accuracy on both splits, then class probabilities for the held-out split.
mlp_train_acc = mlp.score(X_train_minmax, y_train)
mlp_test_acc = mlp.score(X_test_minmax, y_test)
mlp_test_proba = mlp.predict_proba(X_test_minmax)

print(mlp_train_acc)
print(mlp_test_acc)
# One-vs-rest ROC AUC on the held-out split.
print(roc_auc_score(y_test, mlp_test_proba, multi_class='ovr'))


0.9847887323943662
0.9436936936936937
0.9969471380675198


