In [24]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile, SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, f1_score


In [9]:
def _fit_and_scale(scaler, train, test):
    """Fit `scaler` on `train` only and transform both splits.

    Returns a tuple ``(fitted_scaler, train_scaled, test_scaled)``. The test
    split is never used for fitting.
    """
    scaler.fit(train)
    return scaler, scaler.transform(train), scaler.transform(test)


# --- Load features / labels and make a stratified 80/20 split ----------------
X = pd.read_csv("../dataset/extracted_features.csv").values
y = pd.read_csv("../dataset/labels.csv").values
X_train_default, X_test_default, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
# Flatten the training labels to 1-D before fitting; y_test is left as loaded.
y_train = y_train.ravel()

# --- Stage 1: drop a fixed list of columns -----------------------------------
# NOTE(review): the variable names suggest these are zero-variance columns;
# the list is hard-coded — TODO confirm how it was derived.
index_for_removal = np.array(
    [
        0, 10, 14, 17, 25, 26, 35, 39, 49, 59, 62,
    ]
)
X_train_zero_variance = np.delete(X_train_default, index_for_removal, 1)
X_test_zero_variance = np.delete(X_test_default, index_for_removal, 1)

# --- Stage 2: univariate selection, keep the top 90% of the remaining columns
select = SelectPercentile(percentile=90)
select.fit(X_train_zero_variance, y_train)
X_train_univariate = select.transform(X_train_zero_variance)
X_test_univariate = select.transform(X_test_zero_variance)

# --- Stage 3: model-based selection ------------------------------------------
# SelectFromModel (prefit=False, the default) clones the estimator and fits the
# clone itself, so the forest is passed unfitted. Pre-fitting it on
# X_train_default was discarded work on a matrix with a different column count.
# random_state makes the importance ranking, and hence the selected columns,
# reproducible.
rf = RandomForestClassifier(
    n_estimators=200,
    max_features=3,
    random_state=42,
)

select_sfm = SelectFromModel(
    rf,
    threshold='0.1*mean',
)
select_sfm.fit(X_train_univariate, y_train)
X_train_from_model = select_sfm.transform(X_train_univariate)
X_test_from_model = select_sfm.transform(X_test_univariate)

# --- Stage 4: recursive feature elimination down to 40 columns ---------------
select_rfe = RFE(
    RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    n_features_to_select=40
)

select_rfe.fit(X_train_from_model, y_train)
X_train_iterative = select_rfe.transform(X_train_from_model)
X_test_iterative = select_rfe.transform(X_test_from_model)

# --- Scaling: three scalers x three feature sets -----------------------------
# "default" = all columns, "variance" = after stage 1, "refined" = after stage 4.
scaler_standard_default, X_train_standard_default, X_test_standard_default = _fit_and_scale(
    StandardScaler(), X_train_default, X_test_default
)
scaler_standard_variance, X_train_standard_variance, X_test_standard_variance = _fit_and_scale(
    StandardScaler(), X_train_zero_variance, X_test_zero_variance
)
scaler_standard_refined, X_train_standard_refined, X_test_standard_refined = _fit_and_scale(
    StandardScaler(), X_train_iterative, X_test_iterative
)

scaler_robust_default, X_train_robust_default, X_test_robust_default = _fit_and_scale(
    RobustScaler(), X_train_default, X_test_default
)
scaler_robust_variance, X_train_robust_variance, X_test_robust_variance = _fit_and_scale(
    RobustScaler(), X_train_zero_variance, X_test_zero_variance
)
scaler_robust_refined, X_train_robust_refined, X_test_robust_refined = _fit_and_scale(
    RobustScaler(), X_train_iterative, X_test_iterative
)

scaler_minmax_default, X_train_minmax_default, X_test_minmax_default = _fit_and_scale(
    MinMaxScaler(), X_train_default, X_test_default
)
scaler_minmax_variance, X_train_minmax_variance, X_test_minmax_variance = _fit_and_scale(
    MinMaxScaler(), X_train_zero_variance, X_test_zero_variance
)
scaler_minmax_refined, X_train_minmax_refined, X_test_minmax_refined = _fit_and_scale(
    MinMaxScaler(), X_train_iterative, X_test_iterative
)

# --- PCA on the standardised matrices (fit on train, applied to test) --------
pca_default = PCA(n_components=56, random_state=42)
X_train_pca_default = pca_default.fit_transform(X_train_standard_default)
X_test_pca_default = pca_default.transform(X_test_standard_default)

pca_refined = PCA(n_components=37, random_state=42)
X_train_pca_refined = pca_refined.fit_transform(X_train_standard_refined)
X_test_pca_refined = pca_refined.transform(X_test_standard_refined)


In [1]:
# NOTE(review): bare string expression; its only visible effect is echoing
# 'hw' as the cell output. Looks like a leftover scratch cell — TODO confirm
# it can be deleted.
'hw'

'hw'

## knn

In [10]:

# 5-nearest-neighbours classifier trained on the RFE-selected feature subset.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_iterative, y_train)
y_pred_knn = knn.predict(X_test_iterative)

# One value per line: train accuracy, test accuracy, one-vs-rest ROC AUC (test).
print(
    knn.score(X_train_iterative, y_train),
    knn.score(X_test_iterative, y_test),
    roc_auc_score(y_test, knn.predict_proba(X_test_iterative), multi_class='ovr'),
    sep='\n',
)

0.9087323943661972
0.8603603603603603
0.9746155749769753


In [11]:
# Per-class hit rate for k-NN: correct predictions (diagonal) divided by the
# number of true samples of each class (row sums of the confusion matrix).
matrix_knn = confusion_matrix(y_test, y_pred_knn)
print(np.diag(matrix_knn) / matrix_knn.sum(axis=1))

[0.98461538 0.85       0.73076923 0.87096774 0.92857143 0.87719298
 0.89230769 0.90625    0.61666667]


## logreg

In [12]:
# Logistic regression on the min-max scaled, unpruned feature matrix.
logreg = LogisticRegression(C=4.736842)
logreg.fit(X_train_minmax_default, y_train)
y_pred_logreg = logreg.predict(X_test_minmax_default)

# One value per line: train accuracy, test accuracy, one-vs-rest ROC AUC (test).
print(
    logreg.score(X_train_minmax_default, y_train),
    logreg.score(X_test_minmax_default, y_test),
    roc_auc_score(y_test, logreg.predict_proba(X_test_minmax_default), multi_class='ovr'),
    sep='\n',
)


0.9419718309859155
0.9234234234234234
0.9928309914836293


## Linear SVC

In [13]:
# Linear SVM on the min-max scaled, column-pruned features.
# random_state seeds the solver's internal shuffling so the fit is
# reproducible, in line with the other seeded estimators in this notebook.
lsvc = LinearSVC(C=1.842105, random_state=42)

# LinearSVC has no predict_proba; wrapping it in CalibratedClassifierCV
# provides the class probabilities needed for ROC AUC.
clf = CalibratedClassifierCV(lsvc).fit(X_train_minmax_variance, y_train)
y_pred_lsvc = clf.predict(X_test_minmax_variance)

# Train accuracy, then test accuracy.
print(clf.score(X_train_minmax_variance, y_train))
print(clf.score(X_test_minmax_variance, y_test))
# One-vs-rest ROC AUC on the test split (displayed as the cell result).
roc_auc_score(y_test, clf.predict_proba(X_test_minmax_variance), multi_class='ovr')


0.9385915492957746
0.9211711711711712


0.9877975677362554

## multinomial naive bayes

In [14]:

# Multinomial naive Bayes on the raw (unscaled, unpruned) feature matrix.
mnb = MultinomialNB(alpha=0.210526)
mnb.fit(X_train_default, y_train)
y_pred_mnb = mnb.predict(X_test_default)

# Train accuracy, then test accuracy, one per line.
print(
    mnb.score(X_train_default, y_train),
    mnb.score(X_test_default, y_test),
    sep='\n',
)
# One-vs-rest ROC AUC on the test split (displayed as the cell result).
roc_auc_score(y_test, mnb.predict_proba(X_test_default), multi_class='ovr')



0.9014084507042254
0.8986486486486487


0.9877482490068411

# forests

## random forest

In [15]:
# Random forest (400 trees, 3 candidate features per split) on the
# column-pruned features. Rebinds the name `rf`.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=3,
    random_state=42,
)
rf.fit(X_train_zero_variance, y_train)
y_pred_rf = rf.predict(X_test_zero_variance)

# Train accuracy, then test accuracy, one per line.
print(
    rf.score(X_train_zero_variance, y_train),
    rf.score(X_test_zero_variance, y_test),
    sep='\n',
)
# One-vs-rest ROC AUC on the test split (displayed as the cell result).
roc_auc_score(y_test, rf.predict_proba(X_test_zero_variance), multi_class='ovr')



0.9915492957746479
0.8918918918918919


0.9853635108360425

## gradient boost

In [16]:
# Gradient boosting with 400 depth-1 trees (stumps) on the column-pruned features.
gbrt = GradientBoostingClassifier(n_estimators=400, max_depth=1, learning_rate=0.1)
gbrt.fit(X_train_zero_variance, y_train)
y_pred_gbrt = gbrt.predict(X_test_zero_variance)

# One value per line: train accuracy, test accuracy, one-vs-rest ROC AUC (test).
print(
    gbrt.score(X_train_zero_variance, y_train),
    gbrt.score(X_test_zero_variance, y_test),
    roc_auc_score(y_test, gbrt.predict_proba(X_test_zero_variance), multi_class='ovr'),
    sep='\n',
)


0.9752112676056338
0.8873873873873874
0.9886846924736518


## ksvm

In [18]:
# RBF-kernel SVM on the min-max scaled, column-pruned features.
#  * `degree` is only used by the 'poly' kernel, so the former `degree=1`
#    had no effect with kernel='rbf' and has been removed.
#  * probability=True fits an internal cross-validated calibration that draws
#    on the RNG; random_state makes predict_proba, and therefore the AUC
#    printed below, reproducible across runs.
ksvm = SVC(
    C=5,
    kernel='rbf',
    probability=True,
    random_state=42,
).fit(X_train_minmax_variance, y_train)
y_pred_ksvm = ksvm.predict(X_test_minmax_variance)

# Train accuracy, test accuracy, one-vs-rest ROC AUC on the test split.
print(ksvm.score(X_train_minmax_variance, y_train))
print(ksvm.score(X_test_minmax_variance, y_test))
print(roc_auc_score(y_test, ksvm.predict_proba(X_test_minmax_variance), multi_class='ovr'))


0.9853521126760564
0.9211711711711712
0.9947236794465584


## MLP

In [20]:
# Multi-layer perceptron (hidden_layer_sizes=512, ReLU, L2 penalty alpha=0.01)
# on the min-max scaled, unpruned feature matrix.
mlp = MLPClassifier(
    hidden_layer_sizes=512,
    activation='relu',
    alpha=0.01,
    random_state=42,
)
mlp.fit(X_train_minmax_default, y_train)
y_pred_mlp = mlp.predict(X_test_minmax_default)

# One value per line: train accuracy, test accuracy, one-vs-rest ROC AUC (test).
print(
    mlp.score(X_train_minmax_default, y_train),
    mlp.score(X_test_minmax_default, y_test),
    roc_auc_score(y_test, mlp.predict_proba(X_test_minmax_default), multi_class='ovr'),
    sep='\n',
)


0.9870422535211267
0.9504504504504504
0.9972273902976444




In [22]:
# One confusion matrix per model, all scored against the same test labels.
(
    matrix_knn, matrix_logreg, matrix_lsvc, matrix_mnb,
    matrix_rf, matrix_gbrt, matrix_ksvm, matrix_mlp,
) = (
    confusion_matrix(y_test, predictions)
    for predictions in (
        y_pred_knn, y_pred_logreg, y_pred_lsvc, y_pred_mnb,
        y_pred_rf, y_pred_gbrt, y_pred_ksvm, y_pred_mlp,
    )
)

# Per-class hit rate: diagonal (correct predictions) over row sums
# (number of true samples of each class).
(
    class_scores_knn, class_scores_logreg, class_scores_lsvc, class_scores_mnb,
    class_scores_rf, class_scores_gbrt, class_scores_ksvm, class_scores_mlp,
) = (
    np.diag(matrix) / matrix.sum(axis=1)
    for matrix in (
        matrix_knn, matrix_logreg, matrix_lsvc, matrix_mnb,
        matrix_rf, matrix_gbrt, matrix_ksvm, matrix_mlp,
    )
)

# Assemble a table with one row per class and one column per model.
scores_data = dict(
    zip(
        ('knn', 'logreg', 'lsvc', 'mnb', 'rf', 'gbrt', 'ksvm', 'mlp'),
        (
            class_scores_knn, class_scores_logreg, class_scores_lsvc, class_scores_mnb,
            class_scores_rf, class_scores_gbrt, class_scores_ksvm, class_scores_mlp,
        ),
    )
)
scores_df = pd.DataFrame(data=scores_data)
scores_df

Unnamed: 0,knn,logreg,lsvc,mnb,rf,gbrt,ksvm,mlp
0,0.984615,0.984615,1.0,0.984615,0.984615,0.938462,0.953846,0.984615
1,0.85,0.9,0.95,0.95,0.7,0.85,0.8,0.95
2,0.730769,0.884615,0.884615,0.884615,0.807692,0.923077,0.923077,0.923077
3,0.870968,0.967742,0.935484,0.967742,0.870968,0.935484,0.935484,1.0
4,0.928571,0.964286,0.964286,0.910714,0.892857,0.928571,0.946429,0.964286
5,0.877193,0.964912,0.964912,0.982456,0.947368,0.877193,0.912281,0.964912
6,0.892308,0.923077,0.953846,0.923077,0.938462,0.907692,0.923077,0.938462
7,0.90625,0.9375,0.921875,0.953125,0.875,0.859375,0.9375,0.9375
8,0.616667,0.766667,0.716667,0.583333,0.816667,0.783333,0.883333,0.9


In [29]:
# Per-class F1 (average=None returns one score per class) for every model,
# all evaluated against the same test labels.
(
    f1_knn, f1_logreg, f1_lsvc, f1_mnb,
    f1_rf, f1_gbrt, f1_ksvm, f1_mlp,
) = (
    f1_score(y_true=y_test, y_pred=predictions, average=None)
    for predictions in (
        y_pred_knn, y_pred_logreg, y_pred_lsvc, y_pred_mnb,
        y_pred_rf, y_pred_gbrt, y_pred_ksvm, y_pred_mlp,
    )
)

In [32]:
# Show the per-class F1 array computed for the k-NN model.
print(f1_knn)

[0.88275862 0.80952381 0.79166667 0.84375    0.90434783 0.88495575
 0.90625    0.88549618 0.7254902 ]


In [31]:
# Per-class F1 table: one row per class, one column per model.
pd.DataFrame(
    dict(
        knn=f1_knn,
        logreg=f1_logreg,
        lsvc=f1_lsvc,
        mnb=f1_mnb,
        rf=f1_rf,
        gbrt=f1_gbrt,
        ksvm=f1_ksvm,
        mlp=f1_mlp,
    )
)

Unnamed: 0,knn,logreg,lsvc,mnb,rf,gbrt,ksvm,mlp
0,0.882759,0.984615,0.992366,0.984615,0.955224,0.945736,0.946565,0.977099
1,0.809524,0.878049,0.904762,0.844444,0.736842,0.829268,0.842105,0.926829
2,0.791667,0.92,0.92,0.779661,0.893617,0.888889,0.96,0.941176
3,0.84375,0.967742,0.95082,0.9375,0.9,0.95082,0.95082,0.96875
4,0.904348,0.93913,0.931034,0.894737,0.877193,0.912281,0.946429,0.955752
5,0.884956,0.956522,0.956522,0.982456,0.93913,0.917431,0.945455,0.973451
6,0.90625,0.9375,0.925373,0.9375,0.924242,0.893939,0.923077,0.953125
7,0.885496,0.9375,0.921875,0.931298,0.88189,0.88,0.9375,0.9375
8,0.72549,0.773109,0.774775,0.679612,0.809917,0.764228,0.828125,0.907563
