In [15]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectPercentile, SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier


In [1]:

# --- Load data -------------------------------------------------------------
X = pd.read_csv("../dataset/extracted_features.csv").values

# Column positions dropped from the feature matrix before any modelling.
# NOTE(review): the reason these particular columns are excluded is not
# recorded in this notebook -- confirm and document.
index_for_removal = np.array(
    [
        0, 10, 14, 17, 25, 26, 35, 39, 49, 59, 62,
    ]
)
X_indexed = np.delete(X, index_for_removal, 1)  # axis=1 -> remove columns

y = pd.read_csv("../dataset/labels.csv").values

# --- Train / test split ----------------------------------------------------
# 80/20 stratified split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_indexed, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
# Flatten the training labels to 1-D for the estimators fitted below.
# y_test keeps the shape it was loaded with.
y_train = y_train.ravel()

# --- Feature selection, stage 1: univariate --------------------------------
# Keep the top 90% of features by SelectPercentile's default score function.
# Every selector/scaler below is fitted on the training split only and then
# applied to the test split.
select = SelectPercentile(percentile=90)
select.fit(X_train, y_train)
X_train = select.transform(X_train)
X_test = select.transform(X_test)

# --- Feature selection, stage 2: model-based -------------------------------
# SelectFromModel fits its own clone of the estimator it is given, so the
# forest is handed over unfitted (fitting it beforehand would just train it
# twice). The seed is pinned so the selected feature set is the same on
# every run.
select_sfm = SelectFromModel(
    RandomForestClassifier(
        n_estimators=200,
        max_features=3,
        random_state=42,
    ),
    threshold='0.1*mean',
)
select_sfm.fit(X_train, y_train)
# Expose the fitted forest under the name `rf`.
rf = select_sfm.estimator_
X_train = select_sfm.transform(X_train)
X_test = select_sfm.transform(X_test)

# --- Feature selection, stage 3: recursive elimination ---------------------
select_rfe = RFE(
    RandomForestClassifier(
        n_estimators=200, random_state=42
    ), n_features_to_select=40
)

select_rfe.fit(X_train, y_train)
X_train = select_rfe.transform(X_train)
X_test = select_rfe.transform(X_test)

# --- Scaled variants of the selected features ------------------------------
scaler_standard = StandardScaler().fit(X_train)
X_train_standard = scaler_standard.transform(X_train)
X_test_standard = scaler_standard.transform(X_test)

scaler_robust = RobustScaler().fit(X_train)
X_train_robust = scaler_robust.transform(X_train)
X_test_robust = scaler_robust.transform(X_test)

scaler_minmax = MinMaxScaler().fit(X_train)
X_train_minmax = scaler_minmax.transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)

# --- PCA on the standardized features --------------------------------------
# NOTE(review): n_components=37 requires at least 37 features to survive the
# selection stages above -- confirm this holds for the dataset.
pca = PCA(n_components=37, random_state=42)
X_train_pca = pca.fit_transform(X_train_standard)
X_test_pca = pca.transform(X_test_standard)


In [26]:

# Candidate classifiers, each wrapped in a pipeline with its own scaling.
# Every estimator that accepts a seed pins random_state=42 so results are
# repeatable from one run to the next.

# k-nearest neighbours on min-max scaled, PCA-reduced features.
knn = KNeighborsClassifier(n_neighbors=5)
knn_pipeline = make_pipeline(
    MinMaxScaler(),
    PCA(n_components=37, random_state=42),
    knn
)

# Kernel SVM with a degree-1 polynomial kernel.
ksvm_pipeline = make_pipeline(
    MinMaxScaler(),
    SVC(
        kernel='poly',
        C=5,
        degree=1,
        random_state=42
    )
)

# Logistic regression.
# NOTE(review): C looks like the output of a hyper-parameter search that is
# not part of this notebook -- confirm provenance.
logreg = LogisticRegression(C=4.817948717948719)
logreg_pipeline = make_pipeline(
    MinMaxScaler(),
    logreg
)

# Linear SVM; seeded because its solver shuffles the data.
lsvc = LinearSVC(C=0.4081632653061224, random_state=42)
lsvc_pipeline = make_pipeline(
    MinMaxScaler(),
    lsvc
)

# Multinomial naive Bayes on min-max scaled features.
mnb = MultinomialNB(alpha=0.15789473684210525, )
mnb_pipeline = make_pipeline(
    MinMaxScaler(),
    mnb
)

# Random forest; no scaling step in its pipeline.
rf = RandomForestClassifier(n_estimators=500, max_features=5, random_state=42)
rf_pipeline = make_pipeline(
    rf
)

# Gradient boosting over depth-1 trees.
gbrt_pipeline = make_pipeline(
    RobustScaler(),
    GradientBoostingClassifier(
        n_estimators=500,
        max_depth=1,
        learning_rate=0.1,
        random_state=42
    )
)

# Multi-layer perceptron with a single hidden layer of 256 units.
mlp_pipeline = make_pipeline(
    MinMaxScaler(),
    MLPClassifier(
        activation='relu',
        hidden_layer_sizes=(256,),
        alpha=0.05,
        random_state=42
    )
)



In [9]:
# Weak learner for boosting: a depth-1 decision tree split on entropy.
# Bound to its own name so it does not overwrite `knn`, which other cells
# use for the k-nearest-neighbours classifier.
stump = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=1,
    random_state=42
)

# AdaBoost over 1000 stumps, fitted on the PCA-projected training features.
# The weak learner is passed positionally because the keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form works with both.
ada = AdaBoostClassifier(
    stump,
    n_estimators=1000,
    learning_rate=0.1,
    random_state=42
).fit(X_train_pca, y_train)

In [10]:
# Score of the fitted AdaBoost model on the PCA-projected test split.
print(
    ada.score(X=X_test_pca, y=y_test)
)

0.6666666666666666


In [28]:
# Soft-voting ensemble of the logistic regression and random forest objects
# (`logreg` and `rf` are expected to exist already, created by another cell).
votingClf = VotingClassifier(
    [
        ('clf1', logreg),
        ('clf2', rf)
    ],
    voting='soft'
)

# Boost the voting ensemble.
# * random_state is pinned because AdaBoost reseeds its base estimator at
#   every boosting iteration from its own random_state; left unset, the fit
#   differs from run to run even though the inner forest sets a seed.
# * The base estimator is passed positionally because the keyword was
#   renamed from `base_estimator` to `estimator` across scikit-learn
#   releases; the positional form works with both.
adaCombo = AdaBoostClassifier(votingClf, random_state=42)
adaCombo.fit(X_train_pca, y_train)


AdaBoostClassifier(base_estimator=VotingClassifier(estimators=[('clf1',
                                                                LogisticRegression(C=4.817948717948719)),
                                                               ('clf2',
                                                                RandomForestClassifier(max_features=5,
                                                                                       n_estimators=500,
                                                                                       random_state=42))],
                                                   voting='soft'))

In [29]:
# Score of the boosted voting ensemble on the PCA-projected test split.
print(
    adaCombo.score(X=X_test_pca, y=y_test)
)

0.8400900900900901
