In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

import time

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [42]:
train_df = pd.read_csv("option_train.csv")
test_df = pd.read_csv("option_test_wolabel.csv")

In [43]:
y_train = np.where(train_df["BS"]=="Over", 1, 0)
X_train = train_df.drop(["BS", "Value"], axis=1)

In [44]:
kfolds = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

## Logistic Regression

In [59]:
pl_log = make_pipeline(
    StandardScaler(),
    LogisticRegression())

param_grid = {"logisticregression__C":[0.001,0.01,0.1,1,10]}

grid_cv_log = GridSearchCV(pl_log, param_grid, cv=kfolds)
grid_cv_log.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]})

In [72]:
pd.DataFrame(grid_cv_log.cv_results_).iloc[:, np.r_[4,6:12,13]]\
    .rename(columns={"param_logisticregression__C":"C"})\
    .sort_values("rank_test_score")

Unnamed: 0,C,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
2,0.1,0.949405,0.901786,0.931548,0.889881,0.904762,0.915476,1
3,1.0,0.946429,0.89881,0.931548,0.886905,0.89881,0.9125,2
4,10.0,0.946429,0.89881,0.931548,0.886905,0.89881,0.9125,2
1,0.01,0.925595,0.889881,0.922619,0.880952,0.904762,0.904762,4
0,0.001,0.830357,0.803571,0.827381,0.785714,0.797619,0.808929,5


## KNN

In [61]:
pl_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier())

param_grid = {"kneighborsclassifier__n_neighbors":[3,5,7,10,20]}

grid_cv_knn = GridSearchCV(pl_knn, param_grid, cv=kfolds)
grid_cv_knn.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__n_neighbors': [3, 5, 7, 10,
                                                               20]})

In [73]:
pd.DataFrame(grid_cv_knn.cv_results_)\
    .iloc[:, np.r_[4,6:12,13]].rename(columns={"param_kneighborsclassifier__n_neighbors":"n_neighbors"})\
    .sort_values("rank_test_score")

Unnamed: 0,n_neighbors,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
4,20,0.96131,0.916667,0.9375,0.904762,0.916667,0.927381,1
3,10,0.955357,0.904762,0.946429,0.901786,0.919643,0.925595,2
2,7,0.946429,0.89881,0.940476,0.910714,0.922619,0.92381,3
0,3,0.928571,0.901786,0.940476,0.910714,0.928571,0.922024,4
1,5,0.934524,0.901786,0.934524,0.910714,0.925595,0.921429,5


## Decision Tree

In [66]:
pl_dt = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier())

param_grid = {"decisiontreeclassifier__max_depth":[None,20,10,5],
              "decisiontreeclassifier__min_samples_split":[2,5,10],
              "decisiontreeclassifier__min_samples_leaf":[1,5,10],
              "decisiontreeclassifier__max_features":[None,"auto"]}

grid_cv_dt = GridSearchCV(pl_dt, param_grid, cv=kfolds)
grid_cv_dt.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [None, 20, 10, 5],
                         'decisiontreeclassifier__max_features': [None, 'auto'],
                         'decisiontreeclassifier__min_samples_leaf': [1, 5, 10],
                         'decisiontreeclassifier__min_samples_split': [2, 5,
                                                                       10]})

In [71]:
pd.DataFrame(grid_cv_dt.cv_results_)\
    .iloc[:, np.r_[4:8, 9:15, 16]]\
    .rename(columns={"param_decisiontreeclassifier__max_depth":"max_depth",
                     "param_decisiontreeclassifier__max_features":"max_features",
                     "param_decisiontreeclassifier__min_samples_leaf":"min_samples_leaf",
                     "param_decisiontreeclassifier__min_samples_split":"min_samples_split"})\
    .sort_values("rank_test_score")\
    .head(10)

Unnamed: 0,max_depth,max_features,min_samples_leaf,min_samples_split,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
45,10,auto,1,2,0.946429,0.916667,0.91369,0.89881,0.922619,0.919643,1
35,20,auto,10,10,0.955357,0.883929,0.9375,0.904762,0.89881,0.916071,2
62,5,,10,10,0.9375,0.901786,0.928571,0.910714,0.89881,0.915476,3
61,5,,10,5,0.9375,0.901786,0.928571,0.910714,0.89881,0.915476,3
60,5,,10,2,0.9375,0.901786,0.928571,0.910714,0.89881,0.915476,3
47,10,auto,1,10,0.949405,0.910714,0.91369,0.89881,0.904762,0.915476,3
24,20,,10,2,0.9375,0.904762,0.928571,0.910714,0.895833,0.915476,7
25,20,,10,5,0.9375,0.904762,0.928571,0.910714,0.895833,0.915476,7
26,20,,10,10,0.9375,0.904762,0.928571,0.910714,0.895833,0.915476,7
43,10,,10,5,0.9375,0.904762,0.928571,0.910714,0.895833,0.915476,7


## Random Forest

In [75]:
pl_rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier())

param_grid = {"randomforestclassifier__n_estimators":[200,400,800],
              "randomforestclassifier__max_depth":[None,5,10,20],
              "randomforestclassifier__min_samples_split":[2,5,10],
              "randomforestclassifier__max_features":["auto",None]}

grid_cv_rf = GridSearchCV(pl_rf, param_grid, cv=kfolds)
grid_cv_rf.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [None, 5, 10, 20],
                         'randomforestclassifier__max_features': ['auto', None],
                         'randomforestclassifier__min_samples_split': [2, 5,
                                                                       10],
                         'randomforestclassifier__n_estimators': [200, 400, 800,
                                                                  1000]})

In [79]:
pd.DataFrame(grid_cv_rf.cv_results_)\
    .iloc[:, np.r_[4:8, 9:15, 16]]\
    .rename(columns={"param_randomforestclassifier__n_estimators":"n_estimators",
                     "param_randomforestclassifier__max_depth":"max_depth",
                     "param_randomforestclassifier__min_samples_split":"min_samples_split",
                     "randomforestclassifier__max_features":"max_features"})\
    .sort_values("rank_test_score")\
    .head(10)

Unnamed: 0,max_depth,param_randomforestclassifier__max_features,min_samples_split,n_estimators,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
4,,auto,5,200,0.958333,0.928571,0.949405,0.904762,0.9375,0.935714,1
7,,auto,5,1000,0.955357,0.925595,0.952381,0.904762,0.9375,0.935119,2
78,20.0,auto,5,800,0.955357,0.925595,0.943452,0.907738,0.940476,0.934524,3
77,20.0,auto,5,400,0.958333,0.925595,0.943452,0.904762,0.9375,0.933929,4
79,20.0,auto,5,1000,0.955357,0.925595,0.946429,0.901786,0.9375,0.933333,5
49,10.0,auto,2,400,0.955357,0.925595,0.949405,0.907738,0.928571,0.933333,5
1,,auto,2,400,0.958333,0.919643,0.943452,0.907738,0.9375,0.933333,7
54,10.0,auto,5,800,0.958333,0.925595,0.940476,0.904762,0.9375,0.933333,7
58,10.0,auto,10,800,0.952381,0.925595,0.949405,0.901786,0.934524,0.932738,9
3,,auto,2,1000,0.949405,0.928571,0.946429,0.907738,0.931548,0.932738,9


## Support Vector Classifier

In [81]:
pl_svc = make_pipeline(
    StandardScaler(),
    SVC())

param_grid = [{"svc__kernel":["rbf"],
               "svc__C":[100,10,1,0.1,0.01,0.001],
               "svc__gamma":[100,10,1,0.1,0.01,0.001]},
              {"svc__kernel":["linear"],
               "svc__C":[100,10,1,0.1,0.01,0.001]}]

grid_cv_svc = GridSearchCV(pl_svc, param_grid, cv=kfolds)
grid_cv_svc.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             param_grid=[{'svc__C': [100, 10, 1, 0.1, 0.01, 0.001],
                          'svc__gamma': [100, 10, 1, 0.1, 0.01, 0.001],
                          'svc__kernel': ['rbf']},
                         {'svc__C': [100, 10, 1, 0.1, 0.01, 0.001],
                          'svc__kernel': ['linear']}])

In [85]:
pd.DataFrame(grid_cv_rf.cv_results_)\
    .iloc[:, np.r_[4:7, 8:14, 15]]\
    .rename(columns={"param_svc__C":"C",
                     "param_svc__gamma":"gamma",
                     "param_svc__kernel":"kernel"})\
    .sort_values("rank_test_score")\
    .head(10)

Unnamed: 0,C,gamma,kernel,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
14,1.0,1.0,rbf,0.964286,0.931548,0.940476,0.904762,0.931548,0.934524,1
3,100.0,0.1,rbf,0.967262,0.922619,0.943452,0.901786,0.919643,0.930952,2
9,10.0,0.1,rbf,0.964286,0.919643,0.934524,0.901786,0.931548,0.930357,3
20,0.1,1.0,rbf,0.96131,0.91369,0.9375,0.904762,0.928571,0.929167,4
10,10.0,0.01,rbf,0.96131,0.919643,0.934524,0.910714,0.916667,0.928571,5
8,10.0,1.0,rbf,0.952381,0.925595,0.940476,0.910714,0.910714,0.927976,6
15,1.0,0.1,rbf,0.970238,0.91369,0.928571,0.89881,0.925595,0.927381,7
4,100.0,0.01,rbf,0.964286,0.919643,0.928571,0.89881,0.919643,0.92619,8
2,100.0,1.0,rbf,0.934524,0.91369,0.949405,0.91369,0.901786,0.922619,9
21,0.1,0.1,rbf,0.946429,0.901786,0.9375,0.892857,0.916667,0.919048,10


## Multi-layer Perceptron Classifier

In [89]:
pl_mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier())

param_grid = {"mlpclassifier__activation":["relu","logistic"],
              "mlpclassifier__learning_rate":["constant","adaptive"],
              "mlpclassifier__alpha":[0.00001,0.0001,0.001,0.01,1],
              "mlpclassifier__hidden_layer_sizes":[(100,),(8,5),(10,4)]}

grid_cv_mlp = GridSearchCV(pl_mlp, param_grid, cv=kfolds)
grid_cv_mlp.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('mlpclassifier', MLPClassifier())]),
             param_grid={'mlpclassifier__activation': ['relu', 'logistic'],
                         'mlpclassifier__alpha': [1e-05, 0.0001, 0.001, 0.01,
                                                  1],
                         'mlpclassifier__hidden_layer_sizes': [(100,), (8, 5),
                                                               (10, 4)],
                         'mlpclassifier__learning_rate': ['constant',
                                                          'adaptive']})

In [90]:
pd.DataFrame(grid_cv_mlp.cv_results_)\
    .iloc[:, np.r_[4:8, 9:15, 16]]\
    .rename(columns={"param_mlpclassifier__activation":"activation",
                     "param_mlpclassifier__learning_rate":"learning_rate",
                     "param_mlpclassifier__alpha":"alpha",
                     "param_mlpclassifier__hidden_layer_sizes":"layer_size"})\
    .sort_values("rank_test_score")\
    .head(10)

Unnamed: 0,activation,alpha,layer_size,learning_rate,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,rank_test_score
14,relu,0.001,"(8, 5)",constant,0.964286,0.925595,0.9375,0.907738,0.943452,0.935714,1
7,relu,0.0001,"(100,)",adaptive,0.967262,0.934524,0.931548,0.904762,0.928571,0.933333,2
19,relu,0.01,"(100,)",adaptive,0.967262,0.934524,0.934524,0.901786,0.928571,0.933333,2
18,relu,0.01,"(100,)",constant,0.967262,0.928571,0.931548,0.904762,0.931548,0.932738,4
12,relu,0.001,"(100,)",constant,0.964286,0.925595,0.934524,0.910714,0.928571,0.932738,4
1,relu,1e-05,"(100,)",adaptive,0.964286,0.931548,0.934524,0.901786,0.928571,0.932143,6
0,relu,1e-05,"(100,)",constant,0.967262,0.928571,0.931548,0.904762,0.925595,0.931548,7
4,relu,1e-05,"(10, 4)",constant,0.964286,0.922619,0.946429,0.89881,0.925595,0.931548,7
17,relu,0.001,"(10, 4)",adaptive,0.964286,0.925595,0.928571,0.895833,0.940476,0.930952,9
13,relu,0.001,"(100,)",adaptive,0.964286,0.925595,0.931548,0.907738,0.925595,0.930952,9
