In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [3]:
# scale features
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)

# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1)

parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

lr_cv = GridSearchCV(lr, parameters, cv=5)
lr_cv.fit(X_train_std, y_train.values.ravel())
print('Mean Test Scores:', lr_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.78011834 0.80071006 0.80047337 0.80260355 0.80307692 0.80260355
 0.80260355]


In [5]:
joblib.dump(lr_cv.best_estimator_, 'LR.pkl')

['LR.pkl']

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1)

parameters = {
    'n_estimators': [5, 25, 50, 100, 250],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2, 4, 8, 16, 32, 64, None]
}

rf_cv = GridSearchCV(rf, parameters, cv=5)
rf_cv.fit(X_train_std, y_train.values.ravel())
print('Mean Test Scores:', rf_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.77349112 0.75502959 0.75218935 0.75053254 0.75526627 0.77349112
 0.75502959 0.75218935 0.75053254 0.75526627 0.73443787 0.74579882
 0.73940828 0.74343195 0.74508876 0.78272189 0.78840237 0.78650888
 0.78769231 0.78627219 0.78272189 0.78840237 0.78650888 0.78769231
 0.78627219 0.77065089 0.78650888 0.78366864 0.78745562 0.78414201
 0.78224852 0.79573964 0.7947929  0.79786982 0.79739645 0.78224852
 0.79573964 0.7947929  0.79786982 0.79739645 0.78295858 0.79218935
 0.79502959 0.79573964 0.7983432  0.77017751 0.7808284  0.78461538
 0.7860355  0.7895858  0.77017751 0.7808284  0.78461538 0.7860355
 0.7895858  0.76757396 0.78792899 0.78887574 0.78556213 0.78792899
 0.76544379 0.78343195 0.78508876 0.78508876 0.78840237 0.76544379
 0.78343195 0.78508876 0.78508876 0.78840237 0.76733728 0.78745562
 0.79218935 0.78863905 0.78982249 0.76544379 0.78343195 0.78508876
 0.78508876 0.78840237 0.76544379 0.78343195 0.78508876 0.78508876
 0.78840237 0.76733728 0.78745562 0.79218935 

In [7]:
joblib.dump(rf_cv.best_estimator_, 'RF.pkl')

['RF.pkl']

# Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=1)

parameters = {
    'n_estimators' : [5, 50, 100, 250, 500], 
    'max_depth' : [2, 4, 6, 8, 10],
    'learning_rate' : [0.01, 0.1, 1, 10, 100]
}

gb_cv = GridSearchCV(gb, parameters, cv=5)
gb_cv.fit(X_train_std, y_train.values.ravel())
print('Mean Test Scores:', gb_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.72804734 0.72804734 0.75786982 0.78792899 0.7964497  0.72804734
 0.75408284 0.77680473 0.79621302 0.80094675 0.72804734 0.75502959
 0.77633136 0.79431953 0.79147929 0.72804734 0.75431953 0.77467456
 0.78485207 0.78035503 0.72804734 0.75195266 0.7635503  0.77585799
 0.77514793 0.72804734 0.79763314 0.80284024 0.80047337 0.79053254
 0.75408284 0.80260355 0.79550296 0.78461538 0.77751479 0.75597633
 0.79100592 0.78721893 0.77940828 0.77704142 0.75526627 0.78295858
 0.77822485 0.77278107 0.77278107 0.75147929 0.77940828 0.77491124
 0.7756213  0.77727811 0.78698225 0.77988166 0.76781065 0.74532544
 0.74011834 0.78319527 0.74769231 0.74319527 0.75005917 0.75242604
 0.76473373 0.74698225 0.75218935 0.76284024 0.76828402 0.75005917
 0.75195266 0.76449704 0.76828402 0.7704142  0.73230769 0.76071006
 0.77349112 0.77183432 0.77538462 0.29893491 0.29893491 0.29893491
 0.29893491 0.29893491 0.48118343 0.48118343 0.48118343 0.48118343
 0.48118343 0.46272189 0.45538462 0.45538462

In [9]:
joblib.dump(gb_cv.best_estimator_, 'GB.pkl')

['GB.pkl']