In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import GridSearchCV

In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

# Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1)

parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

lr_cv = GridSearchCV(estimator=lr, param_grid=parameters, scoring='f1', cv=5)
lr_cv.fit(X_train, y_train.values.ravel())
print('Mean Test Scores:', lr_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.74755263 0.75190371 0.75586711 0.75528018 0.75632574 0.75670737
 0.75670737]


In [5]:
joblib.dump(lr_cv.best_estimator_, 'LR.pkl')

['LR.pkl']

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1)

parameters = {
    'n_estimators': [5, 25, 50, 100, 250],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2, 4, 8, 16, 32, 64, None]
}

rf_cv = GridSearchCV(estimator=rf, param_grid=parameters, scoring='f1', cv=5)
rf_cv.fit(X_train, y_train.values.ravel())
print('Mean Test Scores:', rf_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.69237474 0.73182731 0.72220465 0.73025216 0.73764485 0.69237474
 0.73182731 0.72220465 0.73025216 0.73764485 0.72285302 0.73853663
 0.75083702 0.74002338 0.73960907 0.73884146 0.74358443 0.74043022
 0.74045931 0.7484626  0.73884146 0.74358443 0.74043022 0.74045931
 0.7484626  0.73864298 0.74457589 0.75000468 0.75033819 0.75165126
 0.75610266 0.768573   0.76952736 0.77074061 0.77324189 0.75610266
 0.768573   0.76952736 0.77074061 0.77324189 0.74966906 0.76549025
 0.77122256 0.77035971 0.77049728 0.75319294 0.78342263 0.78693052
 0.79218129 0.79462157 0.75319294 0.78342263 0.78693052 0.79218129
 0.79462157 0.75652386 0.7838905  0.79066051 0.79715337 0.79639746
 0.76156544 0.77908816 0.78590722 0.78727252 0.78936589 0.76156544
 0.77908816 0.78590722 0.78727252 0.78936589 0.75697841 0.78217213
 0.78566358 0.78721453 0.79513634 0.76156544 0.77908816 0.78590722
 0.78727252 0.78936589 0.76156544 0.77908816 0.78590722 0.78727252
 0.78936589 0.75697841 0.78217213 0.78566358

In [7]:
joblib.dump(rf_cv.best_estimator_, 'RF.pkl')

['RF.pkl']

# Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=1)

parameters = {
    'n_estimators' : [5, 50, 100, 250, 500], 
    'max_depth' : [2, 4, 6, 8, 10],
    'learning_rate' : [0.01, 0.1, 1, 10, 100]
}

gb_cv = GridSearchCV(estimator=gb, param_grid=parameters, scoring='f1', cv=5)
gb_cv.fit(X_train, y_train.values.ravel())
print('Mean Test Scores:', gb_cv.cv_results_['mean_test_score'])

Mean Test Scores: [0.         0.68873853 0.74276136 0.75732814 0.7556091  0.
 0.7367717  0.75573944 0.75956963 0.75960938 0.         0.73694518
 0.75355247 0.76556043 0.77039464 0.         0.74097311 0.76250125
 0.7758012  0.78193158 0.         0.74948072 0.76436776 0.77536204
 0.77976962 0.71109129 0.75425612 0.75727931 0.75488452 0.75777087
 0.73939306 0.75722456 0.76104612 0.76571197 0.77526248 0.73034465
 0.76590472 0.77239248 0.77641234 0.77744522 0.74049767 0.77318926
 0.77923533 0.77748262 0.77785026 0.74249829 0.77938935 0.78130533
 0.77455763 0.77550989 0.75106615 0.73706282 0.73139384 0.73213379
 0.7426004  0.7411674  0.73882542 0.75305531 0.75273216 0.75447559
 0.74248942 0.75708969 0.76107577 0.76730132 0.76866642 0.74511557
 0.76857396 0.77597235 0.77243855 0.77407479 0.7447867  0.76300128
 0.77069578 0.76862011 0.76911544 0.14512369 0.14512369 0.14512369
 0.14512369 0.14512369 0.18910724 0.18910724 0.18910724 0.18910724
 0.18910724 0.55491292 0.54773541 0.54773541 0.54773

In [9]:
joblib.dump(gb_cv.best_estimator_, 'GB.pkl')

['GB.pkl']