In [1]:
import pandas as pd
import numpy as np

import os,gc,copy
import warnings


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import loguniform
from sklearn import metrics
from sklearn.model_selection import train_test_split


import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

warnings.filterwarnings("ignore")

In [2]:
X_train = pd.read_csv("X_tr_encoded.csv")
Y_train = pd.read_csv("y_tr.csv")
y_train = Y_train

In [3]:
%%time
## Tune logistic using both l1 ad l2 regularization

cols_log_from_sfs = ['Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'StreamingTV', 'Contract', 'PaymentMethod', 'MonthlyCharges']
grid = dict()
grid['solver'] = ['newton-cg', 'lbfgs', 'liblinear','saga']
grid['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
grid['C'] = np.logspace(-3,3,9)

logreg=LogisticRegression()
# columns_logit = [col for col in X_train.columns if col != "customerID"]
logreg_cv=GridSearchCV(logreg,grid,cv=5)
logreg_cv.fit(X_train[cols_log_from_sfs],y_train)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy : 0.7762335075460978
Wall time: 34.2 s


In [4]:
%%time 
## Tune decision trees
grid = {
    'max_depth': [5, 10, 15 ,20],
    'min_samples_leaf': [100, 150, 200],
    'criterion': ["gini", "entropy","log_loss"],
    'max_features' : ['auto','sqrt','log2']
}
cols_dt_from_sfs = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaperlessBilling']
dt_model = DecisionTreeClassifier(random_state = 111)
# columns_dt = [col for col in X_train.columns if col != "customerID"]
dtreg_cv=GridSearchCV(dt_model,grid,cv=5,scoring = "accuracy")
dtreg_cv.fit(X_train[cols_dt_from_sfs],y_train)
print("tuned hpyerparameters :(best parameters) ",dtreg_cv.best_params_)
print("accuracy :",dtreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 200}
accuracy : 0.7782874697937545
Wall time: 3.1 s


In [5]:
%%time 
## Tune Random Forest
grid = {
    'n_estimators' : [25,50,125,150,175],
    'max_depth': [5, 10, 15,20],
    'min_samples_leaf': [3, 5, 10, 20, 50],
    'criterion': ["gini"],
    'max_features' : ['auto']
}
cols_rf_from_sfs = ['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'Contract', 'MonthlyCharges', 'TotalCharges']
rf_model = RandomForestClassifier(random_state = 96,n_jobs=-1)
# columns_rf = [col for col in X_train.columns if col != "customerID"]
rfreg_cv=GridSearchCV(rf_model,grid,cv=5,scoring = "accuracy",verbose = -1)
rfreg_cv.fit(X_train[cols_rf_from_sfs],y_train)
print("tuned hpyerparameters :(best parameters) ",rfreg_cv.best_params_)
print("accuracy :",rfreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 175}
accuracy : 0.8005674715421265
Wall time: 1min 36s


In [6]:
%%time 
## Tune GBM Classifier


def objective(space):

    gbm = GradientBoostingClassifier(n_estimators = space['n_estimators'],
                            max_depth = int(space['max_depth']),
                            learning_rate = space['learning_rate'],
                            subsample = space['subsample']
                            )
    
    cols_gb_from_sfs = ['tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaymentMethod']
#     columns_gb = [col for col in X_train.columns if col != "customerID"]
    gbm.fit(X_train[cols_gb_from_sfs], Y_train)

    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator = gbm, X = X_train[cols_gb_from_sfs], y = Y_train, cv = 5)
    CrossValMean = accuracies.mean()

    print("CrossValMean:", CrossValMean)

    return{'loss':1-CrossValMean, 'status': STATUS_OK }

# space = {
#     'max_depth' : hp.choice('max_depth', range(5, 30, 1)),
#     'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
#     'n_estimators' : hp.choice('n_estimators', range(20, 205, 5)),
#     'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
#     'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
#     'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
#     'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}

space = {'max_depth': hp.choice('max_depth', range(5, 30, 1)),
        'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
        'subsample':  hp.quniform('subsample', 0.1, 1, 0.01)}


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best)



CrossValMean:                                                                                                          
0.6733584349574459                                                                                                     
CrossValMean:                                                                                                          
0.7580611805256355                                                                                                     
CrossValMean:                                                                                                          
0.783658468051627                                                                                                      
CrossValMean:                                                                                                          
0.7779725130971783                                                                                                     
CrossValMean:                           

CrossValMean:                                                                                                          
0.7808146163884883                                                                                                     
CrossValMean:                                                                                                          
0.7654857663800586                                                                                                     
CrossValMean:                                                                                                          
0.7684873461588896                                                                                                     
CrossValMean:                                                                                                          
0.7978813479946799                                                                                                     
CrossValMean:                           

In [7]:
%%time


def objective(space):

    xgb = XGBClassifier(n_estimators = space['n_estimators'],
                            max_depth = int(space['max_depth']),
                            learning_rate = space['learning_rate'],
                            gamma = space['gamma'],
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                            colsample_bytree = space['colsample_bytree']
                            )
    cols_xgb_from_sfs = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Contract', 'MonthlyCharges', 'TotalCharges']
#     columns_xgb = [col for col in X_train.columns if col != "customerID"]
    xgb.fit(X_train[cols_xgb_from_sfs], Y_train)

    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator = xgb, X = X_train[cols_xgb_from_sfs], y = Y_train, cv = 5)
    CrossValMean = accuracies.mean()

    print("CrossValMean:", CrossValMean)

    return{'loss':1-CrossValMean, 'status': STATUS_OK }

space = {
    'max_depth' : hp.choice('max_depth', range(5, 30, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(20, 205, 5)),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best)


CrossValMean:                                                                                                          
0.7730725761634478                                                                                                     
CrossValMean:                                                                                                          
0.7806558892031795                                                                                                     
CrossValMean:                                                                                                          
0.7738617162767174                                                                                                     
CrossValMean:                                                                                                          
0.7820788141043653                                                                                                     
CrossValMean:                           

CrossValMean:                                                                                                          
0.7898219782827242                                                                                                     
CrossValMean:                                                                                                          
0.7966180244647172                                                                                                     
CrossValMean:                                                                                                          
0.7827115998226651                                                                                                     
CrossValMean:                                                                                                          
0.7893461713779042                                                                                                     
CrossValMean:                           