In [117]:
import numpy as np 
import pandas as pd 
 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import  confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier 

from imblearn.over_sampling import SMOTE 
import matplotlib.pyplot as plt 

df = pd.read_csv('journeys.csv') 
targets = df['Conversion'] 
df.drop(['Journey Start Date', 'Journey End Date', 'Events Combo', 'User-Journey'], axis=1, inplace=True) 
df = pd.get_dummies(df) 
 
X_train, X_test, y_train, y_test = train_test_split(df, targets, stratify=targets) 
X_train.drop(['Conversion'], axis=1, inplace=True) 
X_test.drop(['Conversion'], axis=1, inplace=True) 

#scaler = StandardScaler() 
#scaler.fit(X_train) 
 
#X_train = scaler.transform(X_train) 
#X_test = scaler.transform(X_test) b

In [153]:
param_grid = { 
    'min_samples_split': [3, 5, 10],  
    'n_estimators' : [300], 
    'max_depth': [3, 5, 15, 25], 
    'max_features': [3, 5, 10, 19]

} 
  
def grid_search_wrapper(_model = '', refit_score='fb_score', param_grid=param_grid): 
    """ 
    fits a GridSearchCV classifier using refit_score for optimization 
    prints classifier performance metrics 
    """ 
    skf = StratifiedKFold(n_splits=10) 
    grid_search = GridSearchCV(_model, param_grid, scoring=make_scorer(f1_score), refit=refit_score, 
                           cv=skf, return_train_score=True, n_jobs=-1, error_score='raise') 
    grid_search.fit(X_train.values, y_train.values) 
 
    # make the predictions 
    y_pred = grid_search.predict(X_test.values) 
 
    print('Best params for {}'.format(refit_score)) 
    print(grid_search.best_params_) 
 
    # confusion matrix on the test data. 
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score)) 
    print(pd.DataFrame(confusion_matrix(y_test, y_pred), 
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])) 
    return grid_search

def show_model_output(grid_search_output, num_to_show = 10): 
    df1 = pd.DataFrame(grid_search_output.cv_results_['params'])
    df2 = pd.DataFrame(grid_search_output.cv_results_['mean_train_score']).rename(columns={0: "mean_train_score"})
    df3 = pd.DataFrame(grid_search_output.cv_results_['mean_test_score']).rename(columns={0: "mean_test_score"})
    result = pd.concat([df2,df3,df1], axis = 1).sort_values(by='mean_test_score', ascending=False)
    result = result.head(num_to_show)
    return result



### Random Forest

In [154]:
clf = RandomForestClassifier(n_jobs=-1)
grid_search_clf = grid_search_wrapper(_model = clf, refit_score='fb_score', param_grid=param_grid) 
#y_scores = grid_search_clf.predict_proba(X_test)[:, 1]  

Best params for fb_score
{'max_depth': 15, 'max_features': 10, 'min_samples_split': 10, 'n_estimators': 300}

Confusion matrix of Random Forest optimized for fb_score on the test data:
     pred_neg  pred_pos
neg      1598         7
pos       110       150


In [155]:
show_model_output(grid_search_output= grid_search_clf, num_to_show=10)

Unnamed: 0,mean_train_score,mean_test_score,max_depth,max_features,min_samples_split,n_estimators
32,0.718585,0.693738,15,10,10,300
22,0.694607,0.691036,5,19,5,300
47,0.722778,0.690112,25,19,10,300
44,0.718138,0.689929,25,10,10,300
41,0.717524,0.689854,25,5,10,300
29,0.716529,0.689224,15,5,10,300
31,0.737654,0.687999,15,10,5,300
21,0.693772,0.687967,5,19,3,300
23,0.692815,0.687967,5,19,10,300
28,0.734711,0.686524,15,5,5,300


### Logistic Regression

In [167]:
param_grid = {
     'penalty': ['l1', 'l2', 'elasticnet'],
     'l1_ratio': [0, .25, .5, .75, 1]

}

In [169]:
logreg = LogisticRegression(random_state=0, solver='saga')
grid_search_lr = grid_search_wrapper(_model = logreg, refit_score='fb_score', param_grid=param_grid) 

Best params for fb_score
{'l1_ratio': 1, 'penalty': 'elasticnet'}

Confusion matrix of Random Forest optimized for fb_score on the test data:
     pred_neg  pred_pos
neg      1605         0
pos       194        66




In [162]:
show_model_output(grid_search_output= grid_search_lr, num_to_show=10)

Unnamed: 0,mean_train_score,mean_test_score,l1_ratio,penalty
0,0.419341,0.414091,0.0,l1
3,0.419341,0.414091,0.25,l1
6,0.419341,0.414091,0.5,l1
9,0.419341,0.414091,0.75,l1
12,0.419341,0.414091,1.0,l1
14,0.419341,0.414091,1.0,elasticnet
11,0.416935,0.409374,0.75,elasticnet
8,0.416857,0.407908,0.5,elasticnet
5,0.415421,0.406161,0.25,elasticnet
1,0.413592,0.401792,0.0,l2
