In [16]:
import numpy as np 
import pandas as pd 
 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import  confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier 

from imblearn.over_sampling import SMOTE 
import matplotlib.pyplot as plt 

df = pd.read_csv('journeys.csv') 
targets = df['Conversion'] 
df.drop(['Journey Start Date', 'Journey End Date', 'Events Combo', 'User-Journey'], axis=1, inplace=True) 
df = pd.get_dummies(df) 
 
X_train, X_test, y_train, y_test = train_test_split(df, targets, stratify=targets) 
X_train.drop(['Conversion'], axis=1, inplace=True) 
X_test.drop(['Conversion'], axis=1, inplace=True) 

#scaler = StandardScaler() 
#scaler.fit(X_train) 
 
#X_train = scaler.transform(X_train) 
#X_test = scaler.transform(X_test) b

In [107]:
param_grid = { 
    'min_samples_split': [3, 5, 10],  
    'n_estimators' : [300], 
    'max_depth': [3, 5, 15, 25], 
    'max_features': [3, 5, 10, 19]

} 
  
def grid_search_wrapper(clf = '', refit_score='fb_score', param_grid=param_grid): 
    """ 
    fits a GridSearchCV classifier using refit_score for optimization 
    prints classifier performance metrics 
    """ 
    skf = StratifiedKFold(n_splits=10) 
    grid_search = GridSearchCV(clf, param_grid, scoring=make_scorer(f1_score), refit=refit_score, 
                           cv=skf, return_train_score=True, n_jobs=-1) 
    grid_search.fit(X_train.values, y_train.values) 
 
    # make the predictions 
    y_pred = grid_search.predict(X_test.values) 
 
    print('Best params for {}'.format(refit_score)) 
    print(grid_search.best_params_) 
 
    # confusion matrix on the test data. 
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score)) 
    print(pd.DataFrame(confusion_matrix(y_test, y_pred), 
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])) 
    return grid_search

def show_model_output(grid_search_output): 
    df1 = pd.DataFrame(grid_search_output.cv_results_['params'])
    df2 = pd.DataFrame(grid_search_output.cv_results_['mean_train_score']).rename(columns={0: "mean_train_score"})
    df3 = pd.DataFrame(grid_search_output.cv_results_['mean_test_score']).rename(columns={0: "mean_test_score"})
    result = pd.concat([df2,df3,df1], axis = 1).sort_values(by='mean_test_score', ascending=False)
    return result



Best params for fb_score
{'max_depth': 15, 'max_features': 19, 'min_samples_split': 10, 'n_estimators': 300}

Confusion matrix of Random Forest optimized for fb_score on the test data:
     pred_neg  pred_pos
neg      1597         8
pos       122       138


In [None]:
clf = RandomForestClassifier(n_jobs=-1)
grid_search_clf = grid_search_wrapper(clf = clf, refit_score='fb_score') 
#y_scores = grid_search_clf.predict_proba(X_test)[:, 1]  

In [111]:
show_model_output(grid_search_output= grid_search_clf)

Unnamed: 0,mean_train_score,mean_test_score,max_depth,max_features,min_samples_split,n_estimators
35,0.733653,0.713308,15,19,10,300
47,0.735499,0.712731,25,19,10,300
29,0.733327,0.711813,15,5,10,300
44,0.733723,0.711118,25,10,10,300
32,0.732827,0.710578,15,10,10,300
26,0.733,0.707886,15,3,10,300
41,0.734837,0.706866,25,5,10,300
21,0.709923,0.706478,5,19,3,300
22,0.709923,0.706478,5,19,5,300
31,0.748804,0.706221,15,10,5,300
