In [32]:
import numpy as np 
import pandas as pd 
 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import  confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.preprocessing import StandardScaler  
from sklearn.neighbors import KNeighborsClassifier 

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE 
import matplotlib.pyplot as plt 

df = pd.read_csv('journeys.csv') 
targets = df['Conversion'] 
df.drop(['Journey Start Date', 'Journey End Date', 'Events Combo', 'User-Journey'], axis=1, inplace=True) 
df = pd.get_dummies(df) 
 
X_train, X_test, y_train, y_test = train_test_split(df, targets, test_size=0.3) 
X_train.drop(['Conversion'], axis=1, inplace=True) 
X_test.drop(['Conversion'], axis=1, inplace=True) 

scaler = StandardScaler() 
scaler.fit(X_train) 
 
s_X_train = scaler.transform(X_train) 
s_X_test = scaler.transform(X_test) 

X_train = pd.DataFrame(s_X_train, columns = X_train.columns)
X_test = pd.DataFrame(s_X_test, columns = X_test.columns)

In [81]:
param_grid = { 
    'min_samples_split': [3, 5, 10],  
    'n_estimators' : [300], 
    'max_depth': [3, 5, 15, 25], 
    'max_features': [3, 5, 10, 19]

} 
  
#https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65  
def grid_search_wrapper(_model = '', refit_score='fb_score', param_grid=param_grid): 
    """ 
    fits a GridSearchCV classifier using refit_score for optimization 
    prints classifier performance metrics 
    """ 
    grid_search = GridSearchCV(_model, param_grid, scoring=make_scorer(f1_score), refit=refit_score, 
        return_train_score=True, n_jobs=-1, error_score='raise', cv=10) 
    grid_search.fit(X_train.values, y_train.values) 
 
    # make the predictions 
    y_pred = grid_search.predict(X_test.values) 
 
    print('Best params for {}'.format(refit_score)) 
    print(grid_search.best_params_) 
 
    # confusion matrix on the test data. 
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score)) 
    print(pd.DataFrame(confusion_matrix(y_test, y_pred), 
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])) 
    return grid_search

def show_model_output(grid_search_output, num_to_show = 10): 
    try:
        df1 = pd.DataFrame(grid_search_output.cv_results_['params'])
        df2 = pd.DataFrame(grid_search_output.cv_results_['mean_train_score']).rename(columns={0: "mean_train_score"})
        df3 = pd.DataFrame(grid_search_output.cv_results_['mean_test_score']).rename(columns={0: "mean_test_score"})
        result = pd.concat([df2,df3,df1], axis = 1).sort_values(by='mean_test_score', ascending=False)
        result = result.head(num_to_show)
    except Exception:
    ## for some reason logistic regression doesnt have mean_train_score
        df1 = pd.DataFrame(grid_search_output.cv_results_['params'])
        df3 = pd.DataFrame(grid_search_output.cv_results_['mean_test_score']).rename(columns={0: "mean_test_score"})
        result = pd.concat([df3,df1], axis = 1).sort_values(by='mean_test_score', ascending=False)
        result = result.head(num_to_show)
    return result



### Random Forest

In [None]:
clf = RandomForestClassifier(n_jobs=-1)
grid_search_clf = grid_search_wrapper(_model = clf, refit_score='fb_score', param_grid=param_grid) 
#y_scores = grid_search_clf.predict_proba(X_test)[:, 1]  

In [None]:
show_model_output(grid_search_output= grid_search_clf, num_to_show=10)

### Logistic Regression

In [67]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-3, 3, 7),
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
}

In [79]:
logreg = LogisticRegression(random_state=42)
log_reg = GridSearchCV(logreg, param_grid=param_grid, scoring=make_scorer(f1_score), cv=10)
log_reg.fit(X_train, y_train)

140 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mark\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mark\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Mark\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newto

GridSearchCV(cv=10, estimator=LogisticRegression(random_state=42),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring=make_scorer(f1_score))

In [83]:
show_model_output(grid_search_output= log_reg, num_to_show=10)

Unnamed: 0,mean_test_score,C,penalty,solver
41,0.420749,1000.0,l2,liblinear
40,0.420749,1000.0,l2,lbfgs
39,0.420749,1000.0,l2,newton-cg
38,0.420749,1000.0,l1,liblinear
35,0.420749,100.0,l2,liblinear
34,0.420749,100.0,l2,lbfgs
33,0.420749,100.0,l2,newton-cg
32,0.420749,100.0,l1,liblinear
29,0.420749,10.0,l2,liblinear
28,0.420749,10.0,l2,lbfgs


### XGBoost

In [75]:
# https://www.kaggle.com/code/tilii7/hyperparameter-grid-search-with-xgboost/notebook
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [76]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

In [77]:
grid_search_xgb = grid_search_wrapper(_model = xgb, refit_score='fb_score', param_grid=param_grid) 



Parameters: { "C", "penalty", "silent", "solver" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Best params for fb_score
{'C': 0.001, 'penalty': 'l1', 'solver': 'newton-cg'}

Confusion matrix of Random Forest optimized for fb_score on the test data:
     pred_neg  pred_pos
neg      1948        10
pos       115       165


In [78]:
show_model_output(grid_search_output= grid_search_xgb, num_to_show=10)

Unnamed: 0,mean_train_score,mean_test_score,C,penalty,solver
0,0.728605,0.697394,0.001,l1,newton-cg
31,0.728605,0.697394,100.0,l1,lbfgs
23,0.728605,0.697394,1.0,l2,liblinear
24,0.728605,0.697394,10.0,l1,newton-cg
25,0.728605,0.697394,10.0,l1,lbfgs
26,0.728605,0.697394,10.0,l1,liblinear
27,0.728605,0.697394,10.0,l2,newton-cg
28,0.728605,0.697394,10.0,l2,lbfgs
29,0.728605,0.697394,10.0,l2,liblinear
30,0.728605,0.697394,100.0,l1,newton-cg


### SHAP

##### Random Forest

In [None]:
model = grid_search_clf.best_estimator_.fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

In [None]:
shap.summary_plot(shap_values[1], X_train, plot_type = 'bar', class_names=model.classes_)

In [None]:
shap.summary_plot(shap_values[1], X_train, class_names=model.classes_)

##### Logistic Regression

In [None]:
model = grid_search_lr.best_estimator_.fit(X_train, y_train)
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train)

In [None]:
shap.summary_plot(shap_values, X_train, plot_type="bar", class_names=model.classes_)

In [None]:
shap.summary_plot(shap_values, X_train, plot_type="dot", class_names=model.classes_)

##### XGBooost

In [None]:
model = grid_search_xgb.best_estimator_.fit(X_train, y_train)
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train)

In [None]:
shap.summary_plot(shap_values, X_train, plot_type="bar")

In [None]:
shap.summary_plot(shap_values, X_train, plot_type="dot", class_names=model.classes_)