In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [2]:
'''Load dataset and Preprocessing'''
df = pd.read_csv("online_shoppers_intention.csv")

# remove visitortype is other
df = df[df["VisitorType"] != "Other"]

# transform visitortype, weekend and revenue into numerical
df["VisitorType"] = df["VisitorType"].map({"New_Visitor": 0, "Returning_Visitor": 1})
df["Weekend"] = df["Weekend"].map({False: 0, True: 1})
df["Revenue"] = df["Revenue"].map({False: 0, True: 1})

# transform month into numerical by one-hot encoding
df = pd.get_dummies(df, columns=['Month'], prefix='Month')

# splitting visitortype into new and returning groups
new_visitors = df[df["VisitorType"] == 0]
returning_visitors = df[df["VisitorType"] == 1]

In [3]:
# get X, y from new / returning visitor, and deleting visitortype
X_new = new_visitors.drop(columns=['Revenue', 'VisitorType'])
y_new = new_visitors['Revenue']

X_return = returning_visitors.drop(columns=['Revenue', 'VisitorType'])
y_return = returning_visitors['Revenue']

# Columns needed to be standardized
numerical_cols = [
    'Administrative', 'Administrative_Duration', 'Informational',
    'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

# Train-test split
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, stratify=y_new, random_state=123)
X_train_return, X_test_return, y_train_return, y_test_return = train_test_split(
    X_return, y_return, test_size=0.2, stratify=y_return, random_state=123)

In [4]:
'''Step 1 Logistic Regression'''
# tuning hyperparameters

def cross_validate_lr(X, y, lr_param_grid, label=''):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    
    best_score = 0
    best_params = {}
    results = []

    for C in lr_param_grid['C']:
        for penalty in lr_param_grid['penalty']:
            for solver in lr_param_grid['solver']:
                
                acc_scores, prec_scores, rec_scores, f1_scores, roc_auc_scores = [], [], [], [], []
                
                for train_idx, val_idx in skf.split(X, y):
                    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                    
                    scaler = StandardScaler()
                    X_train_scaled = X_train.copy()
                    X_val_scaled = X_val.copy()
                    X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
                    X_val_scaled[numerical_cols] = scaler.transform(X_val[numerical_cols])

                    model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=5000, 
                                               random_state=123)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_val_scaled)
                    y_proba = model.predict_proba(X_val_scaled)[:, 1]
                    
                    acc_scores.append(accuracy_score(y_val, y_pred))
                    prec_scores.append(precision_score(y_val, y_pred, zero_division=0))
                    rec_scores.append(recall_score(y_val, y_pred, zero_division=0))
                    f1_scores.append(f1_score(y_val, y_pred, zero_division=0))
                    roc_auc_scores.append(roc_auc_score(y_val, y_proba))

                avg_acc = np.mean(acc_scores)
                avg_prec = np.mean(prec_scores)
                avg_rec = np.mean(rec_scores)
                avg_f1 = np.mean(f1_scores)
                avg_roc_auc = np.mean(roc_auc_scores)

                results.append({
                    'C': C,
                    'penalty': penalty,
                    'solver': solver,
                    'accuracy': avg_acc,
                    'precision': avg_prec,
                    'recall': avg_rec,
                    'f1_score': avg_f1,
                    'roc_auc': avg_roc_auc})

                if avg_f1 > best_score:
                    best_score = avg_f1
                    best_params = {
                        'C': C,
                        'penalty': penalty,
                        'solver': solver}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nLogistic Regression - {label} Results")
    print((df_results).head(5))

    return best_params, df_results

lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10], 
    'penalty': ['l1', 'l2'], 
    'solver': ['saga', 'liblinear']}
lrn_best_param, lrn_results = cross_validate_lr(X_train_new, y_train_new, lr_param_grid, 
                                                label='New Visitors')
lrr_best_param, lrr_results = cross_validate_lr(X_train_return, y_train_return, lr_param_grid, 
                                                label='Returning Visitors')


Best Params for New Visitors: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best F1-Score for New Visitors: 0.7701433666255743

Logistic Regression - New Visitors Results
       C penalty     solver  accuracy  precision    recall  f1_score   roc_auc
13   1.0      l1  liblinear  0.900369   0.910131  0.668657  0.770143  0.907095
12   1.0      l1       saga  0.900369   0.910131  0.668657  0.770143  0.907181
19  10.0      l2  liblinear  0.898155   0.906108  0.662774  0.764711  0.903744
17  10.0      l1  liblinear  0.898155   0.906108  0.662774  0.764711  0.904123
16  10.0      l1       saga  0.898155   0.906108  0.662774  0.764711  0.904313

Best Params for Returning Visitors: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Best F1-Score for Returning Visitors: 0.4425239793457375

Logistic Regression - Returning Visitors Results
       C penalty     solver  accuracy  precision    recall  f1_score   roc_auc
17  10.0      l1  liblinear  0.886256   0.698155  0.323963  0.442524  0.883485
16  10.0 

In [5]:
'''Step 5.2 Decision Tree'''
# tuning hyperparameters

def cross_validate_dt(X, y, dt_param_grid, label=''):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    
    best_score = 0
    best_params = {}
    results = []

    for max_depth in dt_param_grid['max_depth']:
        for criterion in dt_param_grid['criterion']:
                
                acc_scores, prec_scores, rec_scores, f1_scores, roc_auc_scores = [], [], [], [], []
                
                for train_idx, val_idx in skf.split(X, y):
                    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                    
                    model = DecisionTreeClassifier(
                    max_depth=max_depth,
                    criterion=criterion,
                    random_state=123)
                
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_val)
                    y_proba = model.predict_proba(X_val)[:, 1]
                    
                    acc_scores.append(accuracy_score(y_val, y_pred))
                    prec_scores.append(precision_score(y_val, y_pred, zero_division=0))
                    rec_scores.append(recall_score(y_val, y_pred, zero_division=0))
                    f1_scores.append(f1_score(y_val, y_pred, zero_division=0))
                    roc_auc_scores.append(roc_auc_score(y_val, y_proba))
    
                avg_acc = np.mean(acc_scores)
                avg_prec = np.mean(prec_scores)
                avg_rec = np.mean(rec_scores)
                avg_f1 = np.mean(f1_scores)
                avg_roc_auc = np.mean(roc_auc_scores)
    
                results.append({
                'max_depth': max_depth,
                'criterion': criterion,
                'accuracy': avg_acc,
                'precision': avg_prec,
                'recall': avg_rec,
                'f1_score': avg_f1,
                'roc_auc': avg_roc_auc})
    
                if avg_f1 > best_score:
                    best_score = avg_f1
                    best_params = {
                        'max_depth': max_depth,
                        'criterion': criterion}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nDecision Tree - {label} Results")
    print((df_results).head(5))

    return best_params, df_results


dt_param_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        # 'min_samples_leaf': [5, 10, 15],
        'criterion': ['gini', 'entropy']}
dtn_best_param, dtn_results = cross_validate_dt(X_train_new, y_train_new, dt_param_grid, 
                                                label='New Visitors')
dtr_best_param, dtr_results = cross_validate_dt(X_train_return, y_train_return, dt_param_grid, 
                                                label='Returning Visitors')


Best Params for New Visitors: {'max_depth': 2, 'criterion': 'entropy'}
Best F1-Score for New Visitors: 0.824148307451272

Decision Tree - New Visitors Results
   max_depth criterion  accuracy  precision    recall  f1_score   roc_auc
1          2   entropy  0.918081   0.889797  0.769359  0.824148  0.892593
5          4   entropy  0.918081   0.903618  0.754609  0.821500  0.907726
4          4      gini  0.917343   0.903520  0.751580  0.819642  0.882961
7          5   entropy  0.917343   0.903593  0.751668  0.819506  0.911730
3          3   entropy  0.915129   0.893901  0.751624  0.815869  0.903996

Best Params for Returning Visitors: {'max_depth': 5, 'criterion': 'entropy'}
Best F1-Score for Returning Visitors: 0.5865683928576264

Decision Tree - Returning Visitors Results
    max_depth criterion  accuracy  precision    recall  f1_score   roc_auc
7           5   entropy  0.896801   0.668456  0.525546  0.586568  0.916857
6           5      gini  0.897393   0.672065  0.516138  0.583556  0

In [11]:

rf = RandomForestClassifier(random_state=123)

param_grid = {
    'n_estimators': [150, 200, 250, 300, 350, 400],
    'max_depth': [5, 10, 15, 20, 25],
    'criterion': ['gini', 'entropy']}

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'roc_auc': make_scorer(roc_auc_score, needs_proba=True)
}

def tune_and_print(X_train, y_train, label):
    gs = GridSearchCV(
        rf,
        param_grid=param_grid,
        cv=5,
        scoring=scoring,
        refit='f1',
        n_jobs=-1,
        return_train_score=False
    )
    gs.fit(X_train, y_train)
    cv = gs.cv_results_
    df = pd.DataFrame({
        'max_depth': cv['param_max_depth'],
        'n_estimators': cv['param_n_estimators'],
        'criterion': cv['param_criterion'],
        'accuracy': cv['mean_test_accuracy'],
        'precision': cv['mean_test_precision'],
        'recall': cv['mean_test_recall'],
        'f1_score': cv['mean_test_f1'],
        'roc_auc': cv['mean_test_roc_auc'],
    })

    top5 = df.nlargest(5, 'f1_score')

    best_p = gs.best_params_
    best_f = gs.best_score_

    print(f"Best Params for {label}: {best_p}")
    print(f"Best F1-Score for {label}: {best_f}\n")
    print(f"Random Forest - {label} Results")
    print(top5.to_string(index=False))

    return best_p, df

rfn_best_param, rfn_results = tune_and_print(X_train_new, y_train_new, 'New Visitors')
rfr_best_param, rfr_results = tune_and_print(X_train_return, y_train_return, 'Returning Visitors')




Best Params for New Visitors: {'criterion': 'gini', 'max_depth': 15, 'n_estimators': 350}
Best F1-Score for New Visitors: 0.8284421549463039

Random Forest - New Visitors Results
max_depth n_estimators criterion  accuracy  precision   recall  f1_score  roc_auc
       15          350      gini  0.921771   0.905693 0.765935  0.828442 0.924818
       20          300   entropy  0.921033   0.899562 0.768920  0.827715 0.928671
       20          400   entropy  0.921033   0.899562 0.768920  0.827715 0.927771
       25          350   entropy  0.921033   0.899562 0.768920  0.827715 0.929486
       25          400   entropy  0.921033   0.899562 0.768920  0.827715 0.928326
Best Params for Returning Visitors: {'criterion': 'entropy', 'max_depth': 15, 'n_estimators': 150}
Best F1-Score for Returning Visitors: 0.5898637497752425

Random Forest - Returning Visitors Results
max_depth n_estimators criterion  accuracy  precision   recall  f1_score  roc_auc
       15          150   entropy  0.902488   0.

In [12]:
'''Step 5.4 Gradient Boosting'''
# tuning hyperparameters

def cross_validate_gb(X, y, gb_param_grid, label=''):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    
    best_score = 0
    best_params = {}
    results = []

    for max_depth in gb_param_grid['max_depth']:
        for n_estimators in gb_param_grid['n_estimators']:
            for learning_rate in gb_param_grid['learning_rate']:
                
                acc_scores, prec_scores, rec_scores, f1_scores, roc_auc_scores = [], [], [], [], []
                
                for train_idx, val_idx in skf.split(X, y):
                    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                    
                    model = GradientBoostingClassifier(
                    max_depth=max_depth,
                    n_estimators = n_estimators,
                    learning_rate = learning_rate,
                    random_state=123)
                
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_val)
                    y_proba = model.predict_proba(X_val)[:, 1]
                    
                    acc_scores.append(accuracy_score(y_val, y_pred))
                    prec_scores.append(precision_score(y_val, y_pred, zero_division=0))
                    rec_scores.append(recall_score(y_val, y_pred, zero_division=0))
                    f1_scores.append(f1_score(y_val, y_pred, zero_division=0))
                    roc_auc_scores.append(roc_auc_score(y_val, y_proba))
    
                avg_acc = np.mean(acc_scores)
                avg_prec = np.mean(prec_scores)
                avg_rec = np.mean(rec_scores)
                avg_f1 = np.mean(f1_scores)
                avg_roc_auc = np.mean(roc_auc_scores)
    
                results.append({
                'max_depth': max_depth,
                'n_estimators': n_estimators,
                'learning_rate': learning_rate,
                'accuracy': avg_acc,
                'precision': avg_prec,
                'recall': avg_rec,
                'f1_score': avg_f1,
                'roc_auc': avg_roc_auc})
    
                if avg_f1 > best_score:
                    best_score = avg_f1
                    best_params = {
                        'max_depth': max_depth,
                        'n_estimators': n_estimators,
                        'learning_rate': learning_rate}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nGradient Boosting - {label} Results")
    print((df_results).head(5))

    return best_params, df_results


gb_param_grid = {
        'max_depth': [2, 3, 5, 10],
        'n_estimators': [150,200,250,300, 350, 400], 
        'learning_rate': [0.01, 0.05, 0.1, 0.15]}
gbn_best_param, gbn_results = cross_validate_gb(X_train_new, y_train_new, gb_param_grid, 
                                                label='New Visitors')
gbr_best_param, gbr_results = cross_validate_gb(X_train_return, y_train_return, gb_param_grid, 
                                                label='Returning Visitors')


Best Params for New Visitors: {'max_depth': 5, 'n_estimators': 200, 'learning_rate': 0.15}
Best F1-Score for New Visitors: 0.8265268826973134

Gradient Boosting - New Visitors Results
    max_depth  n_estimators  learning_rate  accuracy  precision    recall  \
55          5           200           0.15  0.918081   0.883792  0.778095   
51          5           150           0.15  0.917343   0.883653  0.775154   
54          5           200           0.10  0.917343   0.884814  0.772256   
4           2           200           0.01  0.918081   0.895459  0.763389   
8           2           250           0.01  0.918081   0.895459  0.763389   

    f1_score   roc_auc  
55  0.826527  0.917674  
51  0.824630  0.920014  
54  0.823702  0.922945  
4   0.823195  0.919156  
8   0.823195  0.924730  

Best Params for Returning Visitors: {'max_depth': 3, 'n_estimators': 300, 'learning_rate': 0.05}
Best F1-Score for Returning Visitors: 0.6012272199315454

Gradient Boosting - Returning Visitors Results

In [13]:
results_map = {
    ('Logistic Regression', 'New'):   lrn_results,
    ('Logistic Regression', 'Returning'): lrr_results,
    ('Decision Tree', 'New'):         dtn_results,
    ('Decision Tree', 'Returning'):   dtr_results,
    ('Random Forest', 'New'):         rfn_results,
    ('Random Forest', 'Returning'):   rfr_results,
    ('Gradient Boosting', 'New'):     gbn_results,
    ('Gradient Boosting', 'Returning'): gbr_results,
}

rows = []
for (model, group), df_res in results_map.items():
    best = df_res.sort_values('f1_score', ascending=False).iloc[0]
    row = {
        'Model': model,
        'Group': group,
        'Accuracy': best['accuracy'],
        'Precision': best['precision'],
        'Recall': best['recall'],
        'F1-score': best['f1_score'],
        'ROC AUC': best['roc_auc']
    }

    rows.append(row)

df_best = pd.DataFrame(rows)


df_best['Model'] = pd.Categorical(df_best['Model'],
    categories=['Logistic Regression','Decision Tree','Random Forest','Gradient Boosting'], ordered=True)
df_best['Group'] = pd.Categorical(df_best['Group'], categories=['New','Returning'], ordered=True)

df_best = df_best.sort_values(['Group','Model']).reset_index(drop=True)

print(df_best)

                 Model      Group  Accuracy  Precision    Recall  F1-score  \
0  Logistic Regression        New  0.900369   0.910131  0.668657  0.770143   
1        Decision Tree        New  0.918081   0.889797  0.769359  0.824148   
2        Random Forest        New  0.921771   0.905693  0.765935  0.828442   
3    Gradient Boosting        New  0.918081   0.883792  0.778095  0.826527   
4  Logistic Regression  Returning  0.886256   0.698155  0.323963  0.442524   
5        Decision Tree  Returning  0.896801   0.668456  0.525546  0.586568   
6        Random Forest  Returning  0.902488   0.715971  0.502586  0.589864   
7    Gradient Boosting  Returning  0.899408   0.672705  0.544230  0.601227   

    ROC AUC  
0  0.907095  
1  0.892593  
2  0.924818  
3  0.917674  
4  0.883485  
5  0.916857  
6  0.924399  
7  0.924993  


In [14]:
data_map = {
    'New': {
        'X_train': X_train_new, 'y_train': y_train_new,
        'X_test':  X_test_new,  'y_test':  y_test_new,
    },
    'Returning': {
        'X_train': X_train_return, 'y_train': y_train_return,
        'X_test':  X_test_return,  'y_test':  y_test_return,
    }
}

best_params_map = {
    'LogisticRegression': {
        'New':   lrn_best_param,
        'Returning': lrr_best_param,
    },
    'DecisionTree': {
        'New':   dtn_best_param,
        'Returning': dtr_best_param,
    },
    'RandomForest': {
        'New':   rfn_best_param,
        'Returning': rfr_best_param,
    },
    'GradientBoosting': {
        'New':   gbn_best_param,
        'Returning': gbr_best_param,
    }
}

model_map = {
    'LogisticRegression': LogisticRegression,
    'DecisionTree':        DecisionTreeClassifier,
    'RandomForest':        RandomForestClassifier,
    'GradientBoosting':    GradientBoostingClassifier,
}

def make_pipeline(model_name, ModelClass, params):
    steps = []
    if model_name == 'LogisticRegression':
        steps.append(('scaler', StandardScaler()))

    steps.append(('clf', ModelClass(**params, random_state=123)))
    return Pipeline(steps)
    
def eval_on_test(model_name, X_train, y_train, X_test, y_test, ModelClass, params):
    pipe = make_pipeline(model_name, ModelClass, params)
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]
    return {
        'accuracy':   accuracy_score(y_test, y_pred),
        'precision':  precision_score(y_test, y_pred, zero_division=0),
        'recall':     recall_score(y_test, y_pred, zero_division=0),
        'f1_score':   f1_score(y_test, y_pred, zero_division=0),
        'roc_auc':    roc_auc_score(y_test, y_proba)
    }

   
records = []
for model_name, ModelClass in model_map.items():
    for group, data in data_map.items():
        params = best_params_map[model_name][group]
        metrics = eval_on_test(
            model_name,
            data['X_train'], data['y_train'],
            data['X_test'],  data['y_test'],
            ModelClass, params
        )
        row = {'Model': model_name, 'Group': group}
        row.update(metrics)
        records.append(row)

df_eval = pd.DataFrame(records)
df_eval['Model'] = pd.Categorical(df_eval['Model'], 
    categories=['LogisticRegression','DecisionTree','RandomForest','GradientBoosting'], ordered=True)
df_eval['Group'] = pd.Categorical(df_eval['Group'], categories=['New','Returning'], ordered=True)
df_eval = df_eval.sort_values(['Group','Model']).reset_index(drop=True)

print(df_eval)



                Model      Group  accuracy  precision    recall  f1_score  \
0  LogisticRegression        New  0.890855   0.861538  0.666667  0.751678   
1        DecisionTree        New  0.911504   0.846154  0.785714  0.814815   
2        RandomForest        New  0.911504   0.855263  0.773810  0.812500   
3    GradientBoosting        New  0.896755   0.795181  0.785714  0.790419   
4  LogisticRegression  Returning  0.882520   0.666667  0.312925  0.425926   
5        DecisionTree  Returning  0.897679   0.685714  0.489796  0.571429   
6        RandomForest  Returning  0.906679   0.721461  0.537415  0.615984   
7    GradientBoosting  Returning  0.909048   0.705645  0.595238  0.645756   

    roc_auc  
0  0.898553  
1  0.894398  
2  0.924136  
3  0.911345  
4  0.896243  
5  0.925505  
6  0.935612  
7  0.937718  
