In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import shap
import numpy as np

In [2]:
'''Load dataset and Preprocessing'''
df = pd.read_csv("online_shoppers_intention.csv")

# remove visitortype is other
df = df[df["VisitorType"] != "Other"]

# transform visitortype, weekend and revenue into numerical
df["VisitorType"] = df["VisitorType"].map({"New_Visitor": 0, "Returning_Visitor": 1})
df["Weekend"] = df["Weekend"].map({False: 0, True: 1})
df["Revenue"] = df["Revenue"].map({False: 0, True: 1})

# transform month into numerical by one-hot encoding
df = pd.get_dummies(df, columns=['Month'], prefix='Month')

# splitting visitortype into new and returning groups
new_visitors = df[df["VisitorType"] == 0]
returning_visitors = df[df["VisitorType"] == 1]

In [3]:
# get X, y from new / returning visitor, and deleting visitortype
X_new = new_visitors.drop(columns=['Revenue', 'VisitorType'])
y_new = new_visitors['Revenue']

X_return = returning_visitors.drop(columns=['Revenue', 'VisitorType'])
y_return = returning_visitors['Revenue']

# Columns needed to be standardized
numerical_cols = [
    'Administrative', 'Administrative_Duration', 'Informational',
    'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

# train-test split for new/ returning visitors
X_train_newf, X_test_new, y_train_newf, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, random_state=123, stratify=y_new)

X_train_returnf, X_test_return, y_train_returnf, y_test_return = train_test_split(
    X_return, y_return, test_size=0.2, random_state=123, stratify=y_return)

# validation split for tuning parameters
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_newf, y_train_newf, test_size=0.2, random_state=123, stratify=y_train_newf)

X_train_return, X_val_return, y_train_return, y_val_return = train_test_split(
    X_train_returnf, y_train_returnf, test_size=0.2, random_state=123, stratify=y_train_returnf)

In [4]:
scaler = StandardScaler()

X_train_new_scaled = X_train_new.copy()
X_val_new_scaled = X_val_new.copy()
X_test_new_scaled = X_test_new.copy()

X_train_return_scaled = X_train_return.copy()
X_val_return_scaled = X_val_return.copy()
X_test_return_scaled = X_test_return.copy()

X_train_new_scaled[numerical_cols] = scaler.fit_transform(X_train_new[numerical_cols])
X_val_new_scaled[numerical_cols] = scaler.transform(X_val_new[numerical_cols])
X_test_new_scaled[numerical_cols] = scaler.transform(X_test_new[numerical_cols])

X_train_return_scaled[numerical_cols] = scaler.fit_transform(X_train_return[numerical_cols])
X_val_return_scaled[numerical_cols] = scaler.transform(X_val_return[numerical_cols])
X_test_return_scaled[numerical_cols] = scaler.transform(X_test_return[numerical_cols])


In [5]:
'''Step 1 Logistic Regression'''
# tuning hyperparameters
# checked the best for new visitor is C = 0.1, penalty = L1, solver = saga
# checked the best for returning visitor is C = 0.001, penalty = L1, solver = saga

def tune_tts_lr(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10], 
        'penalty': ['l1', 'l2'], 
        'solver': ['saga', 'liblinear']}

    best_score = 0
    best_params = {}
    results = []

    for C in param_grid['C']:
        for penalty in param_grid['penalty']:
            for solver in param_grid['solver']:
                model = LogisticRegression(
                    C=C,
                    penalty=penalty,
                    solver=solver,
                    max_iter=5000,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred, zero_division=0)
                rec = recall_score(y_val, y_pred, zero_division=0)
                f1 = f1_score(y_val, y_pred, zero_division=0)
                roc_auc = roc_auc_score(y_val, y_proba)
    
                results.append({
                    'C': C,
                    'penalty': penalty,
                    'solver': solver,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'C': C,
                        'penalty': penalty,
                        'solver': solver}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nLogistic Regression - {label} Results")
    print(df_results)
    return best_params, df_results



lrn_best_param, lrn_results = tune_tts_lr(
    X_train_new_scaled,
    y_train_new,
    X_val_new_scaled,
    y_val_new,
    label='New Visitors')

lrr_best_param, lrr_results = tune_tts_lr(
    X_train_return_scaled,
    y_train_return,
    X_val_return_scaled,
    y_val_return,
    label='Returning Visitors')



Best Params for New Visitors: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best F1-Score for New Visitors: 0.8188976377952756

Logistic Regression - New Visitors Results
         C penalty     solver  accuracy  precision    recall  f1_score  \
15   1.000      l2  liblinear  0.915129   0.881356  0.764706  0.818898   
14   1.000      l2       saga  0.915129   0.881356  0.764706  0.818898   
18  10.000      l2       saga  0.911439   0.866667  0.764706  0.812500   
16  10.000      l1       saga  0.911439   0.866667  0.764706  0.812500   
13   1.000      l1  liblinear  0.911439   0.866667  0.764706  0.812500   
12   1.000      l1       saga  0.911439   0.866667  0.764706  0.812500   
11   0.100      l2  liblinear  0.911439   0.892857  0.735294  0.806452   
19  10.000      l2  liblinear  0.907749   0.864407  0.750000  0.803150   
17  10.000      l1  liblinear  0.907749   0.864407  0.750000  0.803150   
10   0.100      l2       saga  0.907749   0.890909  0.720588  0.796748   
9    0.100      

In [6]:
'''Step 2 Decision Tree'''
# tuning hyperparameters

def tune_decision_tree(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'criterion': ['gini', 'entropy']}

    best_score = 0
    best_params = {}
    results = []

    for max_depth in param_grid['max_depth']:
        for criterion in param_grid['criterion']:
            model = DecisionTreeClassifier(
                max_depth=max_depth,
                criterion=criterion,
                random_state=123)
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            y_proba = model.predict_proba(X_val)[:, 1]

            acc = accuracy_score(y_val, y_pred)
            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            roc_auc = roc_auc_score(y_val, y_proba)

            results.append({
                'max_depth': max_depth,
                'criterion': criterion,
                'accuracy': acc,
                'precision': prec,
                'recall': rec,
                'f1_score': f1,
                'roc_auc': roc_auc})

            if f1 > best_score:
                best_score = f1
                best_params = {
                    'max_depth': max_depth,
                    'criterion': criterion}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nDecision Tree - {label} Results")
    print(df_results)

    return best_params, df_results

dtn_best_param, dtn_results = tune_decision_tree(
    X_train_new,
    y_train_new,
    X_val_new,
    y_val_new,
    label='New Visitors')

dtr_best_param, dtr_results = tune_decision_tree(
    X_train_return,
    y_train_return,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'max_depth': 5, 'criterion': 'gini'}
Best F1-Score for New Visitors: 0.8507462686567164

Decision Tree - New Visitors Results
    max_depth criterion  accuracy  precision    recall  f1_score   roc_auc
6           5      gini  0.926199   0.863636  0.838235  0.850746  0.883295
0           2      gini  0.918819   0.838235  0.838235  0.838235  0.906223
3           3   entropy  0.918819   0.838235  0.838235  0.838235  0.925167
1           2   entropy  0.918819   0.838235  0.838235  0.838235  0.911511
16         10      gini  0.911439   0.814286  0.838235  0.826087  0.880035
12          8      gini  0.911439   0.823529  0.823529  0.823529  0.857867
14          9      gini  0.904059   0.791667  0.838235  0.814286  0.873805
2           3      gini  0.904059   0.818182  0.794118  0.805970  0.913322
15          9   entropy  0.904059   0.818182  0.794118  0.805970  0.872139
8           6      gini  0.900369   0.788732  0.823529  0.805755  0.855006
10          7    

In [7]:
'''Step 3 Random Forest'''
# tuning hyperparameters

def tune_random_forest(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [5, 10, 15, 20, 25],
        'n_estimators': [150, 200, 250, 300, 350, 400],
        'criterion': ['gini', 'entropy']}

    best_score = 0
    best_params = {}
    results = []

    for n in param_grid['n_estimators']:
        for depth in param_grid['max_depth']:
            for criterion in param_grid['criterion']:
                model = RandomForestClassifier(
                    n_estimators=n,
                    max_depth=depth,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred, zero_division=0)
                rec = recall_score(y_val, y_pred, zero_division=0)
                f1 = f1_score(y_val, y_pred, zero_division=0)
                roc_auc = roc_auc_score(y_val, y_proba)
    
                results.append({
                    'n_estimators': n,
                    'max_depth': depth,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'n_estimators': n,
                        'max_depth': depth,
                        'criterion': criterion}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nRandom Forest - {label} Results")
    print(df_results)

    return best_params, df_results


rfn_best_param, rfn_results = tune_random_forest(
    X_train_new,
    y_train_new,
    X_val_new,
    y_val_new,
    label='New Visitors')

rfr_best_param, rfr_results = tune_random_forest(
    X_train_return,
    y_train_return,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'n_estimators': 150, 'max_depth': 15, 'criterion': 'gini'}
Best F1-Score for New Visitors: 0.8444444444444444

Random Forest - New Visitors Results
    n_estimators  max_depth  accuracy  precision    recall  f1_score   roc_auc
18           200         25  0.922509   0.850746  0.838235  0.844444  0.934041
19           200         25  0.922509   0.850746  0.838235  0.844444  0.934041
4            150         15  0.922509   0.850746  0.838235  0.844444  0.939583
5            150         15  0.922509   0.850746  0.838235  0.844444  0.939583
6            150         20  0.922509   0.850746  0.838235  0.844444  0.934222
7            150         20  0.922509   0.850746  0.838235  0.844444  0.934222
8            150         25  0.922509   0.850746  0.838235  0.844444  0.933063
9            150         25  0.922509   0.850746  0.838235  0.844444  0.933063
17           200         20  0.922509   0.850746  0.838235  0.844444  0.933968
16           200         20  0

In [8]:
'''Step 4 Gradient Boosting'''
# tuning hyperparameters

def tune_gradient_boosting(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [2, 3, 5, 10],
        'n_estimators': [150,200,250,300, 350, 400], 
        'learning_rate': [0.01, 0.05, 0.1, 0.15]} 
    
    best_score = 0
    best_params = {}
    results = []

    for n in param_grid['n_estimators']:
        for lr in param_grid['learning_rate']:
            for depth in param_grid['max_depth']:
                model = GradientBoostingClassifier(
                    n_estimators=n,
                    learning_rate=lr,
                    max_depth=depth,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred, zero_division=0)
                rec = recall_score(y_val, y_pred, zero_division=0)
                f1 = f1_score(y_val, y_pred, zero_division=0)
                roc_auc = roc_auc_score(y_val, y_proba)

                results.append({
                    'n_estimators': n,
                    'learning_rate': lr,
                    'max_depth': depth,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'n_estimators': n,
                        'learning_rate': lr,
                        'max_depth': depth,}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nGradient Boosting - {label} Results")
    print(df_results)

    return best_params, df_results

gbn_best_param, gbn_results = tune_gradient_boosting(
    X_train_new,
    y_train_new,
    X_val_new,
    y_val_new,
    label='New Visitors')

gbr_best_param, gbr_results = tune_gradient_boosting(
    X_train_return,
    y_train_return,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'n_estimators': 400, 'learning_rate': 0.1, 'max_depth': 5}
Best F1-Score for New Visitors: 0.851063829787234

Gradient Boosting - New Visitors Results
    n_estimators  learning_rate  max_depth  accuracy  precision    recall  \
90           400           0.10          5  0.922509   0.821918  0.882353   
20           200           0.05          2  0.926199   0.863636  0.838235   
86           400           0.05          5  0.922509   0.830986  0.867647   
37           250           0.05          3  0.922509   0.840580  0.852941   
21           200           0.05          3  0.922509   0.840580  0.852941   
..           ...            ...        ...       ...        ...       ...   
76           350           0.15          2  0.904059   0.800000  0.823529   
1            150           0.01          3  0.907749   0.841270  0.779412   
91           400           0.10         10  0.900369   0.788732  0.823529   
75           350           0.10         10  0.9

In [9]:
results_map = {
    ('Logistic Regression', 'New'):   lrn_results,
    ('Logistic Regression', 'Returning'): lrr_results,
    ('Decision Tree', 'New'):         dtn_results,
    ('Decision Tree', 'Returning'):   dtr_results,
    ('Random Forest', 'New'):         rfn_results,
    ('Random Forest', 'Returning'):   rfr_results,
    ('Gradient Boosting', 'New'):     gbn_results,
    ('Gradient Boosting', 'Returning'): gbr_results,
}

rows = []
for (model, group), df_res in results_map.items():
    best = df_res.sort_values('f1_score', ascending=False).iloc[0]
    row = {
        'Model': model,
        'Group': group,
        'Accuracy': best['accuracy'],
        'Precision': best['precision'],
        'Recall': best['recall'],
        'F1-score': best['f1_score'],
        'ROC AUC': best['roc_auc']
    }

    rows.append(row)

df_best = pd.DataFrame(rows)


df_best['Model'] = pd.Categorical(df_best['Model'],
    categories=['Logistic Regression','Decision Tree','Random Forest','Gradient Boosting'], ordered=True)
df_best['Group'] = pd.Categorical(df_best['Group'], categories=['New','Returning'], ordered=True)

df_best = df_best.sort_values(['Group','Model']).reset_index(drop=True)

print(df_best)


                 Model      Group  Accuracy  Precision    Recall  F1-score  \
0  Logistic Regression        New  0.915129   0.881356  0.764706  0.818898   
1        Decision Tree        New  0.926199   0.863636  0.838235  0.850746   
2        Random Forest        New  0.922509   0.850746  0.838235  0.844444   
3    Gradient Boosting        New  0.922509   0.821918  0.882353  0.851064   
4  Logistic Regression  Returning  0.884479   0.722222  0.276596  0.400000   
5        Decision Tree  Returning  0.889218   0.622449  0.519149  0.566125   
6        Random Forest  Returning  0.896327   0.717391  0.421277  0.530831   
7    Gradient Boosting  Returning  0.898104   0.693252  0.480851  0.567839   

    ROC AUC  
0  0.919371  
1  0.883295  
2  0.934041  
3  0.945378  
4  0.884790  
5  0.904646  
6  0.922174  
7  0.920615  


In [33]:
data_map = {
    'New': {
        'X_train': X_train_newf, 'y_train': y_train_newf,
        'X_test':  X_test_new,  'y_test':  y_test_new,
    },
    'Returning': {
        'X_train': X_train_returnf, 'y_train': y_train_returnf,
        'X_test':  X_test_return,  'y_test':  y_test_return,
    }
}

best_params_map = {
    'LogisticRegression': {
        'New':   lrn_best_param,
        'Returning': lrr_best_param,
    },
    'DecisionTree': {
        'New':   dtn_best_param,
        'Returning': dtr_best_param,
    },
    'RandomForest': {
        'New':   rfn_best_param,
        'Returning': rfr_best_param,
    },
    'GradientBoosting': {
        'New':   gbn_best_param,
        'Returning': gbr_best_param,
    }
}

model_map = {
    'LogisticRegression': LogisticRegression,
    'DecisionTree':        DecisionTreeClassifier,
    'RandomForest':        RandomForestClassifier,
    'GradientBoosting':    GradientBoostingClassifier,
}

def make_pipeline(model_name, ModelClass, params):
    steps = []
    if model_name == 'LogisticRegression':
        steps.append(('scaler', StandardScaler()))

    steps.append(('clf', ModelClass(**params, random_state=123)))
    return Pipeline(steps)
    
def eval_on_test(model_name, X_train, y_train, X_test, y_test, ModelClass, params):
    if model_name == 'LogisticRegression':
        params['max_iter'] = 30000
    pipe = make_pipeline(model_name, ModelClass, params)
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    return {
        'accuracy':   accuracy_score(y_test, y_pred),
        'precision':  precision_score(y_test, y_pred, zero_division=0),
        'recall':     recall_score(y_test, y_pred, zero_division=0),
        'f1_score':   f1_score(y_test, y_pred, zero_division=0),
        'roc_auc':    roc_auc_score(y_test, y_proba)
    }

   
records = []
for model_name, ModelClass in model_map.items():
    for group, data in data_map.items():
        params = best_params_map[model_name][group]
        metrics = eval_on_test(
            model_name,
            data['X_train'], data['y_train'],
            data['X_test'],  data['y_test'],
            ModelClass, params
        )
        row = {'Model': model_name, 'Group': group}
        row.update(metrics)
        # row.update(params)
        records.append(row)

df_eval = pd.DataFrame(records)
df_eval['Model'] = pd.Categorical(df_eval['Model'], 
    categories=['LogisticRegression','DecisionTree','RandomForest','GradientBoosting'], ordered=True)
df_eval['Group'] = pd.Categorical(df_eval['Group'], categories=['New','Returning'], ordered=True)
df_eval = df_eval.sort_values(['Group','Model']).reset_index(drop=True)

print(df_eval)

                Model      Group  accuracy  precision    recall  f1_score  \
0  LogisticRegression        New  0.884956   0.835821  0.666667  0.741722   
1        DecisionTree        New  0.884956   0.800000  0.714286  0.754717   
2        RandomForest        New  0.908555   0.853333  0.761905  0.805031   
3    GradientBoosting        New  0.884956   0.792208  0.726190  0.757764   
4  LogisticRegression  Returning  0.880152   0.651852  0.299320  0.410256   
5        DecisionTree  Returning  0.897679   0.685714  0.489796  0.571429   
6        RandomForest  Returning  0.904311   0.707207  0.534014  0.608527   
7    GradientBoosting  Returning  0.908100   0.708333  0.578231  0.636704   

    roc_auc  
0  0.897946  
1  0.891783  
2  0.920168  
3  0.904482  
4  0.898128  
5  0.925505  
6  0.933128  
7  0.937878  
