In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import shap
import numpy as np

In [2]:
'''Load dataset and Preprocessing'''
df = pd.read_csv("online_shoppers_intention.csv")

# remove visitortype is other
df = df[df["VisitorType"] != "Other"]

# transform visitortype, weekend and revenue into numerical
df["VisitorType"] = df["VisitorType"].map({"New_Visitor": 0, "Returning_Visitor": 1})
df["Weekend"] = df["Weekend"].map({False: 0, True: 1})
df["Revenue"] = df["Revenue"].map({False: 0, True: 1})

# transform month into numerical by one-hot encoding
df = pd.get_dummies(df, columns=['Month'], prefix='Month')

# splitting visitortype into new and returning groups
new_visitors = df[df["VisitorType"] == 0]
returning_visitors = df[df["VisitorType"] == 1]

In [3]:
# get X, y from new / returning visitor, and deleting visitortype
X_new = new_visitors.drop(columns=['Revenue', 'VisitorType'])
y_new = new_visitors['Revenue']

X_return = returning_visitors.drop(columns=['Revenue', 'VisitorType'])
y_return = returning_visitors['Revenue']

# Columns needed to be standardized
numerical_cols = [
    'Administrative', 'Administrative_Duration', 'Informational',
    'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

# train-test split for new/ returning visitors
X_train_newf, X_test_new, y_train_newf, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, random_state=123, stratify=y_new)

X_train_returnf, X_test_return, y_train_returnf, y_test_return = train_test_split(
    X_return, y_return, test_size=0.2, random_state=123, stratify=y_return)

# validation split for tuning parameters
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_newf, y_train_newf, test_size=0.2, random_state=123, stratify=y_train_newf)

X_train_return, X_val_return, y_train_return, y_val_return = train_test_split(
    X_train_returnf, y_train_returnf, test_size=0.2, random_state=123, stratify=y_train_returnf)

In [4]:
scaler = StandardScaler()

X_train_new_scaled = X_train_new.copy()
X_val_new_scaled = X_val_new.copy()
X_test_new_scaled = X_test_new.copy()

X_train_return_scaled = X_train_return.copy()
X_val_return_scaled = X_val_return.copy()
X_test_return_scaled = X_test_return.copy()

X_train_new_scaled[numerical_cols] = scaler.fit_transform(X_train_new[numerical_cols])
X_val_new_scaled[numerical_cols] = scaler.transform(X_val_new[numerical_cols])
X_test_new_scaled[numerical_cols] = scaler.transform(X_test_new[numerical_cols])

X_train_return_scaled[numerical_cols] = scaler.fit_transform(X_train_return[numerical_cols])
X_val_return_scaled[numerical_cols] = scaler.transform(X_val_return[numerical_cols])
X_test_return_scaled[numerical_cols] = scaler.transform(X_test_return[numerical_cols])

smote = SMOTE(random_state=123)
X_train_new_resampled, y_train_new_resampled = smote.fit_resample(X_train_new_scaled, y_train_new)
X_train_return_resampled, y_train_return_resampled = smote.fit_resample(X_train_return_scaled, y_train_return)

smote = SMOTE(random_state=123)
X_train_new_resampled_unscaled, y_train_new_resampled_unscaled = smote.fit_resample(X_train_new, y_train_new)
X_train_return_resampled_unscaled, y_train_return_resampled_unscaled = smote.fit_resample(
    X_train_return, y_train_return)


In [29]:
'''Step 1 Logistic Regression'''
# tuning hyperparameters

def tune_tts_lr(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10], 
        'penalty': ['l1', 'l2'], 
        'solver': ['saga', 'liblinear']}

    best_score = 0
    best_params = {}
    results = []

    for C in param_grid['C']:
        for penalty in param_grid['penalty']:
            for solver in param_grid['solver']:
                model = LogisticRegression(
                    C=C,
                    penalty=penalty,
                    solver=solver,
                    max_iter=5000,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred, zero_division=0)
                rec = recall_score(y_val, y_pred, zero_division=0)
                f1 = f1_score(y_val, y_pred, zero_division=0)
                roc_auc = roc_auc_score(y_val, y_proba)
    
                results.append({
                    'C': C,
                    'penalty': penalty,
                    'solver': solver,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'C': C,
                        'penalty': penalty,
                        'solver': solver}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nLogistic Regression - {label} Results")
    print(df_results)
    return best_params, df_results



lrn_best_param, lrn_results = tune_tts_lr(
    X_train_new_resampled,
    y_train_new_resampled,
    X_val_new_scaled,
    y_val_new,
    label='New Visitors')

lrr_best_param, lrr_results = tune_tts_lr(
    X_train_return_resampled,
    y_train_return_resampled,
    X_val_return_scaled,
    y_val_return,
    label='Returning Visitors')



Best Params for New Visitors: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best F1-Score for New Visitors: 0.8208955223880597

Logistic Regression - New Visitors Results
         C penalty     solver  accuracy  precision    recall  f1_score  \
8    0.100      l1       saga  0.911439   0.833333  0.808824  0.820896   
9    0.100      l1  liblinear  0.911439   0.833333  0.808824  0.820896   
12   1.000      l1       saga  0.904059   0.828125  0.779412  0.803030   
15   1.000      l2  liblinear  0.904059   0.828125  0.779412  0.803030   
14   1.000      l2       saga  0.904059   0.828125  0.779412  0.803030   
5    0.010      l1  liblinear  0.904059   0.828125  0.779412  0.803030   
13   1.000      l1  liblinear  0.904059   0.828125  0.779412  0.803030   
6    0.010      l2       saga  0.904059   0.838710  0.764706  0.800000   
7    0.010      l2  liblinear  0.904059   0.838710  0.764706  0.800000   
10   0.100      l2       saga  0.900369   0.825397  0.764706  0.793893   
18  10.000    

In [6]:
'''Step 2 Decision Tree'''
# tuning hyperparameters

def tune_decision_tree(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'criterion': ['gini', 'entropy']}

    best_score = 0
    best_params = {}
    results = []

    for max_depth in param_grid['max_depth']:
        for criterion in param_grid['criterion']:
            model = DecisionTreeClassifier(
                max_depth=max_depth,
                criterion=criterion,
                random_state=123)
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            y_proba = model.predict_proba(X_val)[:, 1]

            acc = accuracy_score(y_val, y_pred)
            prec = precision_score(y_val, y_pred)
            rec = recall_score(y_val, y_pred)
            f1 = f1_score(y_val, y_pred)
            roc_auc = roc_auc_score(y_val, y_proba)

            results.append({
                'max_depth': max_depth,
                'criterion': criterion,
                'accuracy': acc,
                'precision': prec,
                'recall': rec,
                'f1_score': f1,
                'roc_auc': roc_auc})

            if f1 > best_score:
                best_score = f1
                best_params = {
                    'max_depth': max_depth,
                    'criterion': criterion}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nDecision Tree - {label} Results")
    print(df_results)

    return best_params, df_results

dtn_best_param, dtn_results = tune_decision_tree(
    X_train_new_resampled_unscaled,
    y_train_new_resampled_unscaled,
    X_val_new,
    y_val_new,
    label='New Visitors')

dtr_best_param, dtr_results = tune_decision_tree(
    X_train_return_resampled_unscaled,
    y_train_return_resampled_unscaled,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'max_depth': 2, 'criterion': 'gini'}
Best F1-Score for New Visitors: 0.8321167883211679

Decision Tree - New Visitors Results
    max_depth criterion  accuracy  precision    recall  f1_score   roc_auc
0           2      gini  0.915129   0.826087  0.838235  0.832117  0.904376
1           2   entropy  0.915129   0.826087  0.838235  0.832117  0.905571
13          8   entropy  0.911439   0.805556  0.852941  0.828571  0.917560
11          7   entropy  0.911439   0.805556  0.852941  0.828571  0.915278
5           4   entropy  0.911439   0.814286  0.838235  0.826087  0.939474
9           6   entropy  0.911439   0.814286  0.838235  0.826087  0.915858
7           5   entropy  0.900369   0.773333  0.852941  0.811189  0.939221
4           4      gini  0.900369   0.780822  0.838235  0.808511  0.894342
15          9   entropy  0.896679   0.763158  0.852941  0.805556  0.913141
17         10   entropy  0.896679   0.763158  0.852941  0.805556  0.907925
8           6    

In [7]:
'''Step 3 Random Forest'''
# tuning hyperparameters

def tune_random_forest(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [5, 10, 15, 20, 25],
        'n_estimators': [150, 200, 250, 300, 350, 400],
        'criterion': ['gini', 'entropy']}

    best_score = 0
    best_params = {}
    results = []

    for n in param_grid['n_estimators']:
        for depth in param_grid['max_depth']:
            for criterion in param_grid['criterion']:
                model = RandomForestClassifier(
                    n_estimators=n,
                    max_depth=depth,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred)
                rec = recall_score(y_val, y_pred)
                f1 = f1_score(y_val, y_pred)
                roc_auc = roc_auc_score(y_val, y_proba)
    
                results.append({
                    'n_estimators': n,
                    'max_depth': depth,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'n_estimators': n,
                        'max_depth': depth,
                        'criterion': criterion}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nRandom Forest - {label} Results")
    print(df_results)

    return best_params, df_results


rfn_best_param, rfn_results = tune_random_forest(
    X_train_new_resampled_unscaled,
    y_train_new_resampled_unscaled,
    X_val_new,
    y_val_new,
    label='New Visitors')

rfr_best_param, rfr_results = tune_random_forest(
    X_train_return_resampled_unscaled,
    y_train_return_resampled_unscaled,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'n_estimators': 150, 'max_depth': 5, 'criterion': 'gini'}
Best F1-Score for New Visitors: 0.8405797101449275

Random Forest - New Visitors Results
    n_estimators  max_depth  accuracy  precision    recall  f1_score   roc_auc
0            150          5  0.918819   0.828571  0.852941  0.840580  0.947117
32           300         10  0.918819   0.828571  0.852941  0.840580  0.946103
22           250         10  0.918819   0.828571  0.852941  0.840580  0.945016
23           250         10  0.918819   0.828571  0.852941  0.840580  0.945016
24           250         15  0.918819   0.828571  0.852941  0.840580  0.937337
25           250         15  0.918819   0.828571  0.852941  0.840580  0.937337
1            150          5  0.918819   0.828571  0.852941  0.840580  0.947117
31           300          5  0.918819   0.828571  0.852941  0.840580  0.945161
33           300         10  0.918819   0.828571  0.852941  0.840580  0.946103
20           250          5  0.

In [8]:
'''Step 4 Gradient Boosting'''
# tuning hyperparameters

def tune_gradient_boosting(X_train, y_train, X_val, y_val, label=''):
    param_grid = {
        'max_depth': [2, 3, 5, 10],
        'n_estimators': [150,200,250,300, 350, 400], 
        'learning_rate': [0.01, 0.05, 0.1, 0.15]} 
    
    best_score = 0
    best_params = {}
    results = []

    for n in param_grid['n_estimators']:
        for lr in param_grid['learning_rate']:
            for depth in param_grid['max_depth']:
                model = GradientBoostingClassifier(
                    n_estimators=n,
                    learning_rate=lr,
                    max_depth=depth,
                    random_state=123)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_proba = model.predict_proba(X_val)[:, 1]
    
                acc = accuracy_score(y_val, y_pred)
                prec = precision_score(y_val, y_pred)
                rec = recall_score(y_val, y_pred)
                f1 = f1_score(y_val, y_pred)
                roc_auc = roc_auc_score(y_val, y_proba)

                results.append({
                    'n_estimators': n,
                    'learning_rate': lr,
                    'max_depth': depth,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1_score': f1,
                    'roc_auc': roc_auc})
    
                if f1 > best_score:
                    best_score = f1
                    best_params = {
                        'n_estimators': n,
                        'learning_rate': lr,
                        'max_depth': depth,}

    df_results = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
    print(f"\nBest Params for {label}:", best_params)
    print(f"Best F1-Score for {label}:", best_score)
    print(f"\nGradient Boosting - {label} Results")
    print(df_results)

    return best_params, df_results

gbn_best_param, gbn_results = tune_gradient_boosting(
    X_train_new_resampled_unscaled,
    y_train_new_resampled_unscaled,
    X_val_new,
    y_val_new,
    label='New Visitors')

gbr_best_param, gbr_results = tune_gradient_boosting(
    X_train_return_resampled_unscaled,
    y_train_return_resampled_unscaled,
    X_val_return,
    y_val_return,
    label='Returning Visitors')


Best Params for New Visitors: {'n_estimators': 250, 'learning_rate': 0.15, 'max_depth': 5}
Best F1-Score for New Visitors: 0.8450704225352113

Gradient Boosting - New Visitors Results
    n_estimators  learning_rate  max_depth  accuracy  precision    recall  \
46           250           0.15          5  0.918819   0.810811  0.882353   
74           350           0.10          5  0.918819   0.819444  0.867647   
82           400           0.01          5  0.918819   0.828571  0.852941   
80           400           0.01          2  0.918819   0.828571  0.852941   
66           350           0.01          5  0.918819   0.828571  0.852941   
..           ...            ...        ...       ...        ...       ...   
19           200           0.01         10  0.885609   0.740260  0.838235   
11           150           0.10         10  0.889299   0.763889  0.808824   
87           400           0.05         10  0.889299   0.763889  0.808824   
15           150           0.15         10  0

In [9]:
results_map = {
    ('Logistic Regression', 'New'):   lrn_results,
    ('Logistic Regression', 'Returning'): lrr_results,
    ('Decision Tree', 'New'):         dtn_results,
    ('Decision Tree', 'Returning'):   dtr_results,
    ('Random Forest', 'New'):         rfn_results,
    ('Random Forest', 'Returning'):   rfr_results,
    ('Gradient Boosting', 'New'):     gbn_results,
    ('Gradient Boosting', 'Returning'): gbr_results,
}

rows = []
for (model, group), df_res in results_map.items():
    best = df_res.sort_values('f1_score', ascending=False).iloc[0]
    row = {
        'Model': model,
        'Group': group,
        'Accuracy': best['accuracy'],
        'Precision': best['precision'],
        'Recall': best['recall'],
        'F1-score': best['f1_score'],
        'ROC AUC': best['roc_auc']
    }

    rows.append(row)

df_best = pd.DataFrame(rows)


df_best['Model'] = pd.Categorical(df_best['Model'],
    categories=['Logistic Regression','Decision Tree','Random Forest','Gradient Boosting'], ordered=True)
df_best['Group'] = pd.Categorical(df_best['Group'], categories=['New','Returning'], ordered=True)

df_best = df_best.sort_values(['Group','Model']).reset_index(drop=True)

print(df_best)


                 Model      Group  Accuracy  Precision    Recall  F1-score  \
0  Logistic Regression        New  0.911439   0.833333  0.808824  0.820896   
1        Decision Tree        New  0.915129   0.826087  0.838235  0.832117   
2        Random Forest        New  0.918819   0.828571  0.852941  0.840580   
3    Gradient Boosting        New  0.918819   0.810811  0.882353  0.845070   
4  Logistic Regression  Returning  0.877370   0.548276  0.676596  0.605714   
5        Decision Tree  Returning  0.888033   0.583942  0.680851  0.628684   
6        Random Forest  Returning  0.883886   0.563934  0.731915  0.637037   
7    Gradient Boosting  Returning  0.881517   0.555556  0.744681  0.636364   

    ROC AUC  
0  0.923428  
1  0.904376  
2  0.947117  
3  0.941394  
4  0.877835  
5  0.881122  
6  0.921884  
7  0.919956  


In [39]:
data_map = {
    'New': {
        'X_train': X_train_newf, 'y_train': y_train_newf,
        'X_test':  X_test_new,  'y_test':  y_test_new,
    },
    'Returning': {
        'X_train': X_train_returnf, 'y_train': y_train_returnf,
        'X_test':  X_test_return,  'y_test':  y_test_return,
    }
}

best_params_map = {
    'LogisticRegression': {
        'New':   lrn_best_param,
        'Returning': lrr_best_param,
    },
    'DecisionTree': {
        'New':   dtn_best_param,
        'Returning': dtr_best_param,
    },
    'RandomForest': {
        'New':   rfn_best_param,
        'Returning': rfr_best_param,
    },
    'GradientBoosting': {
        'New':   gbn_best_param,
        'Returning': gbr_best_param,
    }
}

model_map = {
    'LogisticRegression': LogisticRegression,
    'DecisionTree':        DecisionTreeClassifier,
    'RandomForest':        RandomForestClassifier,
    'GradientBoosting':    GradientBoostingClassifier,
}

def make_pipeline(model_name, ModelClass, params):
    steps = []
    if model_name == 'LogisticRegression':
        steps.append(('scaler', StandardScaler()))

    steps.append(('clf', ModelClass(**params, random_state=123)))
    return Pipeline(steps)
    
def eval_on_test(model_name, X_train, y_train, X_test, y_test, ModelClass, params, threshold=0.5):
    if model_name == 'LogisticRegression':
        params['max_iter'] = 30000
    pipe = make_pipeline(model_name, ModelClass, params)
    pipe.fit(X_train, y_train)

    y_proba = pipe.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > threshold).astype(int)
    return {
        'accuracy':   accuracy_score(y_test, y_pred),
        'precision':  precision_score(y_test, y_pred, zero_division=0),
        'recall':     recall_score(y_test, y_pred, zero_division=0),
        'f1_score':   f1_score(y_test, y_pred, zero_division=0),
        'roc_auc':    roc_auc_score(y_test, y_proba)
    }

   
records = []
for model_name, ModelClass in model_map.items():
    for group, data in data_map.items():
        params = best_params_map[model_name][group]
        if model_name == 'LogisticRegression':
            metrics = eval_on_test(
                model_name, data['X_train'], data['y_train'],
                data['X_test'],  data['y_test'],
                ModelClass, params,
                threshold=0.3)
        else:
            metrics = eval_on_test(
                model_name,
                data['X_train'], data['y_train'],
                data['X_test'],  data['y_test'],
                ModelClass, params)
        # 把參數內容也攤平放進 row 裡
        row = {'Model': model_name, 'Group': group}
        row.update(metrics)
        # row.update(params)
        records.append(row)

df_eval = pd.DataFrame(records)
df_eval['Model'] = pd.Categorical(df_eval['Model'], 
    categories=['LogisticRegression','DecisionTree','RandomForest','GradientBoosting'], ordered=True)
df_eval['Group'] = pd.Categorical(df_eval['Group'], categories=['New','Returning'], ordered=True)
df_eval = df_eval.sort_values(['Group','Model']).reset_index(drop=True)

print(df_eval)

                Model      Group  accuracy  precision    recall  f1_score  \
0  LogisticRegression        New  0.902655   0.849315  0.738095  0.789809   
1        DecisionTree        New  0.911504   0.846154  0.785714  0.814815   
2        RandomForest        New  0.905605   0.842105  0.761905  0.800000   
3    GradientBoosting        New  0.896755   0.795181  0.785714  0.790419   
4  LogisticRegression  Returning  0.893415   0.692737  0.421769  0.524313   
5        DecisionTree  Returning  0.897679   0.659836  0.547619  0.598513   
6        RandomForest  Returning  0.906679   0.725581  0.530612  0.612967   
7    GradientBoosting  Returning  0.908574   0.702811  0.595238  0.644567   

    roc_auc  
0  0.902194  
1  0.912255  
2  0.920121  
3  0.913492  
4  0.872456  
5  0.862289  
6  0.935895  
7  0.934933  
