In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report




In [4]:
# Load the raw online-shoppers CSV into `df`.
# The location can be overridden through the SHOPPERS_CSV environment variable so
# the notebook is not tied to a single machine; when the variable is unset, the
# original absolute path is used unchanged.
import os

DATA_PATH = os.environ.get(
    "SHOPPERS_CSV",
    '/Users/jkr/Desktop/FutureGoal/pythonprojects/jay-genai-portfolio/retail-purchase-prediction/data/raw/online_shoppers_intention.csv',
)
df = pd.read_csv(DATA_PATH)

In [5]:
# Separate the label column from the predictors.
# "Revenue" is the target: True = buyers (the positive class).
target_col = "Revenue"
X = df.drop(columns=[target_col])
y = df[target_col]

In [6]:
# Two-stage stratified split: 80% train pool, 10% validation, 10% test.

# Stage 1 - hold out 20% of the rows as a temporary set.
X_train_pool, X_temp, y_train_pool, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Stage 2 - cut the held-out 20% in half: validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)

# The train pool is used as-is for training.
X_train = X_train_pool
y_train = y_train_pool


In [7]:
# Cast the "Weekend" flag (presumably boolean - confirm against the CSV) to 0/1 integers.
#
# The split frames were derived from `df` in the earlier cells, and the column
# lists used by the pipeline refer to "Weekend", so adding a new lowercase
# "weekend" column to `df` alone never reaches the model. Apply the cast to the
# frames that are actually fed to the pipeline. `assign` returns new frames, so
# X_train_pool is left untouched.
X_train, X_val, X_test = (
    part.assign(Weekend=part["Weekend"].astype(int))
    for part in (X_train, X_val, X_test)
)

# Kept so that anything reading the lowercase column on `df` continues to work.
df["weekend"] = df["Weekend"].astype(int)

In [8]:
# Columns routed to the one-hot encoder.
cat_cols = [
    "Month",
    "VisitorType",
    "OperatingSystems",
    "Browser",
    "Region",
    "TrafficType",
]

# Columns passed to the model as-is. They are assembled from two groups plus
# the "Weekend" flag; the resulting order is the order the model sees.
page_activity_cols = [
    "Administrative",
    "Administrative_Duration",
    "Informational",
    "Informational_Duration",
    "ProductRelated",
    "ProductRelated_Duration",
]
session_metric_cols = ["BounceRates", "ExitRates", "PageValues", "SpecialDay"]
num_cols = [*page_activity_cols, *session_metric_cols, "Weekend"]


In [9]:
# Preprocessing step: one-hot encode the categorical columns (dense output,
# handle_unknown="ignore") and pass the numeric columns through unchanged.
categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

preprocess = ColumnTransformer(
    [
        ("cat", categorical_encoder, cat_cols),
        ("num", "passthrough", num_cols),
    ]
)


In [10]:
# Baseline model: a 200-tree random forest with a fixed seed, using all
# available cores (n_jobs=-1).
forest_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
rf = RandomForestClassifier(**forest_params)

# Full pipeline: preprocessing followed by the forest.
pipe = Pipeline([("preprocess", preprocess), ("model", rf)])


In [11]:
# Fit the baseline pipeline on the training split.
pipe.fit(X_train, y_train)

# Hard labels and column-1 probabilities for the validation split.
val_pred = pipe.predict(X_val)
val_proba = pipe.predict_proba(X_val)[:, 1]

print("Validation metrics")
# Label-based metrics share the same call shape, so report them in one loop.
label_metrics = (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1       :", f1_score),
)
for caption, metric_fn in label_metrics:
    print(caption, metric_fn(y_val, val_pred, zero_division=0))

# ROC-AUC is computed from the probabilities rather than the hard labels.
print("ROC-AUC  :", roc_auc_score(y_val, val_proba))

val_report = classification_report(y_val, val_pred, zero_division=0)
print("\nClassification Report:\n", val_report)


Validation metrics
Precision: 0.776
Recall   : 0.5078534031413613
F1       : 0.6139240506329114
ROC-AUC  : 0.9324371175045975

Classification Report:
               precision    recall  f1-score   support

       False       0.92      0.97      0.94      1042
        True       0.78      0.51      0.61       191

    accuracy                           0.90      1233
   macro avg       0.85      0.74      0.78      1233
weighted avg       0.89      0.90      0.89      1233



In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score

# Hyper-parameter search over the random-forest step of `pipe`.

# 5-fold stratified CV, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics recorded for every candidate; `refit` below decides which one picks the winner.
scoring = {
    # zero_division=0 matches the other metric calls in this notebook and avoids
    # undefined-precision warnings on folds where a candidate predicts no positives.
    "precision": make_scorer(precision_score, pos_label=True, zero_division=0),
    "f1": "f1",
    "roc_auc": "roc_auc",
    "avg_prec": "average_precision",  # area under PR curve
}

# 2 * 4 * 4 * 3 * 3 * 2 = 576 candidates; with 5 folds that is 2880 fits,
# so expect this cell to run for a long time.
param_grid = {
    "model__class_weight": [None, "balanced"],
    "model__max_depth": [None, 6, 10, 14],
    "model__min_samples_leaf": [1, 3, 5, 10],
    "model__min_samples_split": [2, 5, 10],
    "model__max_features": ["sqrt", 0.5, None],
    "model__n_estimators": [200, 400],
}

grid = GridSearchCV(
    estimator=pipe,                # your preprocessing + model pipeline
    param_grid=param_grid,
    scoring=scoring,               # multiple metrics
    refit="precision",             # or "f1" depending on your business choice
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

grid.fit(X_train, y_train)
best = grid.best_estimator_
print("Best params:", grid.best_params_)
print("CV best precision:", grid.cv_results_["mean_test_precision"][grid.best_index_])

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Evaluate the tuned pipeline on the validation split.
# Requires the grid-search cell to have completed, since `grid` is defined there.
best = grid.best_estimator_

# Validation
val_pred = best.predict(X_val)
val_proba = best.predict_proba(X_val)[:, 1]

print("=== Validation Metrics (threshold=0.50) ===")
# zero_division=0 keeps these calls consistent with the other metric cells in
# this notebook and reports 0 instead of warning when no positives are predicted.
print("Precision:", precision_score(y_val, val_pred, zero_division=0))
print("Recall   :", recall_score(y_val, val_pred, zero_division=0))
print("F1       :", f1_score(y_val, val_pred, zero_division=0))
print("ROC-AUC  :", roc_auc_score(y_val, val_proba))
print("\nClassification Report:\n", classification_report(y_val, val_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_pred))

NameError: name 'grid' is not defined

In [None]:
import numpy as np

# Sweep decision thresholds over the validation probabilities and pick the one
# with the highest precision among those whose recall stays at or above MIN_RECALL.
MIN_RECALL = 0.45  # lowest recall considered acceptable for the chosen threshold

thresholds = np.linspace(0.50, 0.90, 21)  # sweep 0.50 → 0.90 in steps of 0.02
records = []
for t in thresholds:
    preds_t = (val_proba >= t).astype(int)
    p = precision_score(y_val, preds_t, zero_division=0)
    r = recall_score(y_val, preds_t, zero_division=0)
    f = f1_score(y_val, preds_t, zero_division=0)
    records.append((t, p, r, f))

# Enforce the recall floor before ranking. Ranking by precision alone would
# favour thresholds that flag only a handful of sessions (near-zero recall).
eligible = [rec for rec in records if rec[2] >= MIN_RECALL]
if eligible:
    # precision desc, then recall desc
    records_sorted = sorted(eligible, key=lambda x: (-x[1], -x[2]))
else:
    # Nothing meets the floor: take the threshold that comes closest to it
    # (highest recall, ties broken by precision) so best_t is always defined.
    print(f"No threshold reaches recall >= {MIN_RECALL:.2f}; falling back to the highest-recall threshold.")
    records_sorted = sorted(records, key=lambda x: (-x[2], -x[1]))

best_t, best_p, best_r, best_f = records_sorted[0]
print(f"\nBest threshold on VAL (by precision): t={best_t:.2f} | precision={best_p:.3f} | recall={best_r:.3f} | f1={best_f:.3f}")

In [None]:
# Final model: refit on train + validation, then score on the test split.
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

# NOTE(review): `final_model` is the same object as grid.best_estimator_ (no copy
# is made), so the fit below also changes what `grid.best_estimator_` refers to.
final_model = grid.best_estimator_
final_model.fit(X_train_full, y_train_full)

# Test-set probabilities, cut at the threshold chosen on validation
# (use 0.50 instead if the threshold sweep was skipped).
test_proba = final_model.predict_proba(X_test)[:, 1]
t = best_t  # or 0.50
test_pred = (test_proba >= t).astype(int)

print(f"\n=== TEST Metrics (threshold={t:.2f}) ===")
# Label-based metrics share one call shape, so report them in a loop.
test_label_metrics = (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1       :", f1_score),
)
for caption, metric_fn in test_label_metrics:
    print(caption, metric_fn(y_test, test_pred, zero_division=0))
print("ROC-AUC  :", roc_auc_score(y_test, test_proba))

test_report = classification_report(y_test, test_pred, zero_division=0)
print("\nClassification Report:\n", test_report)
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))

In [None]:
# Map the forest's feature importances back to readable column names.
#
# The fitted encoder and the fitted forest are both taken from `final_model`, so
# the names and the importances always come from the same fit. The column lists
# are reused from cat_cols / num_cols instead of being retyped here.
fitted_preprocess = final_model.named_steps["preprocess"]
ohe = fitted_preprocess.named_transformers_["cat"]
ohe_features = list(ohe.get_feature_names_out(input_features=cat_cols))
num_features = list(num_cols)

# Same order as the ColumnTransformer definition: "cat" block first, then "num".
all_features = ohe_features + num_features

rf = final_model.named_steps["model"]
if len(all_features) != len(rf.feature_importances_):
    # Fail with a clear message rather than a generic pandas length error.
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match the number of "
        f"importances ({len(rf.feature_importances_)})."
    )

imp = pd.Series(rf.feature_importances_, index=all_features).sort_values(ascending=False)
print("\nTop 15 features driving predictions:\n", imp.head(15))