In [14]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Apply seaborn's global plot theme once, up front, for the whole notebook.
# `set_theme` is the preferred entry point; `sns.set` is only an alias for it
# that seaborn documents as a candidate for removal.
sns.set_theme(style="whitegrid", palette="muted")

In [15]:
# Load the customer table from a CSV file addressed by a relative path.
customers_csv = "../data/customers_50k.csv"
df = pd.read_csv(customers_csv)

In [16]:
# Target: the class label the models learn to predict.
y = df["Purchased"]

# Features: every remaining column except the row identifier, the target
# itself and Probability_Score.
# NOTE(review): Probability_Score is presumably derived from the outcome and
# is excluded to avoid target leakage; confirm against the data dictionary.
non_feature_columns = ["Customer_ID", "Probability_Score", "Purchased"]
X = df.drop(columns=non_feature_columns)


In [17]:
# Columns routed to the numeric branch of the preprocessing step.
numeric_features = [
    "Age",
    "Annual_Income",
    "Family_Size",
    "Credit_Score",
    "Travel_Frequency",
    "Web_Visits",
    "Email_Engagement",
]

# Columns routed to the categorical branch of the preprocessing step.
categorical_features = [
    "Gender",
    "Marital_Status",
    "Previous_Package",
]

In [18]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [19]:
# Numeric branch: standardize each column.
numeric_transformer = StandardScaler()

# Categorical branch: one-hot encode. With handle_unknown="ignore", a category
# that was not seen during fit is encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples; columns not listed in either feature
# list are dropped by ColumnTransformer's default remainder setting.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [20]:
# Each pipeline receives its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place (they are only cloned when `memory` is
# set), so passing the same ColumnTransformer instance to both pipelines would
# make them share one fitted transformer: fitting the second model would
# silently refit the transformer used by the first.

# Logistic Regression
log_reg_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000))
])

log_reg_model.fit(X_train, y_train)

# Random Forest
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
def evaluate_model(name, model, X_test, y_test):
    """Print a held-out evaluation report for a fitted classifier.

    Prints the classification report followed by accuracy, precision, recall
    and F1 computed from ``model.predict``. When the model exposes
    ``predict_proba``, ROC-AUC is also printed, using the second probability
    column as the score.

    Parameters
    ----------
    name : str
        Label shown in the report header.
    model : estimator
        Fitted object with a ``predict`` method and, optionally,
        ``predict_proba``.
    X_test, y_test
        Evaluation features and their true labels.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    predictions = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    print(f"\n📊 Model: {name}")
    print(classification_report(y_test, predictions))

    # Label/metric pairs, printed in this order.
    headline_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in headline_metrics:
        print(label, metric(y_test, predictions))

    if positive_scores is not None:
        print("ROC-AUC:", roc_auc_score(y_test, positive_scores))

In [22]:
# Report both fitted pipelines on the same held-out split, in a fixed order.
fitted_models = (
    ("Logistic Regression", log_reg_model),
    ("Random Forest", rf_model),
)
for model_name, fitted_model in fitted_models:
    evaluate_model(model_name, fitted_model, X_test, y_test)


📊 Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      5499
           1       0.86      0.86      0.86      4501

    accuracy                           0.88     10000
   macro avg       0.88      0.87      0.87     10000
weighted avg       0.88      0.88      0.88     10000

Accuracy: 0.8762
Precision: 0.8642554141549453
Recall: 0.8600311041990669
F1 Score: 0.8621380846325167
ROC-AUC: 0.9544396975653389

📊 Model: Random Forest
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      5499
           1       0.86      0.85      0.86      4501

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy: 0.8715
Precision: 0.8611859838274932
Recall: 0.851810708731393
F1 Score: 0.856472690718195
ROC-AUC: 0.947234150023601


In [23]:
# Pick the model to persist from measured test performance instead of
# hard-coding it. With the scores printed by the evaluation cell, Logistic
# Regression is ahead of Random Forest on accuracy, F1 and ROC-AUC, so pinning
# `rf_model` here stored the weaker model under the "best" label.
# ROC-AUC is the criterion because it does not depend on a decision threshold.
# NOTE(review): ranking on the test split makes the reported test score
# slightly optimistic; a validation split or cross-validation would be cleaner.
candidate_models = {
    "Logistic Regression": log_reg_model,
    "Random Forest": rf_model,
}
test_roc_auc = {
    model_name: roc_auc_score(y_test, fitted.predict_proba(X_test)[:, 1])
    for model_name, fitted in candidate_models.items()
}
best_model_name = max(test_roc_auc, key=test_roc_auc.get)
best_model = candidate_models[best_model_name]
print(f"Selected model: {best_model_name} (test ROC-AUC = {test_roc_auc[best_model_name]:.4f})")

In [24]:
# Serialize the selected pipeline to disk with joblib.
model_path = "../models/classifier_new.pkl"
joblib.dump(best_model, model_path)

print("\n Best classification model saved as 'classifier_new.pkl'")


 Best classification model saved as 'classifier_new.pkl'
