SyntaxError: invalid character '│' (U+2502) (1609920148.py, line 2)

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [2]:
# Read the customer table; the relative path is resolved against the
# notebook's working directory.
customers_csv = "../data/customers.csv"
df = pd.read_csv(customers_csv)

# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,customer_id,region,age,purchase_frequency,avg_order_value,discount_usage_rate,last_purchase_days,total_spend,churn_flag
0,1,East,57,7,1918.270827,0.551,25,13427.895791,0
1,2,West,24,7,1816.876354,0.140614,327,12718.134477,1
2,3,North,49,2,2461.199375,0.842197,306,4922.398749,1
3,4,East,36,6,2092.406587,0.10689,328,12554.439524,1
4,5,East,23,8,1188.423009,0.094429,36,9507.384074,0


In [5]:
# Remove the customer_id identifier so it is not used as a model feature.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=["customer_id"], errors="ignore")

In [7]:
# Display the frame to confirm that customer_id has been removed.
df

Unnamed: 0,region,age,purchase_frequency,avg_order_value,discount_usage_rate,last_purchase_days,total_spend,churn_flag
0,East,57,7,1918.270827,0.551000,25,13427.895791,0
1,West,24,7,1816.876354,0.140614,327,12718.134477,1
2,North,49,2,2461.199375,0.842197,306,4922.398749,1
3,East,36,6,2092.406587,0.106890,328,12554.439524,1
4,East,23,8,1188.423009,0.094429,36,9507.384074,0
...,...,...,...,...,...,...,...,...
9995,West,48,3,2518.487980,0.387981,295,7555.463941,1
9996,East,46,3,2031.038171,0.114810,10,6093.114513,0
9997,South,58,7,1709.794252,0.301143,256,11968.559764,1
9998,South,28,2,1839.757405,0.808163,272,3679.514810,1


In [9]:
# One-hot encode the region column. drop_first=True omits the first
# category, which then acts as the implicit baseline.
df = pd.get_dummies(
    data=df,
    columns=["region"],
    drop_first=True,
)

In [11]:
# Display the frame to inspect the new region_* indicator columns.
df

Unnamed: 0,age,purchase_frequency,avg_order_value,discount_usage_rate,last_purchase_days,total_spend,churn_flag,region_North,region_South,region_West
0,57,7,1918.270827,0.551000,25,13427.895791,0,False,False,False
1,24,7,1816.876354,0.140614,327,12718.134477,1,False,False,True
2,49,2,2461.199375,0.842197,306,4922.398749,1,True,False,False
3,36,6,2092.406587,0.106890,328,12554.439524,1,False,False,False
4,23,8,1188.423009,0.094429,36,9507.384074,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...
9995,48,3,2518.487980,0.387981,295,7555.463941,1,False,False,True
9996,46,3,2031.038171,0.114810,10,6093.114513,0,False,False,False
9997,58,7,1709.794252,0.301143,256,11968.559764,1,False,True,False
9998,28,2,1839.757405,0.808163,272,3679.514810,1,False,True,False


In [13]:
# Separate the label (y) from the predictor columns (X).
target_column = "churn_flag"
y = df[target_column]
X = df.drop(columns=[target_column])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Hyperparameters for the gradient-boosted churn classifier.
xgb_params = {
    "n_estimators": 200,      # number of boosted trees
    "learning_rate": 0.05,    # shrinkage applied to each tree's contribution
    "max_depth": 4,           # maximum depth of each tree
    "random_state": 42,       # fixed seed for reproducibility
    "eval_metric": "logloss",
}
model = XGBClassifier(**xgb_params)

# Fit on the training split. As the last expression of the cell, the
# fitted estimator is also what the notebook displays.
model.fit(X_train, y_train)

In [19]:
# Column 1 of predict_proba holds the probability of the positive class
# (churn_flag = 1).
class_probabilities = model.predict_proba(X_test)
probabilities = class_probabilities[:, 1]

# Hard 0/1 labels for the same hold-out rows.
predictions = model.predict(X_test)

In [21]:
# Score the hold-out set: accuracy uses the hard labels, AUC uses the
# positive-class probabilities.
auc = roc_auc_score(y_test, probabilities)
accuracy = accuracy_score(y_test, predictions)

# NOTE(review): the recorded run reports 1.0 for both metrics. A perfect
# hold-out score usually points to target leakage (presumably churn_flag
# is derived from a feature such as last_purchase_days) - confirm before
# relying on this model.
for label, value in (("Accuracy:", accuracy), ("AUC Score:", auc)):
    print(label, value)

# Per-class precision, recall, F1 and support.
print("\nClassification Report:\n")
report = classification_report(y_test, predictions)
print(report)

Accuracy: 1.0
AUC Score: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1124
           1       1.00      1.00      1.00       876

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [23]:
# Rows are the actual classes and columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[1124    0]
 [   0  876]]


In [25]:
# ============================================================
# DAY 3 - CHURN PREDICTION USING XGBOOST (CLASSIFICATION)
# PROJECT: E-Commerce Executive AI Decision Agent
# ============================================================

# ------------------------------------------------------------
# OBJECTIVE:
# Build a churn prediction model to identify customers
# likely to leave (churn).
#
# This is a Binary Classification Problem:
#   churn_flag = 1 → Customer churned
#   churn_flag = 0 → Customer active
# ------------------------------------------------------------


# ============================================================
# IMPORTANT LIBRARIES
# ============================================================

# pandas (pd)
# - Used for loading and manipulating dataset

# numpy (np)
# - Used for numerical operations

# XGBClassifier (from xgboost)
# - Gradient Boosting model for classification
# - Strong performance on tabular structured data

# train_test_split
# - Used to split dataset into training and testing

# sklearn.metrics
# - accuracy_score → overall correct predictions
# - classification_report → precision, recall, F1-score
# - roc_auc_score → ability to distinguish churn vs non-churn
# - confusion_matrix → detailed prediction breakdown


# ============================================================
# KEY MACHINE LEARNING CONCEPTS
# ============================================================

# 1. BINARY CLASSIFICATION
#    Predicting one of two classes:
#        0 or 1

# 2. FEATURE ENGINEERING
#    Removing irrelevant columns (customer_id)
#    Encoding categorical variables (region)

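# 3. ONE-HOT ENCODING
#    Converts categorical variables into numeric format.
#    Example:
#        Region = North, South
#        Becomes:
#            region_North
#            region_South
#    Note: this notebook passes drop_first=True, so the first
#    category (East) gets no column and serves as the baseline.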

# 4. TRAIN-TEST SPLIT
#    80% → Training
#    20% → Testing
#    random_state ensures reproducibility.

# 5. XGBOOST CLASSIFIER
#    Gradient Boosting algorithm.
#    Builds trees sequentially to reduce classification error.


# ============================================================
# IMPORTANT MODEL PARAMETERS
# ============================================================

# n_estimators:
#     Number of trees built.
#     More trees = stronger learning capacity.

# learning_rate:
#     Controls how fast the model learns.
#     Smaller = slower but more stable learning.

# max_depth:
#     Controls tree complexity.
#     Higher depth = more complex decision boundaries.

# eval_metric="logloss":
#     Metric XGBoost computes on evaluation sets during training.
#     Log loss penalizes confident but wrong probability estimates.


# ============================================================
# EVALUATION METRICS (VERY IMPORTANT FOR INTERVIEWS)
# ============================================================

# 1. ACCURACY
#    (Correct Predictions / Total Predictions)
#
#    Problem:
#    Can be misleading if data is imbalanced.

# 2. PRECISION
#    Of predicted churn customers,
#    how many actually churned?

#    Formula:
#    TP / (TP + FP)

# 3. RECALL (Very Important for churn)
#    Of actual churn customers,
#    how many did we correctly detect?

#    Formula:
#    TP / (TP + FN)

# 4. F1-SCORE
#    Harmonic mean of Precision and Recall.
#    Balances both metrics.

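# 5. AUC (Area Under ROC Curve)
#    Measures the model's ability to separate classes.
#
#    0.5 → Random guessing
#    0.7 → Acceptable
#    0.8+ → Good
#    0.9+ → Excellent
#    1.0 → Perfect separation; rare on real data, so check
#          for target leakage before trusting the result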


# ============================================================
# CONFUSION MATRIX INTERPRETATION
# ============================================================

# Matrix format:
#
#              Predicted 0   Predicted 1
# Actual 0         TN            FP
# Actual 1         FN            TP
#
# TN → True Negative
# FP → False Positive
# FN → False Negative
# TP → True Positive
#
# For churn prediction:
# We care more about minimizing FN
# (missing actual churn customers).


# ============================================================
# MODEL PIPELINE SUMMARY
# ============================================================

# Step 1:
#   Load customers.csv

# Step 2:
#   Remove unnecessary columns (customer_id)

# Step 3:
#   Convert categorical variables using one-hot encoding

# Step 4:
#   Define X (features) and y (target)

# Step 5:
#   Train-test split

# Step 6:
#   Train XGBClassifier

# Step 7:
#   Predict churn labels

# Step 8:
#   Evaluate using:
#       Accuracy
#       Precision
#       Recall
#       F1-score
#       AUC
#       Confusion Matrix


# ============================================================
# BUSINESS INTERPRETATION
# ============================================================

# The churn model helps answer:
# - Which customers are high risk?
# - What percentage of customers may leave?
# - How much revenue is at risk?
#
# This enables:
# - Targeted retention campaigns
# - Personalized discounts
# - Marketing optimization


# ============================================================
# INTERVIEW-READY EXPLANATION
# ============================================================

# "I built a churn prediction model using XGBoost classification.
# The model was evaluated using AUC and F1-score to ensure strong
# discrimination capability. High-risk customers were identified
# for proactive retention strategies."

# ============================================================
# END OF DAY 3 NOTES
# ============================================================

In [27]:
import joblib
from pathlib import Path

# Serialize the fitted classifier to disk.
model_file = "../models/churn_model.pkl"

# Create the target folder first: the dump would otherwise fail with
# FileNotFoundError when ../models does not exist yet.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_file)

print("Churn model saved successfully ✅")

Churn model saved successfully ✅


In [29]:
import os  # standard library; only needed for the working-directory lookup

# Print the kernel's current working directory, i.e. the folder that the
# relative "../data" and "../models" paths are resolved against.
# NOTE(review): the recorded output exposes an absolute local path
# (including a user name) - clear it before sharing the notebook.
working_dir = os.getcwd()
print(working_dir)

C:\Users\LAKSHAY\Python files\project_llm\notebooks
