In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [136]:
df = pd.read_excel("online_retail_data.xlsx", sheet_name = ["Year 2010-2011"]) #reading the excel file

In [149]:
data = df["Year 2010-2011"]

data = data.dropna(subset=["Customer ID"])
data = data[(data["Quantity"] > 0) & (data["Price"] > 0)]
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"])
data["TotalAmount"] = data["Quantity"] * data["Price"]

data.to_csv("cleaned_transactions.csv", index=False)

In [138]:
snapshot = data["InvoiceDate"].max() + dt.timedelta(days=1)

rfm = data.groupby("Customer ID").agg({
    "InvoiceDate": lambda x: (snapshot - x.max()).days,
    "Invoice": "count",
    "TotalAmount": "sum"
})

rfm.columns = ["Recency", "Frequency", "Monetary"]
rfm.head()


Unnamed: 0_level_0,Recency,Frequency,Monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,1,77183.6
12347.0,2,182,4310.0
12348.0,75,31,1797.24
12349.0,19,73,1757.55
12350.0,310,17,334.4


In [139]:
rfm["AOV"] = rfm["Monetary"] / rfm["Frequency"]
rfm["Customer_Lifespan"] = 365 / rfm["Recency"]
rfm["CLV"] = rfm["AOV"] * rfm["Frequency"] * rfm["Customer_Lifespan"]


In [140]:
rfm["Churn"] = (rfm["Recency"] > 90).astype(int)
rfm["Churn"].value_counts()


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,2889
1,1449


In [141]:
X = rfm[["Frequency", "Monetary", "AOV"]]
y = rfm["Churn"]

X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)


In [142]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [145]:

results = []

def evaluate_model(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    auc = roc_auc_score(y_test, y_prob)

    print("\n", name)
    print(classification_report(y_test, y_pred))
    print("AUC:", auc)

    results.append({
        "Model": name,
        "AUC": auc
    })

# 1. Logistic Regression
evaluate_model("Logistic Regression", LogisticRegression(max_iter=1000))

# 2. Random Forest
evaluate_model("Random Forest", RandomForestClassifier(n_estimators=200, random_state=42))

# 3. Decision Tree
evaluate_model("Decision Tree", DecisionTreeClassifier(random_state=42))

# 4. KNN
evaluate_model("KNN", KNeighborsClassifier(n_neighbors=5))

# 5. Gradient Boosting
evaluate_model("Gradient Boosting", GradientBoostingClassifier(random_state=42))



 Logistic Regression
              precision    recall  f1-score   support

           0       0.73      0.88      0.80       867
           1       0.60      0.36      0.45       435

    accuracy                           0.70      1302
   macro avg       0.66      0.62      0.62      1302
weighted avg       0.69      0.70      0.68      1302

AUC: 0.7728618966180116

 Random Forest
              precision    recall  f1-score   support

           0       0.74      0.81      0.77       867
           1       0.53      0.44      0.48       435

    accuracy                           0.69      1302
   macro avg       0.64      0.62      0.63      1302
weighted avg       0.67      0.69      0.68      1302

AUC: 0.697103236155855

 Decision Tree
              precision    recall  f1-score   support

           0       0.71      0.72      0.72       867
           1       0.43      0.42      0.43       435

    accuracy                           0.62      1302
   macro avg       0.57    

In [150]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="AUC", ascending=False).reset_index(drop=True)

print("\nMODEL RANKING (by AUC)")
print(results_df)



MODEL RANKING (by AUC)
                 Model       AUC
0    Gradient Boosting  0.773986
1  Logistic Regression  0.772862
2                  KNN  0.705915
3        Random Forest  0.697103
4        Decision Tree  0.571600


In [147]:
def segment(row):
    if row["CLV"] > rfm["CLV"].quantile(0.75) and row["Churn"] == 0:
        return "VIP"=
    elif row["CLV"] > rfm["CLV"].quantile(0.75) and row["Churn"] == 1:
        return "Rescue"
    elif row["Churn"] == 0:
        return "Regular"
    else:
        return "Low Priority"

rfm["Customer_Type"] = rfm.apply(segment, axis=1)
rfm["Customer_Type"].value_counts()


Unnamed: 0_level_0,count
Customer_Type,Unnamed: 1_level_1
Regular,1808
Low Priority,1445
VIP,1081
Rescue,4


In [148]:
rfm.to_csv("final_customer_analytics.csv")
