# 03_Classifier_Model
-------------
Build a basic ML model that classifies each customer into a single primary segment.

In [0]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [0]:
# 2. Data Loading
# Read the gold customer-engagement table through Spark, then materialise it
# as a pandas DataFrame for the in-memory modelling steps that follow.
gold_table_query = "SELECT * FROM users.landan_george.customer_engagement_gold"
sdf = spark.sql(gold_table_query)
df = sdf.toPandas()
df.head()

In [0]:
# 3. Rule-based Segment Assignment (Target Variable)
def assign_segment(row):
    """Return the rule-based segment label for one customer record.

    ``row`` is anything indexable by column name (a DataFrame row or a dict).
    Rules are checked in order and the first match wins; a record matching
    no rule is labelled "Other". A column is only read if its rule is reached.
    """
    # Ordered (label, predicate) pairs; order encodes rule precedence.
    rules = (
        ("Loyalists",
         lambda r: r['LoyaltyScore'] > 80 and r['TotalPurchases'] >= 10),
        ("Bargain Seekers",
         lambda r: r['DiscountUsageRate'] > 0.5 and r['LoyaltyScore'] < 60),
        ("High Value",
         lambda r: r['AvgOrderValue'] > 70 and r['Returns'] == 0),
        ("Churn Risk",
         lambda r: r['DaysSinceLastPurchase'] > 90 and r['LoyaltyScore'] < 50),
    )
    for label, matches in rules:
        if matches(row):
            return label
    return "Other"
      
# Label every customer with its rule-based segment, then keep only the rows
# that matched a named rule; "Other" rows are dropped before training.
df["Segment"] = df.apply(assign_segment, axis=1)
is_named_segment = df["Segment"] != "Other"
df = df[is_named_segment]

In [0]:
# Persist the labelled pandas frame back to the metastore, replacing any
# previous version of the annotated table.
sdf = spark.createDataFrame(df)
(
    sdf.write
    .mode("overwrite")
    .saveAsTable("users.landan_george.customer_segment_annotated")
)

In [0]:
# 4. Prepare Features and Labels
# Columns fed to the classifier (order is preserved in X).
features = [
    'TotalPurchases', 'AvgOrderValue', 'TotalSpent', 'DaysSinceLastPurchase',
    'Age', 'LoyaltyScore', 'Returns', 'DiscountUsageRate',
    'EmailOpens', 'WebVisits', 'MobileAppSessions',
]

# X: feature matrix; y: the rule-based segment label used as the target.
X = df.loc[:, features]
y = df.loc[:, 'Segment']


In [0]:
# 5. Train/Test Split
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# held-out rows influence the mean/variance used to transform the training
# data (preprocessing leakage). Stratify on y to keep segment proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=23
)

# 6. Preprocessing
# Learn the scaling parameters from the training rows only, then apply the
# same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole matrix under the train-fitted scaler; kept under its original name
# for any cell that still expects `X_scaled`.
X_scaled = scaler.transform(X)

# 7. Train Classifier
clf = RandomForestClassifier(n_estimators=25, random_state=42)
clf.fit(X_train, y_train)

# 8. Evaluate
# NOTE(review): the target comes from assign_segment, whose rules read columns
# that are also in `features`, so this report mostly measures how well the
# forest re-learns those rules.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


### Method 2: RFM + loyalty scoring with K-means clustering

In [0]:
# Reload the gold table from scratch: `df` was filtered to labelled segments
# earlier in the notebook, and this method starts from the full customer set.
gold_table_query = "SELECT * FROM users.landan_george.customer_engagement_gold"
sdf = spark.sql(gold_table_query)
df = sdf.toPandas()

In [0]:
# Keep the customer key plus the four raw metrics that get scored below.
rfm_source_columns = [
    'CustomerID',
    'TotalPurchases',
    'DaysSinceLastPurchase',
    'TotalSpent',
    'LoyaltyScore',
]
df2 = df.loc[:, rfm_source_columns]

In [0]:
# Switch the raw metric columns to the Recency/Frequency/Monetary/Loyalty
# names used by every later cell.
rfm_column_names = {
    'TotalPurchases': 'Frequency',
    'DaysSinceLastPurchase': 'Recency',
    'TotalSpent': 'MonetaryValue',
    'LoyaltyScore': 'Loyalty',
}
df2 = df2.rename(columns=rfm_column_names)

In [0]:
# Show the last rows of the renamed frame.
df2.tail()

In [0]:
# # # Calculate custom bin edges for Recency, Frequency, and Monetary scores
# # recency_bins = [1, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 365]
# # frequency_bins = [df2['Frequency'].min() - 1, 3, 6, 9, 12, df2['Frequency'].max()]
# # monetary_bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# # loyalty_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [0]:
# Bin edges for the Recency, Frequency, Monetary and Loyalty scores.
# Outer edges come from the data (the lower one offset below the observed
# minimum); interior edges are hand-picked cut points.
def _bin_edges(column, lower_offset, interior):
    """Return [min - lower_offset, *interior, max] for a column of df2."""
    values = df2[column]
    return [values.min() - lower_offset, *interior, values.max()]


recency_bins = _bin_edges('Recency', 1, [20, 50, 150, 250])
frequency_bins = _bin_edges('Frequency', 1, [2, 3, 10, 100])
monetary_bins = _bin_edges('MonetaryValue', 3, [300, 600, 2000, 5000])
loyalty_bins = _bin_edges('Loyalty', 10, [25, 50, 75])

In [0]:
# pd.cut needs increasing edges. Each list mixes hard-coded cut points with
# the observed min/max, so sort all four (loyalty included) before binning.
recency_bins = sorted(recency_bins)
frequency_bins = sorted(frequency_bins)
monetary_bins = sorted(monetary_bins)
loyalty_bins = sorted(loyalty_bins)


def _bin_score(values, edges):
    """Return a 1-based integer score per value: the position of its bin.

    The number of labels is derived from ``edges`` so edge lists of different
    lengths (six edges for R/F/M, five for loyalty) all work.
    """
    labels = range(1, len(edges))
    return pd.cut(values, bins=edges, labels=labels, include_lowest=True).astype(int)


# Recency: bin 1 holds the smallest day counts (most recent purchases).
# Invert the rank so that a HIGHER R_Score means a MORE recent purchase;
# with six edges this is 6 - rank, i.e. the usual 5..1 scale.
recency_rank = _bin_score(df2['Recency'], recency_bins)
df2['R_Score'] = len(recency_bins) - recency_rank

# Frequency and Monetary: higher raw value -> higher score.
df2['F_Score'] = _bin_score(df2['Frequency'], frequency_bins)
df2['M_Score'] = _bin_score(df2['MonetaryValue'], monetary_bins)

# Loyalty: the clustering and summary cells read 'L_Score', so it has to be
# created here. loyalty_bins has five edges, giving scores 1..4.
df2['L_Score'] = _bin_score(df2['Loyalty'], loyalty_bins)

In [0]:
# Show the first three rows, including the score columns added above.
df2.head(3)

In [0]:
# Feature matrix for K-means: recency, frequency, monetary and loyalty scores.
# (RFM-only variant: remove 'L_Score' from the list.)
cluster_feature_columns = ['R_Score', 'F_Score', 'M_Score', 'L_Score']
X = df2.loc[:, cluster_feature_columns]

In [0]:
# Show the first rows of the clustering feature matrix.
X.head()

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [0]:
# Fit K-means once per candidate k and record its inertia
# (sum of squared distances of samples to their closest centroid).
k_values = range(2, 15)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia against k.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 6), dpi=150)
elbow_ax.plot(k_values, inertia, marker='o')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Curve for K-means Clustering')
elbow_ax.grid(True)
plt.show()

In [0]:
# Final clustering with k=4 (presumably read off the elbow curve; confirm).
# The cluster id of every row is stored on df2.
best_kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels = best_kmeans.fit_predict(X)
df2['Cluster'] = cluster_labels

In [0]:
score_columns = ['R_Score', 'F_Score', 'M_Score', 'L_Score']

# Coerce every score column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
for column in score_columns:
    df2[column] = pd.to_numeric(df2[column], errors='coerce')

# Per-cluster MEDIAN of each score, with Cluster turned back into a column.
cluster_summary = (
    df2.groupby('Cluster')
    .agg({column: 'median' for column in score_columns})
    .reset_index()
)

In [0]:
# # Group by cluster and calculate mean values
# cluster_summary = df2.groupby('Cluster').agg({
#     'R_Score': 'mean',
#     'F_Score': 'mean',
#     'M_Score': 'mean',
#     'L_Score': 'mean'
# }).reset_index()

In [0]:
# Print the per-cluster score summary as plain text.
print(cluster_summary)


In [0]:
# Display df2, which now carries the score columns and the Cluster label.
df2

In [0]:
colors = ['#3498db', '#2ecc71', '#f39c12', '#C9B1BD']

# One bar-chart panel per score: (summary column, y-axis label, title).
# To add the loyalty panel, append
# ('L_Score', 'Avg Loyalty', 'Average Loyalty Value for Each Cluster').
# NOTE(review): cluster_summary is currently built with 'median', while these
# labels say "Avg"/"Average"; align the wording with the chosen aggregation.
panels = [
    ('R_Score', 'Avg Recency', 'Average Recency for Each Cluster'),
    ('F_Score', 'Avg Frequency', 'Average Frequency for Each Cluster'),
    ('M_Score', 'Avg Monetary', 'Average Monetary Value for Each Cluster'),
]

# Use the actual cluster ids rather than the positional index of the summary.
cluster_ids = cluster_summary['Cluster'].tolist()
cluster_names = [str(cluster_id) for cluster_id in cluster_ids]

# One row per panel, so the figure has no empty slot.
fig, axes = plt.subplots(len(panels), 1, figsize=(10, 8), dpi=150, squeeze=False)

for ax, (column, ylabel, title) in zip(axes[:, 0], panels):
    # Draw the bars first: the legend needs their handles, and the grid and
    # legend must be applied to this panel's own axes.
    bars = ax.bar(cluster_ids, cluster_summary[column], color=colors)
    ax.set_xticks(cluster_ids)
    ax.set_xlabel('Cluster')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.legend(bars, cluster_names, title='Clusters')

fig.tight_layout()
plt.show()