<a href="https://colab.research.google.com/github/khietvuarong/ML-Basics-Exercise/blob/main/Part_3_Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Part 3: K-Means Customer Segmentation
# Dataset Source:
# Mall Customers Dataset
# https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python
#
# Extra Credit:
# Housing Demand Forecast using Linear Regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


print("----- PART 3: CUSTOMER SEGMENTATION -----")

# 1. Load Dataset
df = pd.read_csv("Mall_Customers.csv")

# 2. Select Features
features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
X = df[features]

# 3. Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Elbow Method
inertia = []
K_range = range(1, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K_range, inertia, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.grid(True)
plt.savefig("elbow_plot.png")
plt.close()

print("Elbow plot saved as elbow_plot.png")

# 5. Apply K-Means (K=5 chosen based on elbow)
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# 6. Evaluate Clustering
sil_score = silhouette_score(X_scaled, df['Cluster'])
print(f"Silhouette Score: {sil_score:.3f}")

# 7. Analyze Clusters
cluster_summary = df.groupby('Cluster')[features].mean().round(2)
print("\nCluster Characteristics:")
print(cluster_summary)

# 8. Marketing Strategies
for cluster in cluster_summary.index:
    print(f"\nCluster {cluster} Strategy:")

    income = cluster_summary.loc[cluster, 'Annual Income (k$)']
    spending = cluster_summary.loc[cluster, 'Spending Score (1-100)']

    if income > 70 and spending > 60:
        print("High income & high spending → VIP loyalty program.")
    elif income > 70 and spending < 40:
        print("High income but low spending → Targeted premium promotions.")
    elif income < 40 and spending > 60:
        print("Lower income but high spending → Bundle discounts.")
    else:
        print("Moderate segment → Seasonal marketing campaigns.")

# 9. Visualize Clusters
plt.figure(figsize=(8,6))
plt.scatter(df['Annual Income (k$)'],
            df['Spending Score (1-100)'],
            c=df['Cluster'])
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Customer Segments")
plt.grid(True)
plt.savefig("customer_clusters.png")
plt.close()

print("Cluster visualization saved as customer_clusters.png")

# 10. Save Cluster Results
df.to_csv("customer_segments.csv", index=False)
print("Clustered data saved as customer_segments.csv")

# ==========================================================
# ================= EXTRA CREDIT ===========================
# ==========================================================

print("\n----- EXTRA CREDIT: HOUSING FORECAST -----")

# Example housing demand dataset structure:
# Month_Number, Demand
# 1, 120
# 2, 135
# 3, 150
# ...

try:
    housing_df = pd.read_csv("housing_demand.csv")

    X = housing_df[['Month_Number']]
    y = housing_df['Demand']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Forecast next 6 months
    last_month = housing_df['Month_Number'].max()
    future_months = pd.DataFrame({
        'Month_Number': range(last_month + 1, last_month + 7)
    })

    forecast = model.predict(future_months)

    # Plot
    plt.figure(figsize=(8,5))
    plt.plot(housing_df['Month_Number'], housing_df['Demand'], label='Historical')
    plt.plot(future_months['Month_Number'], forecast, label='Forecast', linestyle='--')
    plt.xlabel("Month")
    plt.ylabel("Housing Demand")
    plt.title("Housing Demand Forecast (Next 6 Months)")
    plt.legend()
    plt.grid(True)
    plt.savefig("housing_forecast.png")
    plt.close()

    print("Forecast saved as housing_forecast.png")

except FileNotFoundError:
    print("housing_demand.csv not found. Extra credit section skipped.")


----- PART 3: CUSTOMER SEGMENTATION -----
Elbow plot saved as elbow_plot.png
Silhouette Score: 0.417

Cluster Characteristics:
           Age  Annual Income (k$)  Spending Score (1-100)
Cluster                                                   
0        46.25               26.75                   18.35
1        25.19               41.09                   62.24
2        32.88               86.10                   81.53
3        39.87               86.10                   19.36
4        55.64               54.38                   48.85

Cluster 0 Strategy:
Moderate segment → Seasonal marketing campaigns.

Cluster 1 Strategy:
Moderate segment → Seasonal marketing campaigns.

Cluster 2 Strategy:
High income & high spending → VIP loyalty program.

Cluster 3 Strategy:
High income but low spending → Targeted premium promotions.

Cluster 4 Strategy:
Moderate segment → Seasonal marketing campaigns.
Cluster visualization saved as customer_clusters.png
Clustered data saved as customer_segments.cs