In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Load dataset
# The directory holding the CSV files is defined once. It defaults to the
# original location and can be redirected by setting the CUSTOMER_DATA_DIR
# environment variable, so the notebook can run on another machine without
# editing the code.
DATA_DIR = Path(os.environ.get("CUSTOMER_DATA_DIR", r"E:\assignment zeotap\all_tasks"))
customers = pd.read_csv(DATA_DIR / "Customers.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [None]:
# Turn the Region text column into integer codes stored in "RegionEncoded".
# The fitted encoder is kept in `label_encoder`.
# NOTE(review): integer codes imply an ordering between regions that a
# distance-based model will treat as meaningful - confirm this is intended.
label_encoder = LabelEncoder()
label_encoder.fit(customers["Region"])
customers["RegionEncoded"] = label_encoder.transform(customers["Region"])

In [None]:
# Per-customer spending summary built from the transaction table:
#   TotalSpending     - sum of TotalValue
#   AvgPurchaseValue  - mean of TotalValue
#   PurchaseFrequency - count of TransactionID values
transactions_by_customer = transactions.groupby("CustomerID")
customer_spending = transactions_by_customer.agg(
    TotalSpending=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    AvgPurchaseValue=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
    PurchaseFrequency=pd.NamedAgg(column="TransactionID", aggfunc="count"),
)
# Move CustomerID from the index back into a regular column.
customer_spending = customer_spending.reset_index()

In [None]:
# Merge customer profiles with transaction history
# The left join keeps customers that have no transactions; their aggregated
# spending columns come back as NaN and are set to 0. Only those columns are
# filled, so missing values in any other customer column are not silently
# replaced by 0.
spending_columns = ["TotalSpending", "AvgPurchaseValue", "PurchaseFrequency"]
customer_data = customers.merge(customer_spending, on="CustomerID", how="left")
customer_data[spending_columns] = customer_data[spending_columns].fillna(0)

In [None]:
# Columns of customer_data that feed the clustering step
features = [
    "TotalSpending",      # aggregated transaction value
    "AvgPurchaseValue",   # mean transaction value
    "PurchaseFrequency",  # transaction count
    "RegionEncoded",      # integer-coded region
]

In [None]:
# Standardise the selected feature columns with StandardScaler.
# The fitted scaler stays available as `scaler`.
feature_frame = customer_data[features]
scaler = StandardScaler().fit(feature_frame)
customer_data_scaled = scaler.transform(feature_frame)

In [None]:
# Containers for the cluster-count search: one Davies-Bouldin score and one
# silhouette score will be collected for every candidate k from 2 through 9.
db_scores, silhouette_scores = [], []
range_clusters = range(2, 10)

In [None]:
# Score K-Means for every candidate cluster count.
# The score lists are re-created here so that re-running this cell starts from
# empty lists. Appending a second set of scores would leave them longer than
# range_clusters and make the score plots fail on mismatched lengths.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - consider pinning it here and in the final model.
db_scores = []
silhouette_scores = []
for k in range_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(customer_data_scaled)
    # Lower Davies-Bouldin is better; higher silhouette is better.
    db_scores.append(davies_bouldin_score(customer_data_scaled, cluster_labels))
    silhouette_scores.append(silhouette_score(customer_data_scaled, cluster_labels))

In [None]:
# Plot DB Index & Silhouette Score
# Both panels are created and shown in a single cell. Jupyter's inline backend
# renders and closes a figure when its cell finishes, so splitting the figure
# across cells would put the second panel on a new figure and leave the final
# plt.show() with nothing to display.
fig, (ax_db, ax_sil) = plt.subplots(1, 2, figsize=(12, 5))

ax_db.plot(range_clusters, db_scores, marker="o", linestyle="-", color="red")
ax_db.set_xlabel("Number of Clusters")
ax_db.set_ylabel("Davies-Bouldin Index (Lower is Better)")
ax_db.set_title("Optimal Clusters using DB Index")

ax_sil.plot(range_clusters, silhouette_scores, marker="o", linestyle="-", color="blue")
ax_sil.set_xlabel("Number of Clusters")
ax_sil.set_ylabel("Silhouette Score (Higher is Better)")
ax_sil.set_title("Optimal Clusters using Silhouette Score")

# Keep the two panels' axis labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Choose the candidate k whose Davies-Bouldin index is the smallest
# (np.argmin returns the first position of the minimum).
best_position = np.argmin(db_scores)
best_k = range_clusters[best_position]
print(f"Best Number of Clusters: {best_k}")

In [None]:
# Fit the final K-Means model with the selected cluster count and store each
# customer's cluster label in the "Cluster" column.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - consider pinning it here and in the search loop.
final_kmeans = KMeans(n_clusters=best_k, random_state=42).fit(customer_data_scaled)
customer_data["Cluster"] = final_kmeans.labels_

In [None]:
# Davies-Bouldin index of the final clustering, computed from the scaled
# features and the labels stored in the "Cluster" column.
final_labels = customer_data["Cluster"]
db_index_final = davies_bouldin_score(customer_data_scaled, final_labels)
print(f"Final Davies-Bouldin Index: {db_index_final:.2f}")

In [None]:
# 3D scatter of the customer segments: one point per customer, positioned by
# the three (unscaled) spending measures and coloured by cluster label.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")

x_spending = customer_data["TotalSpending"]
y_frequency = customer_data["PurchaseFrequency"]
z_avg_value = customer_data["AvgPurchaseValue"]
scatter = ax.scatter(
    x_spending,
    y_frequency,
    z_avg_value,
    c=customer_data["Cluster"],
    cmap="viridis",
    alpha=0.7,
)

ax.set_xlabel("Total Spending")
ax.set_ylabel("Purchase Frequency")
ax.set_zlabel("Avg Purchase Value")
ax.set_title("Customer Segments in 3D")
# Colour bar maps the viridis colours back to cluster labels.
fig.colorbar(scatter, ax=ax)
plt.show()

In [None]:
# Save clustered data
# Write the customer table, including the "Cluster" column, to the file named
# in the completion message. Without this call the message would report a save
# that never happened.
customer_data.to_csv("Customer_Segments.csv", index=False)
print("Customer segmentation completed! Results saved to Customer_Segments.csv.")