In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Step 1: Load the data
# Read the customer master table and the transaction log from the working directory.
customers, transactions = (
    pd.read_csv(csv_path) for csv_path in ("Customers.csv", "Transactions.csv")
)

In [None]:
# Preview the first five customer rows (head() defaults to n=5).
customers.head()

In [None]:
# Preview the first five transaction rows (head() defaults to n=5).
transactions.head()

In [None]:
# Step 2: Data Preprocessing
# Aggregate the transaction log into one row per customer: total spend,
# mean spend per transaction, total quantity and date of the latest transaction.
#
# TransactionDate is parsed to datetime *before* taking the max. The CSV is
# read without date parsing, so the raw column is text, and 'max' on text
# compares lexicographically -- that only yields the latest date when every
# value happens to be in a sortable (ISO-like) format.
# .assign() works on a copy, so the `transactions` frame itself is untouched.
customer_transaction_summary = (
    transactions
    .assign(TransactionDate=pd.to_datetime(transactions['TransactionDate']))
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        AverageTransactionValue=('TotalValue', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
        LastTransactionDate=('TransactionDate', 'max'),
    )
    .reset_index()
)

In [None]:
# Merge with customer data
# Both date columns are converted to datetime first so they can be subtracted later.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customer_transaction_summary['LastTransactionDate'] = pd.to_datetime(
    customer_transaction_summary['LastTransactionDate']
)
# Inner join: only customers that appear in both frames are kept.
merged_data = customers.merge(
    customer_transaction_summary,
    how='inner',
    on='CustomerID',
)


In [None]:
# Preview the first five rows of the merged frame (head() defaults to n=5).
merged_data.head()

In [None]:
# Add derived features
# Whole days between signup and the customer's most recent transaction.
# NOTE(review): this is negative whenever SignupDate is later than
# LastTransactionDate -- confirm whether that can occur in the data.
merged_data['CustomerTenureDays'] = (
    merged_data['LastTransactionDate']
    .sub(merged_data['SignupDate'])
    .dt.days
)


In [None]:
# Select features for clustering
# Four numeric per-customer columns: spend, average basket, quantity and tenure.
features = merged_data.loc[
    :,
    ['TotalValue', 'AverageTransactionValue', 'TotalQuantity', 'CustomerTenureDays'],
]

In [None]:
# Preview the clustering inputs with the notebook's rich table display,
# limited to the first five rows instead of printing the frame as plain text.
features.head(5)

In [None]:
# Step 3: Feature Scaling
# Standardise every feature column; the fitted scaler is kept in `scaler`
# and the transformed matrix in `scaled_features`.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)


In [None]:
# Step 4: K-Means Clustering
# Fit K-Means for 2 to 10 clusters and record, for each k, the Davies-Bouldin
# index and the silhouette score of the resulting labelling.
candidate_cluster_counts = range(2, 11)
db_scores = []
silhouette_scores = []
for n_clusters in candidate_cluster_counts:
    # n_init is set explicitly: the library default differs between
    # scikit-learn releases, so relying on it makes the scores (and the
    # chosen k) depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_features)
    db_score = davies_bouldin_score(scaled_features, cluster_labels)
    silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    db_scores.append(db_score)
    silhouette_scores.append(silhouette_avg)

# Optimal number of clusters based on DB Index (the k with the smallest score).
# Index the candidate range with the argmin position rather than adding a
# hard-coded offset, so the result stays correct if the range start changes.
optimal_clusters = candidate_cluster_counts[int(np.argmin(db_scores))]
print(f"Optimal number of clusters based on DB Index: {optimal_clusters}")

In [None]:
# Final K-Means model
# Fit on the scaled features with the selected cluster count and store each
# customer's label in merged_data['Cluster'].
# n_init is set explicitly because the library default differs between
# scikit-learn releases; pinning it keeps the labelling version-independent.
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
merged_data['Cluster'] = kmeans.fit_predict(scaled_features)


In [None]:
# Step 5: Visualizations
# Davies-Bouldin index for each candidate number of clusters (2 to 10).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 11), db_scores, marker='o', label='DB Index')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Davies-Bouldin Index')
ax.set_title('DB Index vs. Number of Clusters')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Silhouette score for each candidate number of clusters (2 to 10).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 11), silhouette_scores, marker='o', label='Silhouette Score')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score vs. Number of Clusters')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Visualizing clusters using PCA
# Project the scaled features onto two principal components and keep the
# coordinates on merged_data so they can be plotted (and are saved with it).
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)
for component_index, column_name in enumerate(('PCA1', 'PCA2')):
    merged_data[column_name] = reduced_features[:, component_index]

# Scatter of the 2-D projection, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_data,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='Set1',
    s=100,
    ax=ax,
)
ax.set_title('Customer Clusters (PCA Reduced)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Step 6: Report Clustering Metrics
# Recompute the Davies-Bouldin index from the labels stored on merged_data.
final_db_score = davies_bouldin_score(
    scaled_features,
    merged_data['Cluster'],
)
print(f"Final Davies-Bouldin Index for {optimal_clusters} clusters: {final_db_score:.4f}")

# Save final clustered data to CSV (all merged_data columns, no index column).
merged_data.to_csv(
    "Customer_Segmentation_Clusters.csv",
    index=False,
)