# In [None]:
# clustering_analysis.ipynb
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Read the customer records from disk and preview the first rows.
source_csv = 'data/customer_data.csv'
df = pd.read_csv(source_csv)
first_rows = df.head()
print("Initial Data Overview:\n", first_rows)

# Print a quick structural / quality report, one labelled section at a time.
# Each value is computed immediately before it is printed.
dimensions = df.shape
print("Shape:", dimensions)

nulls_per_column = df.isnull().sum()
print("Missing Values:\n", nulls_per_column)

duplicate_row_count = df.duplicated().sum()
print("Duplicates:", duplicate_row_count)

column_types = df.dtypes
print("Data Types:\n", column_types)

numeric_summary = df.describe()
print("Summary:\n", numeric_summary)

# Select the three clustering features and standardize them with
# StandardScaler; X_scaled is what every later step (KMeans, PCA) consumes.
feature_columns = ['Age', 'Annual Income', 'Spending Score']
X = df[feature_columns]
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Elbow Method
# Fit KMeans for k = 1..10 on the scaled features and record each model's
# inertia (within-cluster sum of squares). Plotting inertia against k shows
# where adding more clusters stops paying off.
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# savefig does not create missing directories, so make sure the target
# folder exists before the first artifact is written to it.
Path('output').mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(8, 4))
plt.plot(k_values, wcss, marker='o')
plt.title("Elbow Method")
plt.xlabel("No. of Clusters")
plt.ylabel("WCSS")
# Save before show(): some backends clear the figure once it is displayed.
plt.savefig('output/elbow_plot.png')
plt.show()

# Final KMeans model and its Silhouette Score
best_k = 4

# Fit the model exactly once, with a fixed seed. The silhouette score is then
# computed on the very labels that get saved below; scoring a second,
# unseeded KMeans fit would not be reproducible and could describe a
# different partition than the one written to disk.
kmeans = KMeans(n_clusters=best_k, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

score = silhouette_score(X_scaled, cluster_labels)
with open("output/silhouette_score.txt", "w", encoding="utf-8") as f:
    f.write(f"Silhouette Score for k={best_k}: {score:.4f}")

# Attach the cluster assignment to the customer table and persist it.
df['Cluster'] = cluster_labels
df.to_csv("output/clustered_customers.csv", index=False)

# Reduce the scaled features to two principal components so the clusters
# can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

# Store each component as its own column next to the cluster labels.
for component_index, column_name in enumerate(('PCA1', 'PCA2')):
    df[column_name] = pca_components[:, component_index]

plt.figure(figsize=(8, 5))
sns.scatterplot(
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='Set2',
    data=df,
)
plt.title("Customer Segmentation (PCA)")
plt.savefig("output/cluster_visuals.png")
plt.show()

# Pair Plot
# Pairwise scatter plots of the original (unscaled) features, coloured by
# the assigned cluster.
pairplot_columns = ['Age', 'Annual Income', 'Spending Score', 'Cluster']

# savefig does not create missing directories; without this the script would
# fail on its very last step when the folder is absent.
# NOTE(review): every other artifact is written under output/ — confirm that
# visuals/ is the intended location for this figure.
Path("visuals").mkdir(parents=True, exist_ok=True)

sns.pairplot(df[pairplot_columns], hue='Cluster', palette='Set2')
# pairplot creates its own figure, which is the current figure at this point.
plt.savefig("visuals/pairplot.png")
plt.show()
