# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
# Load the customer table; the first CSV column is used as the row index.
df = pd.read_csv(
    "customer_segmentation_data.csv",
    index_col=0,
)
df.head()  # preview of the first rows (only rendered when run interactively)

# Snapshot of the table as loaded, taken before `df` is de-duplicated,
# scaled and one-hot encoded further down.
data = df.copy()
# Numeric columns whose distributions are inspected below.
numerical_features = [
    'age',
    'income',
    'spending_score',
    'membership_years',
    'purchase_frequency',
    'last_purchase_amount',
]

# One histogram (with KDE overlay) per numeric column on a 3x2 grid,
# filled row by row.
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
for panel, column in zip(axes.flat, numerical_features):
    sns.histplot(data[column], bins=20, kde=True, ax=panel)
    panel.set_title(f'Distribution of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Pairwise relationships between the columns of the full table.
sns.pairplot(df)
plt.show()
# Categorical columns and the human-readable label used on each plot.
categorical_features = ['gender', 'preferred_category']
axis_labels = ['Gender', 'Preferred Category']

# Side-by-side count plots, one per categorical column.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for panel, column, label in zip(axes, categorical_features, axis_labels):
    sns.countplot(x=column, data=df, ax=panel)
    panel.set_title(f'Distribution of {label}')
    panel.set_xlabel(label)
    panel.set_ylabel('Count')

# Only the preferred-category panel gets tilted tick labels.
axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Pairwise correlation of the numeric columns, drawn as a heatmap.
numeric_columns = df.select_dtypes("number")
corr = numeric_columns.corr()

sns.heatmap(corr)
plt.title('Correlation Heatmap of Numeric Columns')
plt.show()
# Remove exact duplicate rows before any feature engineering.
df.drop_duplicates(inplace=True)

# Numeric columns that are standardised with StandardScaler.
columns_to_scale = [
    'age',
    'income',
    'spending_score',
    'membership_years',
    'purchase_frequency',
    'last_purchase_amount',
]
data_to_scale = df[columns_to_scale]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_to_scale)

# Build the scaled frame on df's own index so the column assignment below
# lines up row by row.
scaled_df = pd.DataFrame(scaled_data, columns=columns_to_scale, index=df.index)
df[columns_to_scale] = scaled_df

print(df.head())

# Number of distinct values in each object-typed column.
df.select_dtypes("object").nunique()

# One-hot encode the two categorical columns, dropping the first level of each.
df = pd.get_dummies(
    df,
    columns=['gender', 'preferred_category'],
    drop_first=True,
)
# Elbow method: fit K-means for k = 1..10 and record each model's inertia
# (within-cluster sum of squares).
cluster_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(df).inertia_
    for n_clusters in cluster_counts
]

plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Silhouette method: score the K-means partition for k = 2..10
# (the loop starts at 2 clusters).
candidate_ks = range(2, 11)
silhouette_scores = []
for n_clusters in candidate_ks:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(df)
    silhouette_scores.append(silhouette_score(df, kmeans.labels_))

plt.figure(figsize=(10, 6))
plt.plot(candidate_ks, silhouette_scores, marker='o', linestyle='--')
plt.title('Silhouette Score Method')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()
# Feature matrix shared by the clustering and the projection.  Helper columns
# left over from a previous run are excluded so that re-running this section
# never feeds 'cluster' / 'PCA1' / 'PCA2' back into the models.
cluster_features = df.drop(columns=['cluster', 'PCA1', 'PCA2'], errors='ignore')

# Final model: K-means with K=3 clusters.  (An earlier 4-cluster fit was
# removed: its labels were overwritten by this model before ever being used.)
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(cluster_features)

# Perform PCA for dimensionality reduction to 2 components; the projection is
# used only to draw the clusters in two dimensions.
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(cluster_features)

# Attach the cluster labels and PCA coordinates to the working frame.
df['cluster'] = labels
df['PCA1'] = pca_components[:, 0]
df['PCA2'] = pca_components[:, 1]

# Visualize clusters based on PCA components
plt.figure(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='cluster', data=df, palette='viridis')
plt.title('Clustered Customer Segmentation (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster', loc='upper right')
plt.show()
# Split the working frame by cluster id and report each subset's shape.
Label_0, Label_1, Label_2 = (
    df[df['cluster'] == cluster_id] for cluster_id in range(3)
)
for cluster_id, subset in enumerate((Label_0, Label_1, Label_2)):
    print(f"Label {cluster_id} shape is: {subset.shape}")
# `data` was copied from `df` before `df.drop_duplicates(...)` ran, so it can
# hold more rows than there are cluster labels; assigning `labels` directly
# would then fail with a length mismatch.  Apply the same de-duplication so
# rows and labels line up one-to-one.
# NOTE(review): assumes `data` is still the unmodified copy taken at load time.
data = data.drop_duplicates().copy()
data["Clusters"] = labels

# One frame of raw (unscaled) rows per cluster.
Segment1 = data.loc[data["Clusters"] == 0]
Segment2 = data.loc[data["Clusters"] == 1]
Segment3 = data.loc[data["Clusters"] == 2]

# Preview each segment (previously Segment2 was shown twice and Segment3
# not at all).
Segment1.head(2)
Segment2.head(2)
Segment3.head(2)
# Numeric columns only, with the cluster label carried along as the
# grouping key.
numeric_data = data.select_dtypes(include='number').assign(
    Clusters=data['Clusters']
)

# Per-cluster mean of every numeric column ...
cluster_profile = numeric_data.groupby('Clusters').mean()

# ... plus the number of rows in each cluster (aligned on the cluster id).
cluster_sizes = data['Clusters'].value_counts()
cluster_profile['size'] = cluster_sizes
# Box plot of a raw numeric column per cluster.  The title and y-label are
# derived from the column being plotted so they cannot drift apart (the income
# plot was previously labelled "Last Purchase Amount").
for column, label in (('age', 'Age'), ('income', 'Income')):
    plt.figure(figsize=(14, 6))
    sns.boxplot(x='Clusters', y=column, data=data, palette='viridis')
    plt.title(f'{label} Distribution by Cluster')
    plt.xlabel('Clusters')
    plt.ylabel(label)
    plt.show()
def _value_share_table(values: pd.Series) -> pd.DataFrame:
    """Return counts and percentage shares of the distinct values in *values*.

    Missing entries are replaced by the string 'No' before counting.

    Args:
        values: Column to summarise.

    Returns:
        DataFrame indexed by distinct value with a 'Counts' column and a
        'Percentage' column holding strings such as '12.5%' (rounded to one
        decimal place).
    """
    filled = values.fillna('No')
    counts = filled.value_counts()
    percentages = (
        filled.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
    )
    return pd.DataFrame({'Counts': counts, 'Percentage': percentages})


# One summary table per (segment, categorical column), built in the order
# preferred_category for segments 1-3, then gender for segments 1-3.
segment_tables = {
    (segment_name, column): _value_share_table(segment[column])
    for column in ('preferred_category', 'gender')
    for segment_name, segment in (
        ('Segment1', Segment1),
        ('Segment2', Segment2),
        ('Segment3', Segment3),
    )
}

# Leave the last table as the trailing expression so an interactive run still
# displays it.
segment_tables[('Segment3', 'gender')]
