In [None]:
# Gender vs spending
# Gender vs income vs spending

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.cluster.hierarchy as sch

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Apply seaborn's white-grid look to the figures drawn below
sns.set_style(style='whitegrid')

In [None]:
# Load the dataset from the user-provided file
!git clone "https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset"
df = pd.read_csv('/content/21-Days-21-Projects-Dataset/Datasets/Mall_Customers.csv')
df.head()

In [None]:
# Count of missing entries in each column
df.isna().sum()

In [None]:
# Drop the CustomerID column before analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# One histogram (with KDE overlay) per numeric feature, split by gender
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Distributions of Customer Features', fontsize=16)

# (column to plot, title of its panel), in left-to-right order
panels = [
    ('Age', 'Age Distribution'),
    ('Annual Income (k$)', 'Annual Income Distribution'),
    ('Spending Score (1-100)', 'Spending Score Distribution'),
]
for panel_ax, (column, panel_title) in zip(axes, panels):
    hist_ax = sns.histplot(data=df, x=column, hue='Gender', bins=20, kde=True, ax=panel_ax)
    hist_ax.set_title(panel_title)

plt.show()

In [None]:
# Pairwise relationships between the numeric features, coloured by gender
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
sns.pairplot(data=df, vars=numeric_features, hue='Gender', diag_kind='kde')
# y slightly above 1 places the title above the top row of panels
plt.suptitle('Pair Plot of Customer Features', y=1.02)
plt.show()

In [None]:
# Interactive 3D scatter of the three numeric features, coloured by gender
scatter_3d_options = dict(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    z='Age',
    color='Gender',
    title='3D View of Customer Data',
)
fig = px.scatter_3d(df, **scatter_3d_options)
fig.show()

In [None]:
# --- Segmentation on annual income and spending score ---

# 1. Pick the two features and standardise them
income_spend_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X1 = df[income_spend_cols]
scaler1 = StandardScaler()
X1_scaled = scaler1.fit_transform(X1)

# 2. Elbow method: record the K-Means inertia (WCSS) for k = 1..10
k_values = range(1, 11)
wcss1 = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X1_scaled)
    wcss1.append(kmeans.inertia_)

# 3. Elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, wcss1, marker='o', linestyle='--')
elbow_ax.set_title('Elbow Method for Income-Spending Segmentation')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('WCSS (Inertia)')
elbow_ax.set_xticks(k_values)
plt.show()

# 4. Final model with five clusters; the labels are stored on the frame
kmeans1 = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
df['Income_Cluster'] = kmeans1.fit_predict(X1_scaled)

# 5. Scatter of the resulting segments in the original (unscaled) units
segment_fig, segment_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Income_Cluster',
    palette='viridis',
    s=100,
    alpha=0.8,
    edgecolor='black',
    ax=segment_ax,
)
segment_ax.set_title('Customer Segments by Income and Spending')
segment_ax.legend(title='Cluster')
plt.show()

# 6. Per-cluster feature means plus the number of customers in each cluster
profile_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
cluster_profiles1 = df.groupby('Income_Cluster')[profile_cols].mean().round(2)
cluster_profiles1['Size'] = df['Income_Cluster'].value_counts()
print("--- Income-Based Cluster Profiles ---")
cluster_profiles1

In [None]:
# --- Segmentation on age and spending score ---

# 1. Pick the two features and standardise them
age_spend_cols = ['Age', 'Spending Score (1-100)']
X2 = df[age_spend_cols]
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# 2. Elbow method: record the K-Means inertia (WCSS) for k = 1..10
k_values = range(1, 11)
wcss2 = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X2_scaled)
    wcss2.append(kmeans.inertia_)

# 3. Elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, wcss2, marker='o', linestyle='--')
elbow_ax.set_title('Elbow Method for Age-Spending Segmentation')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('WCSS (Inertia)')
elbow_ax.set_xticks(k_values)
plt.show()

# 4. Final age-segmentation model with four clusters; labels go on the frame
kmeans2 = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
df['Age_Cluster'] = kmeans2.fit_predict(X2_scaled)

# 5. Scatter of the resulting segments in the original (unscaled) units
segment_fig, segment_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Age',
    y='Spending Score (1-100)',
    hue='Age_Cluster',
    palette='magma',
    s=100,
    alpha=0.8,
    edgecolor='black',
    ax=segment_ax,
)
segment_ax.set_title('Customer Segments by Age and Spending')
segment_ax.legend(title='Cluster')
plt.show()


Gender vs spending score:
In this cell we look at how customers scatter by gender and spending score. Gender is categorical, so it is one-hot encoded first; the encoded column and the spending score are then standardised before clustering.

In [None]:
# --- Segmentation on gender and spending score ---

# 1. One-hot encode 'Gender' (first level dropped), then standardise every column
gender_spend = df[['Gender', 'Spending Score (1-100)']]
X2 = pd.get_dummies(gender_spend, columns=['Gender'], drop_first=True)
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# 2. Elbow method: record the K-Means inertia (WCSS) for k = 1..10
k_values = range(1, 11)
wcss2 = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X2_scaled)
    wcss2.append(kmeans.inertia_)

# 3. Elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, wcss2, marker='o', linestyle='--')
elbow_ax.set_title('Elbow Method for Gender-Spending Segmentation')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('WCSS (Inertia)')
elbow_ax.set_xticks(k_values)
plt.show()

# 4. Number of clusters read off the elbow plot (change it if the curve suggests otherwise)
optimal_k2 = 4

# 5. Final model; the labels are stored on the frame
kmeans2 = KMeans(n_clusters=optimal_k2, init='k-means++', n_init=10, random_state=42)
df['Gender_Spending_Cluster'] = kmeans2.fit_predict(X2_scaled)

# 6. Scatter of the segments: gender on the x axis, spending score on the y axis
segment_fig, segment_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Gender',
    y='Spending Score (1-100)',
    hue='Gender_Spending_Cluster',
    palette='viridis',
    s=100,
    alpha=0.8,
    edgecolor='black',
    ax=segment_ax,
)
segment_ax.set_title('Customer Segments by Gender and Spending')
segment_ax.legend(title='Cluster')
plt.show()

# 7. Per-cluster feature means plus the number of customers in each cluster
profile_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
cluster_profiles2 = df.groupby('Gender_Spending_Cluster')[profile_cols].mean().round(2)
cluster_profiles2['Size'] = df['Gender_Spending_Cluster'].value_counts()
print("--- Gender-Spending Based Cluster Profiles ---")
display(cluster_profiles2)

In this cell we look at gender vs income vs spending.
Gender is one-hot encoded, and the encoded column, income and spending are standardised.
Each color marks a different cluster in 3-dimensional space, and the elbow method is used to choose a good number of clusters.

In [None]:
# --- Segmentation on gender, annual income and spending score ---

# 1. Select and scale the features
X3 = df[['Gender', 'Annual Income (k$)', 'Spending Score (1-100)']]
X3 = pd.get_dummies(X3, columns=['Gender'], drop_first=True) # One-hot encode 'Gender'
scaler3 = StandardScaler()
X3_scaled = scaler3.fit_transform(X3)

# 2. Implement the Elbow Method: record the K-Means inertia (WCSS) for k = 1..10
wcss3 = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
    kmeans.fit(X3_scaled)
    wcss3.append(kmeans.inertia_)

# 3. Plot the Elbow Curve
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss3, marker='o', linestyle='--')
plt.title('Elbow Method for Gender-Income-Spending Segmentation')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.xticks(range(1, 11))
plt.show()

# Determine the optimal number of clusters from the elbow plot
# Based on the plot, let's choose k=5 (adjust if the plot suggests a different number)
optimal_k3 = 5

# Build and fit the final model; the labels are stored on the frame
kmeans3 = KMeans(n_clusters=optimal_k3, init='k-means++', random_state=42, n_init=10)
df['Gender_Income_Spending_Cluster'] = kmeans3.fit_predict(X3_scaled)

# Visualize the clusters (using a 3D scatter plot).
# The cluster ids are integers, which Plotly Express colours with a continuous
# colour bar. They are categorical labels, so the plot is drawn from a copy of the
# frame carrying them as strings; this gives one discrete colour and one legend
# entry per cluster. `df` itself is left untouched.
segments_3d = df.assign(Cluster=df['Gender_Income_Spending_Cluster'].astype(str))
fig = px.scatter_3d(segments_3d,
                    x='Annual Income (k$)',
                    y='Spending Score (1-100)',
                    z='Age', # Using Age for the third dimension for visualization
                    color='Cluster',
                    # K-Means labels run 0..k-1; fix the legend order numerically
                    category_orders={'Cluster': [str(c) for c in range(optimal_k3)]},
                    title='3D View of Customer Segments by Gender, Income, and Spending')
fig.show()

# Quantitative Persona Analysis: per-cluster feature means plus cluster sizes
cluster_profiles3 = df.groupby('Gender_Income_Spending_Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean().round(2)
cluster_profiles3['Size'] = df['Gender_Income_Spending_Cluster'].value_counts()
print("--- Gender-Income-Spending Based Cluster Profiles ---")
display(cluster_profiles3)