<a href="https://colab.research.google.com/github/ithsirs/PRODIGY_ML_02/blob/main/PRODIGY_ML_02_FInal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2

Create a K-means clustering algorithm to group customers of a retail store based on their purchase history.



Dataset: https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python




In [1]:

!pip install -q plotly ipywidgets joblib


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.5/1.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m20.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import kagglehub
import shutil
import os


# Folder that will receive a copy of the dataset files.
destination_folder = r"/content/sample_data"  # Change this path to destination folder

# Download the dataset; kagglehub returns the local directory holding the files.
path = kagglehub.dataset_download("vjchoudhary7/customer-segmentation-tutorial-in-python")

# Make sure the target folder exists before copying anything into it.
os.makedirs(destination_folder, exist_ok=True)

# Copy (not move) every top-level entry of the download directory:
# plain files via copy2, sub-directories recursively via copytree.
for entry_name in os.listdir(path):
    source_item = os.path.join(path, entry_name)
    target_item = os.path.join(destination_folder, entry_name)
    if not os.path.isdir(source_item):
        shutil.copy2(source_item, target_item)
        continue
    shutil.copytree(source_item, target_item, dirs_exist_ok=True)

print("Dataset copied to:", destination_folder)

Downloading from https://www.kaggle.com/api/v1/datasets/download/vjchoudhary7/customer-segmentation-tutorial-in-python?dataset_version_number=1...


100%|██████████| 1.55k/1.55k [00:00<00:00, 2.10MB/s]

Extracting files...
Dataset copied to: /content/sample_data





In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from ipywidgets import interact, IntSlider
import joblib
from google.colab import output
# Enable Colab's custom widget manager before any widgets are displayed.
# NOTE(review): presumably needed for the ipywidgets slider / plotly output used
# later in this notebook — confirm it is still required on current Colab runtimes.
output.enable_custom_widget_manager()

In [4]:
# Read the mall-customers CSV that the download cell copied into sample_data.
customers_csv = '/content/sample_data/Mall_Customers.csv'
df = pd.read_csv(customers_csv)

In [5]:
# Replace the Gender column with integer codes (Male=1, Female=0).
le = LabelEncoder()
le.fit(df['Gender'])
df['Gender'] = le.transform(df['Gender'])

In [6]:
# Derived features:
#   Income_per_Age - annual income divided by (age + 1)
#   Age_Group      - 0 for ages in (0, 25], 1 for (25, 45], 2 for (45, 100]
age_bin_edges = [0, 25, 45, 100]
age_bin_labels = [0, 1, 2]

df['Income_per_Age'] = df['Annual Income (k$)'].div(df['Age'].add(1))
df['Age_Group'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels).astype(int)


In [7]:
# Columns fed into scaling, PCA and clustering (order matters: new customers
# must be presented with the same columns in the same order).
features = [
    'Gender',
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
    'Income_per_Age',
    'Age_Group',
]
X = df.loc[:, features]

In [8]:
# Standardize the feature matrix; the fitted scaler is kept so it can be
# saved and reused when scoring new customers.
scaler = StandardScaler().fit(X)
X_scale = scaler.transform(X)

In [9]:
# Project the standardized features onto 3 principal components. The three
# columns of X_pca are what K-means is fitted on and what the 3D scatter plot
# uses as its x/y/z axes; the fitted `pca` object is saved for scoring later.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scale)

In [10]:
# Elbow Method
def plot_elbow(k_values=range(2, 11), data=None):
    """Plot inertia and silhouette score against the number of clusters.

    For every k in ``k_values`` a K-means model (k-means++ init,
    ``random_state=42``) is fitted on ``data``; its inertia and silhouette
    score are collected and shown as two plotly line charts.

    Parameters
    ----------
    k_values : iterable of int, default ``range(2, 11)``
        Cluster counts to evaluate. Each value must be at least 2 and smaller
        than the number of samples, otherwise ``silhouette_score`` raises.
    data : array-like of shape (n_samples, n_features), optional
        Points to cluster. When omitted, the notebook-level ``X_pca``
        projection is used, which reproduces the original behaviour.

    Returns
    -------
    None
        The two figures are displayed as a side effect.
    """
    if data is None:
        # Default to the PCA projection computed earlier in the notebook.
        data = X_pca

    # Materialise once so generators can be iterated and also plotted.
    k_values = list(k_values)

    distortions = []
    silhouette_scores = []
    for k in k_values:
        # init is spelled out to match run_final_kmeans; same seed so the
        # curves are reproducible across runs.
        km = KMeans(n_clusters=k, init='k-means++', random_state=42)
        km.fit(data)
        distortions.append(km.inertia_)
        silhouette_scores.append(silhouette_score(data, km.labels_))

    elbow_df = pd.DataFrame({
        'Clusters': k_values,
        'Inertia': distortions,
        'Silhouette': silhouette_scores,
    })
    fig1 = px.line(elbow_df, x='Clusters', y='Inertia', title='Elbow Method (Inertia)', markers=True)
    fig2 = px.line(elbow_df, x='Clusters', y='Silhouette', title='Silhouette Score', markers=True)
    fig1.show()
    fig2.show()


In [19]:
def run_final_kmeans(n_clusters=5):
    """Fit K-means on the PCA projection, save the artifacts, and plot the result.

    Side effects:
      * writes the cluster label of every row into the notebook-level
        ``df['Cluster']`` column;
      * dumps the fitted K-means model, the scaler and the PCA object with joblib;
      * writes the labelled dataframe to a CSV file;
      * displays a 3D and a 2D plotly scatter of the PCA components.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters; also embedded in the model and CSV file names.
    """
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    df['Cluster'] = kmeans.fit_predict(X_pca)

    # Persist everything needed to score a new customer later on.
    for artifact, target in (
        (kmeans, f'kmeans_pca_model_k{n_clusters}.pkl'),
        (scaler, 'scaler.pkl'),
        (pca, 'pca.pkl'),
    ):
        joblib.dump(artifact, target)
    df.to_csv(f'clustered_customers_k{n_clusters}.csv', index=False)

    print(f"Model saved as 'kmeans_pca_model_k{n_clusters}.pkl'")
    print(f"Data saved as 'clustered_customers_k{n_clusters}.csv'")

    # Cluster ids as strings so plotly treats them as discrete categories.
    cluster_labels = df['Cluster'].astype(str)
    hover_names = ['CustomerID', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

    # 3D view: first three principal components.
    fig_3d = px.scatter_3d(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        z=X_pca[:, 2],
        color=cluster_labels,
        symbol=df['Gender'],
        color_discrete_sequence=px.colors.qualitative.Set1,
        title='3D PCA Clustering',
        hover_data=[df[name] for name in hover_names],
    )
    fig_3d.show()

    # 2D view: first two principal components.
    fig_2d = px.scatter(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        color=cluster_labels,
        symbol=df['Gender'],
        color_discrete_sequence=px.colors.qualitative.Set2,
        title='2D PCA Clustering',
        hover_data=[df[name] for name in hover_names],
    )
    fig_2d.show()

    # Author's note: comment out the 3D and 2D plot sections before running in Colab.
    # (The original note continued with "In github, this" and was cut off mid-sentence.)



In [13]:
def predict_new_customer(gender, age, income, spending, n_clusters=5):
    """Print the cluster a new customer is assigned to by a saved K-means model.

    The customer's raw attributes are expanded into the same six features, in
    the same column order, that the training cells build. They are then passed
    through the saved scaler and PCA before the saved K-means model predicts
    the cluster.

    Parameters
    ----------
    gender : int
        Encoded gender (0 = Female, 1 = Male, as in the label-encoding cell).
    age : int or float
        Customer age in years.
    income : int or float
        Annual income in k$.
    spending : int or float
        Spending score (1-100).
    n_clusters : int, default 5
        Selects which saved model to load: ``kmeans_pca_model_k{n_clusters}.pkl``,
        the file name used by ``run_final_kmeans``. Previously the k=5 model was
        always loaded, even when the clustering had been run with another k.

    Raises
    ------
    FileNotFoundError
        If no model has been saved for ``n_clusters`` yet.
    """
    model_path = f'kmeans_pca_model_k{n_clusters}.pkl'
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"'{model_path}' not found - run run_final_kmeans(n_clusters={n_clusters}) first."
        )

    new_df = pd.DataFrame({
        'Gender': [gender],
        'Age': [age],
        'Annual Income (k$)': [income],
        'Spending Score (1-100)': [spending],
        'Income_per_Age': [income / (age + 1)],
        # Same right-inclusive age bands as the training bins [0, 25, 45, 100].
        'Age_Group': [0 if age <= 25 else (1 if age <= 45 else 2)]
    })

    # joblib.load unpickles arbitrary objects: only load files written by this notebook.
    scaler = joblib.load('scaler.pkl')
    pca = joblib.load('pca.pkl')
    kmeans = joblib.load(model_path)

    new_scaled = scaler.transform(new_df)
    new_pca = pca.transform(new_scaled)
    cluster = kmeans.predict(new_pca)[0]
    print(f"This customer belongs to Cluster: {cluster}")


In [20]:
# Slider for the number of clusters (2-10, starting at 5). Every slider change
# re-runs run_final_kmeans, which rewrites df['Cluster'] and the saved files.
# NOTE(review): later cells therefore depend on the slider's last position.
cluster_slider = IntSlider(min=2, max=10, step=1, value=5)
interact(run_final_kmeans, n_clusters=cluster_slider)

interactive(children=(IntSlider(value=5, description='n_clusters', max=10, min=2), Output()), _dom_classes=('w…

In [16]:
# Example: Female (0), 20 years old, annual income of 90 (k$), spending score of 100
predict_new_customer(gender=0, age=20, income=90, spending=100)


This customer belongs to Cluster: 3


In [17]:
# Show the inertia (elbow) and silhouette curves for k = 2..10 to help judge
# how many clusters the data supports.
plot_elbow()


In [18]:
    # Per-cluster mean / min / max / count of the main numeric columns.
    # Requires df['Cluster'], which run_final_kmeans writes.
    # NOTE(review): this cell is uniformly indented; IPython tolerates that,
    # but plain Python would not - consider dedenting.
    summary_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Income_per_Age']
    summary_stats = ['mean', 'min', 'max', 'count']
    summary = df.groupby('Cluster')[summary_columns].agg(summary_stats)
    print("Cluster-wise Summary Statistics:\n")
    display(summary)

Cluster-wise Summary Statistics:



Unnamed: 0_level_0,Age,Age,Age,Age,Annual Income (k$),Annual Income (k$),Annual Income (k$),Annual Income (k$),Spending Score (1-100),Spending Score (1-100),Spending Score (1-100),Spending Score (1-100),Income_per_Age,Income_per_Age,Income_per_Age,Income_per_Age
Unnamed: 0_level_1,mean,min,max,count,mean,min,max,count,mean,min,max,count,mean,min,max,count
Cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,56.323529,37,70,34,54.470588,19,98,34,35.647059,3,60,34,0.97817,0.279412,2.085106,34
1,51.162791,35,68,43,52.418605,18,101,43,36.883721,5,59,43,1.016034,0.338983,1.836364,43
2,26.771429,18,35,35,28.485714,15,48,35,65.857143,6,99,35,1.047141,0.527778,1.761905,35
3,31.347826,18,47,46,80.391304,54,126,46,58.978261,16,95,46,2.522797,1.641026,3.421053,46
4,30.380952,18,43,42,78.833333,46,137,42,52.952381,1,97,42,2.612198,1.384615,4.419355,42
