In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# download the dataset from Kaggle
!kaggle datasets download -d vjchoudhary7/customer-segmentation-tutorial-in-python -p ../Data

Dataset URL: https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python
License(s): other
Downloading customer-segmentation-tutorial-in-python.zip to ../Data
  0% 0.00/1.55k [00:00<?, ?B/s]
100% 1.55k/1.55k [00:00<00:00, 4.42MB/s]


In [3]:
# unzip the dataset
!unzip ../Data/customer-segmentation-tutorial-in-python.zip -d ../Data

Archive:  ../Data/customer-segmentation-tutorial-in-python.zip
  inflating: ../Data/Mall_Customers.csv  


In [5]:
data = pd.read_csv('../Data/Mall_Customers.csv')
data

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [8]:
#  Preprocessing
class Preprocessor :
    def __init__ (self, df):
        self.df = df.copy(deep=True)
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()

    def encode_categorical_features(self):
        self.df['Gender'] = self.encoder.fit_transform(self.df['Gender'])

    def scale_features(self):
        numerical_features = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
        self.df[numerical_features] = self.scaler.fit_transform(self.df[numerical_features])

    def drop_unnecessary_columns(self):
        self.df.drop('CustomerID', axis=1, inplace=True)

    def handle_missing_values (self) :
        numerical_features = self.df.select_dtypes(include=[np.number]).columns
        self.df[numerical_features] = self.df[numerical_features].fillna(self.df[numerical_features].mean())


    def transform (self) :
        self.handle_missing_values()
        self.encode_categorical_features()
        self.scale_features()
        self.drop_unnecessary_columns()
        return self.df

In [10]:
preprocessor = Preprocessor(data)
processed_df = preprocessor.transform()
processed_df

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1.128152,-1.424569,-1.738999,-0.434801
1,1.128152,-1.281035,-1.738999,1.195704
2,-0.886405,-1.352802,-1.700830,-1.715913
3,-0.886405,-1.137502,-1.700830,1.040418
4,-0.886405,-0.563369,-1.662660,-0.395980
...,...,...,...,...
195,-0.886405,-0.276302,2.268791,1.118061
196,-0.886405,0.441365,2.497807,-0.861839
197,1.128152,-0.491602,2.497807,0.923953
198,1.128152,-0.491602,2.917671,-1.250054


In [11]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Gender                  200 non-null    float64
 1   Age                     200 non-null    float64
 2   Annual Income (k$)      200 non-null    float64
 3   Spending Score (1-100)  200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [21]:
# training the model
model = KMeans(n_clusters=5, random_state=0)
clusters = model.fit_predict(processed_df)

In [22]:
# model evaluation
sil_score = silhouette_score(processed_df, clusters)
print("Silhouette Score:", (sil_score + 1) / 2)

Silhouette Score: 0.6589034856651037
