<a href="https://colab.research.google.com/github/iamnirajkc-git/KProtype/blob/main/Clustering_algorithm(Categorical_Variable).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Perfoming clustering of customers based on RFM (Recency, frequency, Monetary) analysis.

# Clustering was done on categorical variable 'sex' in the  dataset Retail1.csv.

## Loading required libraries

In [3]:
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Loading dataset

In [4]:
# Load the dataset
df = pd.read_csv('Retail1.csv')
print(df.head())

   Unnamed: 0 InvoiceNo StockCode                          Description  \
0           0    570715     21014        SWISS CHALET TREE DECORATION    
1           1    573167     21014        SWISS CHALET TREE DECORATION    
2           2    570715     35967   FOLK ART METAL STAR T-LIGHT HOLDER   
3           3    570715     21824  PAINTED METAL STAR WITH HOLLY BELLS   
4           4    570715     22065       CHRISTMAS PUDDING TRINKET POT    

   Quantity     InvoiceDate  UnitPrice  CustomerID         Country   sex  
0        24  10/12/11 10:23       0.29      118287  United Kingdom  male  
1        24   10/28/11 9:29       0.29      118287  United Kingdom  male  
2        36  10/12/11 10:23       0.38      118287  United Kingdom  male  
3        24  10/12/11 10:23       0.39      118287  United Kingdom  male  
4        48  10/12/11 10:23       0.39      118287  United Kingdom  male  


In [5]:
#checking quantity and priceless than zero
print((df['Quantity'] <= 0).any())
print((df['UnitPrice'] <= 0).any())


True
True


In [6]:
#change into category
df['sex'] = df['sex'].astype('category')
#summary statistics of column Quantity and Unit Price
print(df[["Quantity", "UnitPrice"]].describe())
#Filtering Quantity and Unit Price wgreater than O.
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]

            Quantity      UnitPrice
count  406829.000000  406829.000000
mean       12.061303       3.460471
std       248.693370      69.315162
min    -80995.000000       0.000000
25%         2.000000       1.250000
50%         5.000000       1.950000
75%        12.000000       3.750000
max     80995.000000   38970.000000


In [7]:
# separate date, month, year, daysOfWeek column from InvoiceDate
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Date'] = df['InvoiceDate'].dt.date
df['Month'] = df['InvoiceDate'].dt.month
df['Year'] = df['InvoiceDate'].dt.year
df['DayOfWeek'] = df['InvoiceDate'].dt.day_name()

In [8]:
# Convert the columns to the 'category' data type
df['Month'] = df['Month'].astype('category')
df['Year'] = df['Year'].astype('category')
df['DayOfWeek'] = df['DayOfWeek'].astype('category')

In [9]:
# Create the 'Total Sales' column
df['Total Sales'] = df['Quantity'] * df['UnitPrice']
df = df[(df['Total Sales'] > 0)]
CountrySales = pd.DataFrame(df.groupby('Country')['Total Sales'].sum()).reset_index()

In [10]:
# Total Sales By Month
sales = df.groupby(["Year","Month"])["Total Sales"].sum().reset_index()
sales  = sales[(sales['Total Sales'] > 0)]
import calendar
sales['Month'] = sales['Month'].apply(lambda x: calendar.month_abbr[x])
sales['Month'] = sales['Month'].astype(str)+ '' + sales ['Year'].astype(str)
sales = sales.drop("Year", axis=1)

In [11]:
#RFA Analysis
#Total Sales(Monetary)
df['CustomerID'] = df['CustomerID'].astype(str)
rfm_ds_m = df.groupby('CustomerID')['Total Sales'].sum()
rfm_ds_m.reset_index()
rfm_ds_m.columns = ['CustomerID', 'Total Amount']

In [12]:
#frequency of purchase
df['CustomerID'] = df['CustomerID'].astype(str)
rfm_ds_f = df.groupby('CustomerID')['InvoiceNo'].count()
rfm_ds_f = rfm_ds_f.reset_index()
rfm_ds_f.columns = ['CustomerID', 'Frequency']

In [13]:
#Recent Purchase
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'],format='%d-%m-%Y %H:%M')
max_date = max(df['InvoiceDate'])
df['Diff'] = max_date - df['InvoiceDate']
rfm_ds_p = df.groupby('CustomerID')['Diff'].min()
rfm_ds_p = rfm_ds_p.reset_index()
rfm_ds_p.columns = ['CustomerID','Diff']
rfm_ds_p['Diff'] = rfm_ds_p['Diff'].dt.days

In [14]:
# Group by 'CustomerID' and aggregate 'Sex' as a list
grouped_data = df.groupby('CustomerID')['sex'].unique().reset_index()
new_sex = pd.DataFrame({'CustomerID': grouped_data['CustomerID'], 'sex': grouped_data['sex'].str[0]})

In [15]:
rfm_ds_final = pd.merge(rfm_ds_m,rfm_ds_f,on='CustomerID',how='inner')
rfm_ds_final = pd.merge(rfm_ds_final,rfm_ds_p,on='CustomerID',how='inner')
rfm_ds_final = pd.merge(rfm_ds_final,new_sex,on='CustomerID',how='inner')
rfm_ds_final.columns = ['CustomerID','Amount', 'Frequency', 'Recency', 'sex']
Cus_ID = rfm_ds_final['CustomerID']

In [16]:
# Encode the 'sex' column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
sex=rfm_ds_final['sex']
rfm_ds_final['sex_encoded'] = encoder.fit_transform(rfm_ds_final['sex'])
#Changing into categorical
rfm_ds_final['sex_encoded'] = rfm_ds_final['sex_encoded'].astype('category')
# Access the encoded 'sex' column
sex_encoded = rfm_ds_final['sex_encoded']
sex_encoded.head()

0    1
1    1
2    0
3    1
4    1
Name: sex_encoded, dtype: category
Categories (2, int64): [0, 1]

# Handling outliers

In [17]:

# Handling Outliers
Q1 = rfm_ds_final.Amount.quantile(0.25)
Q3 = rfm_ds_final.Amount.quantile(0.75)
IQR = Q3 - Q1
rfm_ds_final = rfm_ds_final[(rfm_ds_final.Amount >= Q1 - 1.5*IQR) & (rfm_ds_final.Amount <= Q3 + 1.5*IQR)]

Q1 = rfm_ds_final.Recency.quantile(0.05)
Q3 = rfm_ds_final.Recency.quantile(0.95)
IQR = Q3 - Q1
rfm_ds_final = rfm_ds_final[(rfm_ds_final.Recency >= Q1 - 1.5*IQR) & (rfm_ds_final.Recency <= Q3 + 1.5*IQR)]

Q1 = rfm_ds_final.Frequency.quantile(0.05)
Q3 = rfm_ds_final.Frequency.quantile(0.95)
IQR = Q3 - Q1
rfm_ds_final = rfm_ds_final[(rfm_ds_final.Frequency >= Q1 - 1.5*IQR) & (rfm_ds_final.Frequency <= Q3 + 1.5*IQR)]
print(rfm_ds_final.head())

X = rfm_ds_final[['Amount', 'Frequency', 'Recency', 'sex_encoded']]

  CustomerID  Amount  Frequency  Recency     sex sex_encoded
0    1112354  132.48          8      231    male           1
1    1114911  838.10         33        2    male           1
2    1118287  822.84         26       58  female           0
4     112347  389.12         30        1    male           1
5     112349  525.68         30       18  female           0


In [18]:
print(X.head())


   Amount  Frequency  Recency sex_encoded
0  132.48          8      231           1
1  838.10         33        2           1
2  822.84         26       58           0
4  389.12         30        1           1
5  525.68         30       18           0


In [19]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3909 entries, 0 to 4357
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Amount       3909 non-null   float64 
 1   Frequency    3909 non-null   int64   
 2   Recency      3909 non-null   int64   
 3   sex_encoded  3909 non-null   category
dtypes: category(1), float64(1), int64(2)
memory usage: 126.1 KB


## Performing heat map for checking correlation and multicollinearity.

In [20]:
#Heat Map for determing the correlation of variables.
import plotly.express as px

# Create a correlation matrix
correlation_matrix = X.corr()
# Generate heatmap using Plotly
fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns)

# Display the heatmap
fig.show()

Heat Map is not showing sex as it is categorical variable in the above dataframe, we will sacle the variable which will change it in the float and we will perform heat map again later.

In [21]:

#MinMaxScaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
rfm_ds_scaled = scaler.fit_transform(X)
rfm_ds_scaled = pd.DataFrame(rfm_ds_scaled)
rfm_ds_scaled.columns = ['Amount', 'Frequency', 'Recency', 'sex']
print(rfm_ds_scaled.head())

     Amount  Frequency   Recency  sex
0  0.035085   0.014463  0.619303  1.0
1  0.227398   0.066116  0.005362  1.0
2  0.223239   0.051653  0.155496  0.0
3  0.105031   0.059917  0.002681  1.0
4  0.142250   0.059917  0.048257  0.0


In [22]:
rfm_ds_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3909 entries, 0 to 3908
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Amount     3909 non-null   float64
 1   Frequency  3909 non-null   float64
 2   Recency    3909 non-null   float64
 3   sex        3909 non-null   float64
dtypes: float64(4)
memory usage: 122.3 KB


In [23]:
import plotly.express as px
import numpy as np

# Create a correlation matrix
correlation_matrix = rfm_ds_scaled.corr()

# Generate heatmap using Plotly
fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                zmin=-1,  # Set minimum value of color scale
                zmax=1)  # Set maximum value of color scale

# Add custom annotations to show correlation values
fig.update_layout(annotations=[
    dict(
        x=i,
        y=j,
        text=np.around(correlation_matrix.values[i][j], decimals=2),
        showarrow=False,
        font=dict(size=9),
    )
    for i in range(len(correlation_matrix.columns))
    for j in range(len(correlation_matrix.columns))
])

# Display the heatmap
fig.show()


In [24]:
!pip install kmodes



Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2


## Determing silhoute score for each clusters

In [25]:
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score
num_clusters = list(range(2, 9))
silhouette_avg = []
# calculate cost values for each number of clusters (2 to 8)
for k in num_clusters:
    try:
        kproto = KPrototypes(n_jobs=-1, n_clusters=k, init='Huang', random_state=42)
        kproto.fit_predict(rfm_ds_scaled, categorical=[3])
        cluster_labels = kproto.labels_
        silhouette_avg.append(silhouette_score(rfm_ds_scaled, cluster_labels))
        print('Cluster initiation: {}'.format(k))
    except:
        break

trace = go.Scatter(x=num_clusters, y=silhouette_avg, mode='lines+markers', name='Silhouette Score')
data = [trace]
layout = go.Layout(title='Silhouette Curve', xaxis=dict(title='Number of Clusters'), yaxis=dict(title='Score'))
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='silhouette.html')



Cluster initiation: 2
Cluster initiation: 3
Cluster initiation: 4
Cluster initiation: 5
Cluster initiation: 6
Cluster initiation: 7
Cluster initiation: 8


'silhouette.html'

In [26]:
fig.show()

As max silhoute score is only 0.56 which indicates that clusters are not properly seperated from each otherThe value of silhoute score ranges from 0 to 1. Here in the above graph cluster 5 have maxium silhoute score. We will perform PCA later for the dimensinal reduction.

# Lets see what will the optimal number of cluster, elbow curve shows.

In [27]:
num_clusters = list(range(2, 9))
cost_values = []
for k in num_clusters:
    kproto = KPrototypes(n_jobs=-1, n_clusters=k, init='Huang', random_state=42)
    kproto.fit_predict(rfm_ds_scaled, categorical=[3])
    cost_values.append(kproto.cost_)
    cluster_labels = kproto.labels_
    print('Cluster initiation: {}'.format(k))
trace = go.Scatter(x=num_clusters, y=cost_values, mode='lines+markers', name='Elbow curve')
data = [trace]
layout = go.Layout(title='Elbow curve', xaxis=dict(title='Number of Clusters'), yaxis=dict(title='Cost'))
fig1 = go.Figure(data=data, layout=layout)
fig1.show()

Cluster initiation: 2
Cluster initiation: 3
Cluster initiation: 4
Cluster initiation: 5
Cluster initiation: 6
Cluster initiation: 7
Cluster initiation: 8


# Based on the silhoutte and elbow method, the optimum number of cluster is 5. We will cluster the dataset using Kprototype clustering algorithm which is effective for categorical variables as K-means clustering is only effective for numerical continuous variable.


In [28]:
# we set the number of clusters to 5
kproto = KPrototypes(n_jobs=-1, n_clusters=5 , init='Huang', n_init = 25, random_state=42)#n_init -number of centroid initialization
kproto.fit_predict(rfm_ds_scaled, categorical=[3])
# store cluster labels
cluster_labels = kproto.labels_
# add clusters to dataframe
#rfm_ds_scaled["cluster"] = cluster_labels

#Adding customer in the dataframe
#rfm_ds_scaled['customer'] = Cus_ID
#print(rfm_ds_scaled.head())


As metioned earlier, in our dataset the silhoute score in the clusters were not very high which indicate that our clusters are not seperated proeply so we will perform PCA(Pricipal component analysis to reduce the dimensions of the dataset). Lets see how it performs

In [None]:

#Cluster Exploration
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_df = pca.fit_transform(rfm_ds_scaled)



In [43]:
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

[0.63721827 0.23621878]


Principal component 1 expresses 70% and PC2 explained 22% variance of our dataset.

In [44]:
print(rfm_ds_scaled.head())

     Amount  Frequency   Recency  sex  cluster customer
0  0.035085   0.014463  0.619303  1.0        1  1112354
1  0.227398   0.066116  0.005362  1.0        0  1114911
2  0.223239   0.051653  0.155496  0.0        4  1118287
3  0.105031   0.059917  0.002681  1.0        0   112346
4  0.142250   0.059917  0.048257  0.0        4   112347


In [45]:
import numpy as np
np.set_printoptions(suppress=True)
print(pca.components_)

pca_dataframe = pd.DataFrame(pca_df)
print(pca_dataframe.head())

[[ 0.01578308  0.0035451   0.00633146 -0.99984911]
 [ 0.52458489  0.27786194 -0.80472731  0.00417015]]
          0         1
0 -0.492158 -0.416648
1 -0.492827  0.192644
2  0.507856  0.061457
3 -0.494797  0.128887
4  0.505928  0.107565


In [46]:
print(pca_dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3909 entries, 0 to 3908
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3909 non-null   float64
 1   1       3909 non-null   float64
dtypes: float64(2)
memory usage: 61.2 KB
None


In [47]:
#Plotting PCA
fig = px.scatter(x=pca_df[:, 0], y=pca_df[:, 1], color=cluster_labels)
fig.show()


In [48]:
# add clusters to dataframe
rfm_ds_scaled["cluster"] = cluster_labels

#Adding customer in the dataframe
rfm_ds_scaled['customer'] = Cus_ID
print(rfm_ds_scaled.head())

     Amount  Frequency   Recency  sex  cluster customer
0  0.035085   0.014463  0.619303  1.0        1  1112354
1  0.227398   0.066116  0.005362  1.0        0  1114911
2  0.223239   0.051653  0.155496  0.0        4  1118287
3  0.105031   0.059917  0.002681  1.0        0   112346
4  0.142250   0.059917  0.048257  0.0        4   112347


In [49]:
# size of each cluster
print(rfm_ds_scaled["cluster"].value_counts())
print(rfm_ds_scaled["sex"].value_counts())

0    1243
4    1054
3     598
1     522
2     492
Name: cluster, dtype: int64
1.0    1990
0.0    1919
Name: sex, dtype: int64


In [50]:
#Clusters for numeric variables
fig = px.scatter_matrix(rfm_ds_scaled,
    dimensions=['Amount', 'Frequency','Recency','customer','sex'],
    color="cluster",
    hover_data=['customer','sex'])
fig.show()

Cluster3 is has highest amount, frequency and low recency value as compared to others.

In [51]:
#Clusters for categorical  variable
fig = px.histogram(rfm_ds_scaled, x="sex", color="cluster", barmode="group")
fig.show()

In [52]:
#look at stastitics
rfm_ds_scaled[rfm_ds_scaled["cluster"]==0].describe()


Unnamed: 0,Amount,Frequency,Recency,sex,cluster
count,1243.0,1243.0,1243.0,1243.0,1243.0
mean,0.195429,0.102067,0.127023,1.0,0.0
std,0.136984,0.096069,0.099586,0.0,0.0
min,0.003066,0.0,0.0,1.0,0.0
25%,0.084711,0.035124,0.045576,1.0,0.0
50%,0.163735,0.07438,0.101877,1.0,0.0
75%,0.284641,0.140496,0.190349,1.0,0.0
max,0.601263,0.679752,0.442359,1.0,0.0


In [53]:
rfm_ds_scaled[rfm_ds_scaled["cluster"]==4].describe()

Unnamed: 0,Amount,Frequency,Recency,sex,cluster
count,1054.0,1054.0,1054.0,1054.0,1054.0
mean,0.171123,0.088467,0.136096,0.0,4.0
std,0.11108,0.077515,0.10705,0.0,0.0
min,0.000668,0.0,0.0,0.0,4.0
25%,0.081148,0.033058,0.048257,0.0,4.0
50%,0.152439,0.066116,0.112601,0.0,4.0
75%,0.246499,0.119835,0.19571,0.0,4.0
max,0.499895,0.471074,0.439678,0.0,4.0


In [54]:
rfm_ds_scaled[rfm_ds_scaled["cluster"]==3].describe()

Unnamed: 0,Amount,Frequency,Recency,sex,cluster
count,598.0,598.0,598.0,598.0,598.0
mean,0.658505,0.324035,0.093641,0.376254,3.0
std,0.167983,0.189446,0.107448,0.484851,0.0
min,0.262652,0.0,0.0,0.0,3.0
25%,0.532392,0.202479,0.021448,0.0,3.0
50%,0.644066,0.272727,0.058981,0.0,3.0
75%,0.780216,0.41064,0.133378,1.0,3.0
max,1.0,1.0,0.705094,1.0,3.0


In [55]:
#Cluster3 is the best cluster among all.