# CUSTOMER SORTING BY K-MEANS AND HIERARCHICAL CLUSTERING

### Changing the notebook's working directory

In [None]:
# NOTE(review): hard-coded, machine-specific Windows path; this line relies on IPython's automagic `cd` and is not plain Python.
cd C:\Users\pooja\Downloads\MLProjects 

### Import required libraries for clustering

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly.express as px


import sklearn
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
# to remove warnings
import warnings 
warnings.filterwarnings('ignore')

### Importing the dataset

In [None]:
# Load the raw transactions from the Excel workbook in the working directory.
source_file = "Online Retail.xlsx"
df = pd.read_excel(source_file)
df.head()

### Data Preprocessing

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df_null = round(100*(df.isnull().sum())/len(df), 2)
df_null

In [None]:
df=df.dropna()

In [None]:
df.shape

In [None]:
# to check if data is of one country or multiple countries
df['Country'].unique()

In [None]:
#CHANGING THE DATATYPE OF CUSTOMER ID AS PER THE BUSINESS REQUIREMENT as it uniquely identifies id's and may contain characters
df['CustomerID']=df['CustomerID'].astype(str)

### FEATURE ENGINEERING

In [None]:
# Monetary value of each row: quantity multiplied by price per unit.
df['Amount'] = df['Quantity'].mul(df['UnitPrice'])
df.head()

In [None]:
# Monetary attribute: group the rows by 'CustomerID' and add up the
# 'Amount' column for each customer.
amount_by_customer = df.groupby('CustomerID')['Amount']
df1 = amount_by_customer.sum().reset_index()
df1

In [None]:
# New Attribute : Frequency
# count the number of unique invoice numbers for each customer
df2 = df.groupby('CustomerID')['InvoiceNo'].count().reset_index()
df2.head()

In [None]:
df2.rename(columns={'InvoiceNo': 'Frequency'}, inplace=True)
df2.head()

In [None]:
# Combine the monetary and frequency tables on CustomerID (inner join).
dfm = pd.merge(df1, df2, on='CustomerID', how='inner')
dfm.head()

In [None]:
# Convert InvoiceDate to datetimes using a day-month-year hour:minute layout.
invoice_date_format = '%d-%m-%Y %H:%M'
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format=invoice_date_format)
df.head()

In [None]:
# Time elapsed between the latest invoice date in the dataset and the
# invoice date of each record.
# Series.max() is vectorised; the builtin max() would iterate over every
# row in Python to reach the same value.
latest_invoice_date = df['InvoiceDate'].max()
df['Diff'] = latest_invoice_date - df['InvoiceDate']
df.head()

In [None]:
# Compute last transaction date to get the recency of customers 
# As each customer may have multiple invonces that has been purchased 
# So we need to create a indicator that tells us when he made the last transaction to find whether he is active or not
df3= df.groupby('CustomerID')['Diff'].min().reset_index()
df3.head()

In [None]:
# Extract number of days only

df3['Diff'] = df3['Diff'].dt.days
df3.head()

In [None]:
# Merge tha dataframes to get the final RFM dataframe

dfm = dfm.merge(df3, on='CustomerID')

dfm.rename(columns={'Diff': 'Recency'}, inplace=True)
dfm.head()

In [None]:
dfm.shape

In [None]:
# Box plots of the three RFM attributes, drawn side by side.
rfm_columns = ['Frequency', 'Amount', 'Recency']
fig = px.box(dfm[rfm_columns], boxmode='group')
fig.show()

In [None]:
# Removing (statistical) outliers, one attribute at a time.
def _drop_outliers(frame, column, lower_q=0.10, upper_q=0.90, whisker=1.5):
    """Return the rows of `frame` whose `column` value lies inside the fences.

    The fences sit `whisker` times the inter-quantile range below the
    `lower_q` quantile and above the `upper_q` quantile (both inclusive).
    The defaults use the 10th and 90th percentiles rather than the
    quartiles, so this is more lenient than the textbook 1.5 * IQR rule.
    """
    lower = frame[column].quantile(lower_q)
    upper = frame[column].quantile(upper_q)
    spread = upper - lower
    values = frame[column]
    keep = (values >= lower - whisker * spread) & (values <= upper + whisker * spread)
    return frame[keep]


# The filters are applied sequentially, so each attribute's quantiles are
# computed on the rows that survived the previous filter.
for column in ('Amount', 'Recency', 'Frequency'):
    dfm = _drop_outliers(dfm, column)

In [None]:
fig=px.box(dfm[['Frequency','Amount','Recency']],boxmode='group')
fig.show()

In [None]:
dfm.describe()

In [None]:
# Rescaling the attributes: keep only the three RFM columns.
feature_columns = ['Amount', 'Frequency', 'Recency']
dfm = dfm[feature_columns]

# Create the scaler, then fit it and transform the data in one step.
scaler = StandardScaler()
dfm_scaled = scaler.fit_transform(dfm)
dfm_scaled.shape

In [None]:
# Wrap the scaled array in a DataFrame carrying the attribute names.
dfm_scaled = pd.DataFrame(dfm_scaled, columns=['Amount', 'Frequency', 'Recency'])
dfm_scaled.head()

### BUILDING THE MODEL

In [None]:
# k-means with some arbitrary k
# random_state is fixed so that the centroid initialisation, and therefore
# the resulting clustering, is the same on every run.
kmeans = KMeans(n_clusters=4, max_iter=50, random_state=42)
kmeans.fit(dfm_scaled)

In [None]:
# A fundamental step for any unsupervised algorithm is to determine the
# optimal number of clusters into which the data may be clustered. The elbow
# method is one of the most popular ways to determine this optimal value of k.

ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    # Fixed seed: otherwise the inertia values, and with them the shape of
    # the elbow curve, change from run to run.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(dfm_scaled)

    # inertia_ is the quantity plotted below as the sum of squared distances.
    ssd.append(kmeans.inertia_)
    print("For n_clusters={0}, the Elbow score is {1}".format(num_clusters, kmeans.inertia_))

fig = px.line(x=range_n_clusters, y=ssd,
              title="Elbow Curve for K-Means Clustering",
              labels={'x': 'Number of Clusters', 'y': 'Sum of Squared Distances (SSD)'})
fig.show()


In [None]:
# Inertia values recorded for each candidate number of clusters.
ssd

In [None]:
# Silhouette analysis: average silhouette score for each candidate k.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # Initialise k-means with a fixed seed so the scores are reproducible
    # and comparable between runs.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(dfm_scaled)

    cluster_labels = kmeans.labels_

    # silhouette score
    silhouette_avg = silhouette_score(dfm_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))
    

In [None]:
# Final model with k=3
kmeans = KMeans(n_clusters=3, max_iter=50)
kmeans.fit(dfm_scaled)

In [None]:
y_predicted = kmeans.fit_predict(dfm_scaled)
y_predicted

In [None]:
# Store each row's k-means cluster label alongside the scaled attributes.
dfm_scaled = dfm_scaled.assign(cluster=y_predicted)
dfm_scaled.head()

In [None]:
# Silhouette score of the final k-means model.
# Score only the three scaled attributes: by this point dfm_scaled also holds
# the 'cluster' column, and passing the labels in as a feature would inflate
# the score.
feature_columns = ['Amount', 'Frequency', 'Recency']
score = silhouette_score(dfm_scaled[feature_columns], y_predicted)
score

### Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
hc = AgglomerativeClustering(n_clusters = 3, affinity = "euclidean", linkage = "single")
cluster = hc.fit_predict(dfm)

In [None]:
dfm["label"] = cluster

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
score_agg = silhouette_score(dfm, cluster)
score_agg