In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt # plotting
from sklearn.cluster import KMeans # loading k means algorithm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the customer table from the CSV in the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="Mall_Customers.csv")
df.head(n=5)

In [None]:
# Descriptive statistics of the frame, using DataFrame.describe() defaults.
summary_stats = df.describe()
summary_stats

# Density distribution plots

In [None]:
# Histogram with a KDE overlay of annual income.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement for its default histogram + density-curve output.
sns.histplot(data=df, x='Annual Income (k$)', kde=True, stat='density')
# NOTE(review): the original note described the shape as a normal
# distribution - confirm against the rendered plot before relying on it.
df.columns  # list the available column names

In [None]:
# Draw a filled kernel-density plot, split by gender, for each feature.
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for column in columns:
    # displot is a figure-level function that creates its own figure, so no
    # plt.figure() call is made here (it would only leave an empty figure
    # behind for every iteration).
    sns.displot(data=df, x=column, hue='Gender', kind='kde', fill=True,
                height=5, aspect=1.6, cut=0, bw_adjust=1)

In [None]:
# One box plot per feature, with gender on the x-axis, each on its own figure.
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for feature in columns:
    fig, ax = plt.subplots()
    sns.boxplot(x='Gender', y=feature, data=df, ax=ax)

In [None]:
# Relative frequency of each gender; normalize=True returns proportions
# (fractions of the total) instead of raw counts.
df.loc[:, 'Gender'].value_counts(normalize=True)

# Bivariate Analysis

In [None]:
# Scatter of spending score against annual income for every customer.
income_col = 'Annual Income (k$)'
spending_col = 'Spending Score (1-100)'
sns.scatterplot(x=income_col, y=spending_col, data=df)

In [None]:
# Work on a copy without the CustomerID column, then draw every pairwise
# scatter plot (and per-variable distribution) coloured by gender.
df1 = df.drop(columns=['CustomerID'])
sns.pairplot(df1, hue='Gender')

In [None]:
# Mean of each feature per gender.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of column names was deprecated and raises in pandas 2.x.
df1.groupby('Gender')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()

In [None]:
# Pairwise correlation between the numeric columns.
# df1 still contains the 'Gender' column, which corr() cannot convert on
# pandas 2.x, so the frame is restricted to numeric dtypes first.
df1.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix. Only numeric columns are
# correlated, because df1 also holds the non-numeric 'Gender' column and
# corr() raises on it in pandas 2.x.
sns.heatmap(df1.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

# Clustering (univariate clustering, bivariate clustering)

In [None]:
# K-Means model for the univariate (income-only) clustering, starting at k=6.
# random_state pins the centroid initialisation so labels and inertia are
# reproducible between runs; n_init is given explicitly because its default
# differs between scikit-learn versions.
clustering1 = KMeans(n_clusters=6, random_state=42, n_init=10)
df1.columns  # show the columns of the data frame

In [None]:
# Fit the clustering model on annual income only; the double brackets select
# a one-column DataFrame rather than a Series.
income_only = df1[['Annual Income (k$)']]
clustering1.fit(income_only)

In [None]:
# Cluster labels stored on the fitted model.
# NOTE(review): the original comment mentioned the default of 8 clusters, but
# this model was created with n_clusters=6.
income_labels = clustering1.labels_
income_labels

In [None]:
# Attach the fitted labels as an 'Income cluster' column and preview the frame.
df1 = df1.assign(**{'Income cluster': clustering1.labels_})
df1.head(n=5)

In [None]:
# Number of customers that fell into each income cluster.
cluster_sizes = df1['Income cluster'].value_counts()
cluster_sizes

In [None]:
# Inertia of the fitted model: the sum of squared distances from each sample
# to its closest centroid - lower means tighter clusters.
# NOTE(review): the original note recorded 5050.904761904766 for one run; the
# exact value depends on the KMeans initialisation, so it may not reproduce
# unless random_state is fixed.
income_inertia = clustering1.inertia_
income_inertia

# Elbow Method for optimal value of k in KMeans

In [None]:
# Elbow method, income only: fit K-Means for k = 1..10 and record the inertia
# of each fit.
income_features = df1[['Annual Income (k$)']]  # loop-invariant, selected once
inertia_scores = []
for k in range(1, 11):
    # A fixed random_state keeps the curve reproducible between runs; n_init
    # is explicit because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(income_features)
    inertia_scores.append(kmeans.inertia_)

In [None]:
# Inertia against k for the income-only model; the elbow was read off at k=3.
k_values = range(1, 11)
plt.plot(k_values, inertia_scores)
plt.xlabel('Values of K')
plt.ylabel('Inertia')

In [None]:
# Refit the income-only model with k=3.
# random_state makes the label assignment reproducible; n_init is explicit
# because its default differs between scikit-learn versions.
clustering1 = KMeans(n_clusters=3, random_state=42, n_init=10)
clustering1.fit(df1[['Annual Income (k$)']])
# Store (or overwrite) the cluster label of every row, then show cluster sizes.
df1['Income cluster'] = clustering1.labels_
df1['Income cluster'].value_counts()

In [None]:
# Mean of each feature per income cluster.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# column names was deprecated and raises in pandas 2.x.
df1.groupby('Income cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()

In [None]:
# Bivariate clustering on annual income and spending score, first with k=3.
# random_state keeps the labels reproducible; n_init is explicit because its
# default differs between scikit-learn versions.
clustering2 = KMeans(n_clusters=3, random_state=42, n_init=10)
clustering2.fit(df1[['Annual Income (k$)', 'Spending Score (1-100)']])
# Add the cluster label of every row as a new column and preview the frame.
df1['Spending and Income cluster'] = clustering2.labels_
df1.head()

In [None]:
# Elbow method for the bivariate model: inertia for k = 1..10, then plot it.
bivariate_features = df1[['Annual Income (k$)', 'Spending Score (1-100)']]
inertia_scores2 = []
for k in range(1, 11):
    # A fixed random_state keeps the curve reproducible between runs; n_init
    # is explicit because its default differs between scikit-learn versions.
    kmeans2 = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans2.fit(bivariate_features)
    inertia_scores2.append(kmeans2.inertia_)
plt.plot(range(1, 11), inertia_scores2)
plt.xlabel('Values of K')
plt.ylabel('Inertia')

In [None]:
# The elbow appears at k=5, so retrain the bivariate model with 5 clusters.
# random_state keeps cluster ids (and therefore plot colours) stable between
# runs; n_init is explicit because its default differs between scikit-learn
# versions.
clustering2 = KMeans(n_clusters=5, random_state=42, n_init=10)
clustering2.fit(df1[['Annual Income (k$)', 'Spending Score (1-100)']])
df1['Spending and Income cluster'] = clustering2.labels_
# Scatter of the two features, coloured by the assigned cluster.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df1, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Spending and Income cluster', palette='tab10')

In [None]:
# Put the fitted centroid coordinates into a two-column frame ('x', 'y') so
# they can be plotted by name.
centers = pd.DataFrame(clustering2.cluster_centers_, columns=['x', 'y'])
centers

In [None]:
# Cluster scatter plot with the centroids marked as black stars, saved to disk.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df1, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Spending and Income cluster', palette='tab10')
# Centroids are drawn last and with a higher zorder so the data points do not
# paint over them.
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*', zorder=3)
# NOTE(review): the file name is misspelt ('bivaraiate'); kept unchanged so
# anything that already references this path keeps working.
plt.savefig('clustering_bivaraiate.png')

In [None]:
# Gender mix inside each cluster; normalize='index' scales every row so its
# values sum to 1.
# NOTE(review): the original note said cluster 1 spends the most - this table
# does not show spending, so verify that against the per-cluster means.
pd.crosstab(index=df1['Spending and Income cluster'],
            columns=df1['Gender'],
            normalize='index')

In [None]:
# Mean of each feature per spending-and-income cluster.
# Columns are selected with a list: indexing a groupby with a bare tuple of
# column names was deprecated and raises in pandas 2.x.
df1.groupby('Spending and Income cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()