In [103]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline

In [104]:
# Remote CSV that the notebook reads its data from.
url = 'https://raw.githubusercontent.com/imamanmehrotra/Datasets/main/income_kmeans.csv'

In [105]:
# Read the CSV into a DataFrame, print its (rows, columns) shape, then display it.
df = pd.read_csv(filepath_or_buffer=url)
print(df.shape)
df

(22, 3)


Unnamed: 0,Name,Age,Income($)
0,Rob,27,70000
1,Michael,29,90000
2,Mohan,29,61000
3,Ismail,28,60000
4,Kory,42,150000
5,Gautam,39,155000
6,David,41,160000
7,Andrea,38,162000
8,Brad,36,156000
9,Angelina,35,130000


In [106]:
# Plot Income($) against Age for the raw data, before any clustering.
px.scatter(data_frame=df, x='Age', y='Income($)')

In [107]:
# K-Means starts from randomly chosen centroids, so pin the seed. Without it the
# cluster labels (and therefore the colours in the plots that follow) can change
# on every re-run, and the written observations may no longer match the figures.
km = KMeans(n_clusters=3, random_state=42)
km

KMeans(n_clusters=3)

In [108]:
# Fit on the two raw (unscaled) feature columns and get one cluster label per row.
y_pred = km.fit_predict(df.loc[:, ['Age', 'Income($)']])
y_pred

array([2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1],
      dtype=int32)

In [109]:
# Store the predicted labels next to the rows they belong to.
df['Cluster_Number'] = y_pred

In [110]:
# Display the frame, which now carries the Cluster_Number column.
df

Unnamed: 0,Name,Age,Income($),Cluster_Number
0,Rob,27,70000,2
1,Michael,29,90000,2
2,Mohan,29,61000,1
3,Ismail,28,60000,1
4,Kory,42,150000,0
5,Gautam,39,155000,0
6,David,41,160000,0
7,Andrea,38,162000,0
8,Brad,36,156000,0
9,Angelina,35,130000,0


In [111]:
# Same scatter as before, with each point coloured by its assigned cluster.
px.scatter(
    data_frame=df,
    x='Age',
    y='Income($)',
    color='Cluster_Number',
    title='K-Means Clustering',
)

## The yellow and blue clusters above are not well separated. This happens because the features have not been scaled: Income($) takes very large values while Age spans a very narrow range, so the distance calculation is dominated by income. Let's proceed with feature scaling at this point.

In [112]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [113]:
# Min-max scale each feature into its own new column.
# The same scaler object is re-fitted for each column, so after this cell `mm`
# holds the fit for Age only.
mm = MinMaxScaler()

df['Income_Scaled'] = mm.fit_transform(df[['Income($)']])
df['Age_Scaled'] = mm.fit_transform(df[['Age']])

df

Unnamed: 0,Name,Age,Income($),Cluster_Number,Income_Scaled,Age_Scaled
0,Rob,27,70000,2,0.213675,0.058824
1,Michael,29,90000,2,0.384615,0.176471
2,Mohan,29,61000,1,0.136752,0.176471
3,Ismail,28,60000,1,0.128205,0.117647
4,Kory,42,150000,0,0.897436,0.941176
5,Gautam,39,155000,0,0.940171,0.764706
6,David,41,160000,0,0.982906,0.882353
7,Andrea,38,162000,0,1.0,0.705882
8,Brad,36,156000,0,0.948718,0.588235
9,Angelina,35,130000,0,0.726496,0.529412


In [114]:
# Cluster again, this time on the scaled features, using a fresh estimator.
# The fixed random_state keeps the label numbering (and so the plot colours and
# the order of the centroids) stable across re-runs.
km = KMeans(n_clusters=3, random_state=42)
y_pred = km.fit_predict(df[['Age_Scaled', 'Income_Scaled']])
df['New_Cluster'] = y_pred
df

Unnamed: 0,Name,Age,Income($),Cluster_Number,Income_Scaled,Age_Scaled,New_Cluster
0,Rob,27,70000,2,0.213675,0.058824,1
1,Michael,29,90000,2,0.384615,0.176471,1
2,Mohan,29,61000,1,0.136752,0.176471,1
3,Ismail,28,60000,1,0.128205,0.117647,1
4,Kory,42,150000,0,0.897436,0.941176,0
5,Gautam,39,155000,0,0.940171,0.764706,0
6,David,41,160000,0,0.982906,0.882353,0
7,Andrea,38,162000,0,1.0,0.705882,0
8,Brad,36,156000,0,0.948718,0.588235,0
9,Angelina,35,130000,0,0.726496,0.529412,0


In [116]:
# Plot the original-unit axes, coloured by the labels from the scaled-feature fit.
px.scatter(
    data_frame=df,
    x='Age',
    y='Income($)',
    color='New_Cluster',
    title='K-Means Clustering',
)

## Computing the Centroids of our Clusters

In [117]:
# Pull the fitted centroids and split them by column.
# Column 0 / column 1 follow the order of the features passed to the fit.
cc = km.cluster_centers_
cc_x, cc_y = cc[:, 0], cc[:, 1]

In [118]:
# Show both centroid coordinate arrays together.
(cc_x, cc_y)

(array([0.72268908, 0.1372549 , 0.85294118]),
 array([0.8974359 , 0.11633428, 0.2022792 ]))

In [149]:
# Build the cluster-coloured scatter as a named figure object, then display it.
fig = px.scatter(
    data_frame=df,
    x='Age',
    y='Income($)',
    color='New_Cluster',
    title='K-Means Clustering',
)
fig

## K-Means Clustering with Centroid Representation on Scaled Data

In [158]:
import plotly.graph_objects as go

# Hover text must be a 1-D sequence with one string per point, so build it
# explicitly from the unscaled values instead of passing a 2-D DataFrame.
point_labels = (
    'Age: ' + df['Age'].astype(str)
    + ', Income($): ' + df['Income($)'].astype(str)
)

fig = go.Figure()

# Data points in scaled feature space, coloured by their cluster label.
fig.add_trace(
    go.Scatter(
        x=df['Age_Scaled'],
        y=df['Income_Scaled'],
        mode='markers',
        marker_color=df['New_Cluster'],
        text=point_labels,
        name='Data_Point',
        marker_size=10,
    ))

# Cluster centres, drawn larger and in one colour so they stand out.
# The label is repeated once per centroid; a single-element list would label
# only the first marker.
fig.add_trace(
    go.Scatter(
        x=cc_x,
        y=cc_y,
        mode='markers',
        marker_color='green',
        text=['Centroids'] * len(cc_x),
        name='Centroid',
        marker_size=15,
    ))

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='K-Means Clustering with Centroids (Scaled Data)',
    xaxis_title='Age_Scaled',
    yaxis_title='Income_Scaled',
)

fig.show()

# ELBOW METHOD

### Now let's use the Elbow Method to find the optimal value of K, i.e. the number of clusters to use in K-Means Clustering.

In [163]:
# Elbow method: fit one model for each candidate K and record its inertia
# (stored here as the within-cluster sum of squared errors).
# The fixed random_state makes the curve reproducible; with random
# initialisation the recorded values could otherwise differ between runs.
# Note: `km` is rebound on every iteration, so after this cell it refers to the
# model fitted with the last K in k_range.
k_range = range(1, 10)
wcsse = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(df[['Age_Scaled', 'Income_Scaled']])
    wcsse.append(km.inertia_)

In [168]:
# Plot the recorded error against the number of clusters to look for the elbow.
px.line(
    x=k_range,
    y=wcsse,
    labels={"x": "K-Value", "y": "Error"},
)

### From the above graph we can observe that the Sum of Squared Errors drops sharply up to K=3 and decreases only slowly after that. This bend (the "elbow") suggests that K=3 is the optimal value of K for our dataset.