# Clustering
Load the `customers` dataset. Train a clustering model, such as k-means or HDBSCAN, using scikit-learn. Check the documentation to identify the most important hyperparameters, attributes, and methods of the models. Use them in practice. Furthermore, identify the optimal number of clusters.

In [4]:
import pandas as pd
import sklearn.metrics
import sklearn.preprocessing
import sklearn.model_selection
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer


# Loading the Dataset

In [5]:
# Read the customer table and use the `ID` column as the row index.
df = pd.read_csv('customers.csv').set_index('ID')
# Report the (rows, columns) shape, then preview the first three rows.
print(df.shape)
df.head(3)

(2000, 7)


Unnamed: 0_level_0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100000001,0,0,67,2,124670,1,2
100000002,1,1,22,1,150773,1,2
100000003,0,0,49,1,89210,0,0


# Scaling the Features

In [6]:
# Standardise every column of `df` before clustering.
# The scaler is kept in `scarler` so it can be reused after this cell.
scarler = sklearn.preprocessing.StandardScaler()
scarler.fit(df)
# `x` holds the scaled feature matrix used by the clustering cells.
x = scarler.transform(df)

# Training a Model

In [7]:
# Fit a baseline k-means model with two clusters on the scaled features.
# random_state pins the centroid initialisation so labels and centers are
# reproducible after a kernel restart; n_init is passed explicitly because
# its default value depends on the installed scikit-learn release.
model = KMeans(n_clusters=2, n_init=10, random_state=42)
# fit_predict fits the model and returns the cluster label for each row of x.
y_predicted = model.fit_predict(x)
# Display the fitted cluster centers (one row per cluster, in scaled units).
model.cluster_centers_

array([[ 0.52723906,  0.33736541, -0.28493767, -0.00976793, -0.51329401,
        -0.54089367, -0.58328733],
       [-0.67103153, -0.42937416,  0.36264795,  0.01243191,  0.65328328,
         0.68841013,  0.74236569]])

# Identifying the Number of Clusters

### Elbow Method

In [8]:
# Elbow method: fit k-means for every k in 2..50 and record the value
# scikit-learn stores in `inertia_` after each fit.
k_list = list(range(2, 51))
elbow_scores = []

for k in k_list:
    # Fixed random_state / n_init keep the curve identical between runs.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(x)
    # Only the inertia is needed here, so no labels are predicted.
    elbow_scores.append(model.inertia_)

# Plot inertia against k; the bend ("elbow") suggests a reasonable k.
fig = go.Figure(data=go.Scatter(x=k_list, y=elbow_scores))
fig.update_layout(
    title='Elbow method',
    xaxis_title='Number of clusters (k)',
    yaxis_title='Inertia',
)
fig.show()


### Silhouette Scores

In [9]:
# Silhouette analysis: fit k-means for every k in 2..50 and score the
# resulting labelling of x with sklearn.metrics.silhouette_score.
k_list = list(range(2, 51))
silhouette_scores = []

for k in k_list:
    # Fixed random_state / n_init keep the scores identical between runs.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    # fit_predict fits the model and returns the label of each row of x.
    y_predicted = model.fit_predict(x)
    silhouette_scores.append(sklearn.metrics.silhouette_score(x, y_predicted))

# Plot the silhouette score against k; higher scores indicate better-separated clusters.
fig = go.Figure(data=go.Scatter(x=k_list, y=silhouette_scores))
fig.update_layout(
    title='Silhouette score by number of clusters',
    xaxis_title='Number of clusters (k)',
    yaxis_title='Silhouette score',
)
fig.show()