# K-means clustering

Examples taken from https://realpython.com/k-means-clustering-python/

## Prepare Data

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Build a small synthetic dataset to cluster.
# `features` receives the sample coordinates and `true_labels` the index of
# the blob each sample was generated from.
features, true_labels = make_blobs(
    centers=3,         # number of blobs to generate
    n_samples=20,      # total number of points
    cluster_std=2.75,  # spread (standard deviation) of each blob
    random_state=42,   # fixed seed so the data is reproducible
)

In [None]:
# Bare expression: in a notebook this displays the generated sample coordinates.
features

In [None]:
# Bare expression: displays the label that make_blobs returned for each sample.
true_labels

In [None]:
# Transpose so that each row holds one coordinate for all samples, then plot
# the first coordinate against the second with no cluster colouring.
ft = features.transpose()
plt.scatter(x=ft[0], y=ft[1])

In [None]:
# Same scatter plot as before, but each point is coloured by the label
# that make_blobs returned for it.
ft = features.transpose()
plt.scatter(x=ft[0], y=ft[1], c=true_labels)

In [None]:
# Standardize the features before clustering: learn the scaling statistics
# from `features`, then apply them to the same data.
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

In [None]:
# Peek at the first five rows of the standardized data.
scaled_features[:5]

## Create and fit model

In [None]:
# Configure the estimator; nothing is computed until `fit` is called.
kmeans = KMeans(
    init="random",    # initialization method: start from randomly chosen observations
    n_clusters=3,     # number of clusters (and centroids) to form
    n_init=10,        # number of runs with different centroid seeds; the best run is kept
    max_iter=300,     # upper bound on iterations within a single run
    random_state=42,  # fixed seed so the result is reproducible
)

In [None]:
# Run k-means on the standardized data; the fitted results are then read
# from attributes of `kmeans` (inertia_, cluster_centers_, n_iter_, labels_).
kmeans.fit(scaled_features)

In [None]:
# Sum of squared distances of samples to their closest cluster center (SSE).
# The value reported is the lowest one found across the `n_init` runs.
kmeans.inertia_

In [None]:
# Final locations of the centroids. The model was fitted on `scaled_features`,
# so these coordinates are in the standardized feature space.
kmeans.cluster_centers_

In [None]:
# The number of iterations that were run (bounded above by `max_iter`)
kmeans.n_iter_

In [None]:
# Cluster index assigned to each sample by the fitted model.
kmeans.labels_

In [None]:
# Plot the standardized samples, coloured by the cluster k-means assigned them to.
sc_t = scaled_features.transpose()
plt.scatter(x=sc_t[0], y=sc_t[1], c=kmeans.labels_)

## Choosing the number of centers


### Elbow method

In [None]:
# Settings shared by every KMeans run in the model-selection loops.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

# sse[i] is the inertia (sum of squared errors) of the model with k = i + 1 clusters.
sse = []
for k in range(1, 11):  # try every cluster count from 1 through 10
    # NOTE: this rebinds `kmeans`, so afterwards it refers to the last model fitted here.
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    sse.append(kmeans.inertia_)

In [None]:
 plt.style.use("fivethirtyeight")
 plt.plot(range(1, 11), sse)
 plt.xticks(range(1, 11))
 plt.xlabel("Number of Clusters")
 plt.ylabel("SSE")
 plt.show()

### Silhouette coefficient

In [None]:
# Silhouette coefficient of each candidate model, in the order k is visited.
silhouette_coefficients = []

# Start at k=2, as the original note says to for the silhouette coefficient.
for k in range(2, 11):
    # Fit a fresh model for this k (rebinds `kmeans`), then score its labelling.
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    score = silhouette_score(scaled_features, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
# Plot the silhouette coefficient against the number of clusters.
# The x values must match the range of k used when the scores were computed (2..10).
plt.style.use("fivethirtyeight")
plt.plot(range(2, 11), silhouette_coefficients)
plt.xticks(ticks=range(2, 11))
plt.xlabel(xlabel="Number of Clusters")
plt.ylabel(ylabel="Silhouette Coefficient")
plt.show()