In [None]:
import pandas as pd
import altair as alt
from sklearn.datasets import make_blobs
import random

# This whole cell exists only to build a synthetic DataFrame to experiment with.
# First, sample 100 two-dimensional points scattered around 3 centers.
X, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    random_state=0,
)

def generateFeature(x):
  """Draw a synthetic third feature for one sample.

  `x` is anything indexable by 'feature_a' and 'feature_b' (here: a DataFrame
  row). The value is sampled from a normal distribution whose mean and
  standard deviation depend on where the sample sits:

  * feature_a < 0                    -> mean 0,  std 10
  * feature_a > 0 and feature_b < 3  -> mean 50, std 5
  * anything else                    -> mean 10, std 5
  """
  a = x['feature_a']
  if a < 0:
    mean, spread = 0, 10
  elif x['feature_b'] < 3 and a > 0:
    mean, spread = 50, 5
  else:
    mean, spread = 10, 5
  return random.normalvariate(mean, spread)

# Seed Python's `random` module so the feature_c values drawn inside
# generateFeature are identical on every Restart & Run All. make_blobs is
# already pinned with random_state=0, but that does not affect `random`.
random.seed(0)

data = pd.DataFrame(X, columns=['feature_a', 'feature_b'])
# feature_c is drawn row by row; generateFeature picks the distribution from
# each row's feature_a / feature_b values.
data['feature_c'] = data.apply(generateFeature, axis=1)
data

Unnamed: 0,feature_a,feature_b,feature_c
0,2.631858,0.689365,45.565427
1,0.080804,4.690690,5.274141
2,3.002519,0.742654,56.146984
3,-0.637628,4.091047,9.539799
4,-0.072283,2.883769,14.895220
...,...,...,...
95,-0.338874,3.234825,-4.664418
96,1.289338,3.449692,4.589036
97,1.840706,3.561622,13.467011
98,-0.901673,1.315825,1.004761


In [None]:
# Scatter feature_a against feature_b for a first look at how the samples
# are distributed:
alt.Chart(data).mark_circle().encode(
    alt.X('feature_a'),
    alt.Y('feature_b'),
)

In [None]:
# Let's see what the optimal number of clusters would be using the elbow method
from sklearn.cluster import KMeans


# Fit k-means for k = 2 through 11 (range's upper bound is exclusive) and
# record each fit's SSE (inertia_) so we can plot the curve and find the "elbow".
# random_state pins the centroid initialisation so the curve is reproducible;
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans_results = []
for k in range(2, 12):
  kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
  # Only inertia_ is needed here, so fit() is enough; the labels are not used.
  kmeans.fit(data[['feature_a', 'feature_b']])
  kmeans_results.append([k, kmeans.inertia_])

results_df = pd.DataFrame(kmeans_results, columns=['k', 'sse'])

alt.Chart(results_df).mark_line(point=True).encode(
    x='k',
    y='sse'
)



In [None]:
# The elbow could be at either 3 or 4, so we'll try both. Do the clustering,
# and store the cluster id that each sample belongs to.
# random_state makes the fit (and therefore the integer ids stored below and
# the colours in the later charts) repeatable across runs; n_init is set
# explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
data['cluster_3_label'] = kmeans.fit_predict(data[['feature_a', 'feature_b']])

# Only feature_a / feature_b are passed in, so the label column added above
# does not influence the 4-cluster fit.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
data['cluster_4_label'] = kmeans.fit_predict(data[['feature_a', 'feature_b']])

data

Unnamed: 0,feature_a,feature_b,feature_c,cluster_3_label,cluster_4_label
0,2.631858,0.689365,45.565427,2,0
1,0.080804,4.690690,5.274141,0,1
2,3.002519,0.742654,56.146984,2,0
3,-0.637628,4.091047,9.539799,1,1
4,-0.072283,2.883769,14.895220,1,1
...,...,...,...,...,...
95,-0.338874,3.234825,-4.664418,1,1
96,1.289338,3.449692,4.589036,0,2
97,1.840706,3.561622,13.467011,0,2
98,-0.901673,1.315825,1.004761,1,3


In [None]:
# Colour each sample by its 3-cluster assignment. The ":N" suffix tells Altair
# to treat the label as a nominal (categorical) field rather than a number.
chart3 = alt.Chart(data).mark_circle().encode(
    alt.X('feature_a'),
    alt.Y('feature_b'),
    alt.Color('cluster_3_label:N'),
)

chart3

In [None]:
# Same scatter, this time coloured by the 4-cluster assignment (":N" = treat
# the label as a nominal/categorical field).
chart4 = alt.Chart(data).mark_circle().encode(
    alt.X('feature_a'),
    alt.Y('feature_b'),
    alt.Color('cluster_4_label:N'),
)

chart4

In [None]:
# Show the 3-cluster and 4-cluster charts next to each other for comparison:
alt.hconcat(chart3, chart4)

In [None]:
# 3 clusters looks like the better fit. Now colour the same scatter by
# feature_c and place it beside the 3-cluster chart to look for a relationship:
chart_c = alt.Chart(data).mark_circle().encode(
    alt.X('feature_a'),
    alt.Y('feature_b'),
    alt.Color('feature_c'),
)

alt.hconcat(chart_c, chart3)

In [None]:
# feature_c appears to line up with the clusters (note that it was not an input
# to k-means, which was only given feature_a and feature_b), so let's combine
# both views: colour shows the cluster, point size shows feature_c.

alt.Chart(data).mark_circle().encode(
    alt.X('feature_a'),
    alt.Y('feature_b'),
    alt.Color('cluster_3_label:N'),
    alt.Size('feature_c'),
)