In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

In [None]:
# Seed passed to every stochastic estimator in this notebook (k-means and PCA)
# so that a fresh "Restart & Run All" reproduces the same results.
random_state = 0

# Load the posts CSV and drop every row that has a missing value in any
# column, so the text column vectorised later contains no NaN.
df = pd.read_csv('./Posts_MonsterSg.csv').dropna()

# Preview the cleaned frame. It must be the last expression of the cell to be
# rendered; a bare `df.head()` in the middle of a cell shows nothing.
df.head()

In [None]:
# Turn the `tech_stack` column into a TF-IDF matrix, ignoring English stop
# words.  TODO: change to skills.
tech_stack_texts = df.tech_stack.values
vec = TfidfVectorizer(stop_words="english")
# Learn the vocabulary/IDF weights and vectorise the same texts in one call.
features = vec.fit_transform(tech_stack_texts)

## Creating model

In [None]:
# Fit mini-batch k-means with 5 clusters on the TF-IDF matrix. `fit` returns
# the estimator itself, so construction and fitting are chained.
cls = MiniBatchKMeans(n_clusters=5, random_state=random_state).fit(features)

# Echo the fitted estimator as the cell output.
cls

In [None]:
# Cluster labels obtained by running the fitted model on a dataset; this is
# the call to use for new, unseen data (here it is applied to the training
# features only as a demonstration).
predicted_labels = cls.predict(features)

# Cluster labels stored on the estimator for the data it was trained on.
# This attribute is the only option for models that do not support
# prediction on a new dataset.
training_labels = cls.labels_

# Show both results; only the last expression of a cell is rendered, so a
# bare `cls.predict(features)` earlier in the cell would be silently dropped.
predicted_labels, training_labels

In [None]:
# Attach each row's training-time cluster label as a `Cluster` column.
df = df.assign(Cluster=cls.labels_)

In [None]:
# Preview the first rows of the frame rather than rendering all of it; the
# `Cluster` column assigned in the previous cell appears as the last column.
df.head(10)

## Evaluation
- The silhouette score ranges from -1 to 1: the best value is 1 and the worst value is -1.
- Values near 0 indicate overlapping clusters.

In [None]:
# Mean silhouette coefficient of the TF-IDF features under the cluster
# assignment predicted by the fitted model (`silhouette_score` is imported in
# the import cell at the top of the notebook).
silhouette_score(features, labels=cls.predict(features))

## Visualisation

In [None]:
# Project the TF-IDF features onto their first two principal components.
# The sparse matrix is densified first because it is passed to PCA as an array.
pca = PCA(n_components=2, random_state=random_state)
dense_features = features.toarray()
reduced_features = pca.fit_transform(dense_features)
del dense_features  # the dense copy is only needed for the fit above

# Map the k-means cluster centres into the same 2-D space using the
# already-fitted projection.
reduced_cluster_centers = pca.transform(cls.cluster_centers_)

In [None]:
# Scatter the documents in the 2-D PCA space, coloured by predicted cluster,
# and overlay the projected cluster centres as blue crosses.
fig, ax = plt.subplots()
ax.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cls.predict(features))
ax.scatter(
    reduced_cluster_centers[:, 0],
    reduced_cluster_centers[:, 1],
    marker='x',
    s=150,
    c='b',
    label='Cluster centres',
)

# A figure should stand on its own when the notebook is skimmed.
ax.set(
    title='Clusters in PCA space',
    xlabel='Principal component 1',
    ylabel='Principal component 2',
)
ax.legend()

# Render the figure without echoing the repr of the last artist.
plt.show()