# libraries, helper functions and constants

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import (
    contingency_matrix,
    homogeneity_score,
    v_measure_score,
    completeness_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)
from plotmat import plot_mat

def get_cluster_metrics(y_true, y_pred, metrics=None):
    """Score one clustering against reference labels with several metrics.

    Parameters
    ----------
    y_true : array-like
        Reference labels; passed as the first positional argument to
        every metric.
    y_pred : array-like
        Predicted cluster labels; passed as the second positional argument.
    metrics : sequence of callables, optional
        Each entry is called as ``metric(y_true, y_pred)`` and reported under
        its ``__name__``. When omitted (or empty) the homogeneity,
        completeness, V-measure, adjusted Rand and adjusted mutual
        information scores are used.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``'metric'`` (the function name)
        and ``'score'`` (its return value), on a default integer index.
    """
    scorers = metrics or [
        homogeneity_score,
        completeness_score,
        v_measure_score,
        adjusted_rand_score,
        adjusted_mutual_info_score
    ]
    # Keyed by function name, so two metrics sharing a name collapse to one row.
    results = {scorer.__name__: scorer(y_true, y_pred) for scorer in scorers}
    return pd.DataFrame({
        'metric': list(results.keys()),
        'score': list(results.values()),
    })

# Seed passed as `random_state` to the TruncatedSVD calls in this notebook.
RANDOM_SEED = 42

# load in data and transform to TF-IDF vectors

In [2]:
# Load the BBC News training CSV and attach an integer code for each category.
df = pd.read_csv("./BBC_News_Train.csv")
df['category_id'], _ = pd.factorize(df['Category'])
# One row per distinct (Category, category_id) pair; used later to derive
# the number of clusters.
category_id_df = df.loc[:, ['Category', 'category_id']].drop_duplicates()

In [3]:
# Vectorise the `Text` column with TF-IDF (English stop words removed, terms
# must appear in at least 5 documents) and keep the integer category codes as
# the reference labels for scoring.
tfidf = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_data = tfidf.fit_transform(df['Text'])
labels = df['category_id']

# dimension reduction (Truncated SVD, NMF and UMAP with K-means)

In [4]:
# Shared K-means estimator, re-fitted by each experiment below. The number of
# clusters is the count of non-null categories in `category_id_df`.
km = KMeans(
    n_clusters=category_id_df['Category'].count(),
    n_init=50,
    max_iter=5000,
    random_state=0,
)

## Truncated SVD

In [5]:
# Sweep the TruncatedSVD rank and score K-means on each reduced matrix.
# `n_components` is also reused by the later sweep cells.
scores = []
metrics = None
n_components = [1, 2, 3, 5, 10, 20, 50, 100, 300, 500, 800]
for r in n_components:
    input_svd = TruncatedSVD(
        n_components=r, random_state=RANDOM_SEED).fit_transform(tfidf_data)
    preds = km.fit_predict(input_svd)
    # Score once per rank and take both the values and the metric names from
    # the same frame, instead of re-scoring the last predictions after the loop.
    result = get_cluster_metrics(labels, preds)
    scores.append(result['score'].tolist())
    metrics = result['metric'].tolist()
# Rows are indexed by the SVD rank, columns by metric name.
svd_metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
svd_metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.094331,0.104362,0.099094,0.049817,0.095885
2,0.303752,0.34661,0.323769,0.193236,0.321323
3,0.448852,0.502422,0.474129,0.355048,0.472247
5,0.636503,0.68509,0.659903,0.572929,0.658711
10,0.633928,0.707318,0.668616,0.543747,0.667433
20,0.616071,0.693878,0.652664,0.514318,0.651419
50,0.635522,0.718284,0.674373,0.536805,0.673204
100,0.620452,0.698453,0.657146,0.517965,0.655918
300,0.715419,0.75744,0.73583,0.662944,0.734912
500,0.782276,0.799019,0.790559,0.780867,0.789844


## NMF

In [6]:
# NOTE(review): the NMF sweep below is entirely commented out, so
# `nmf_metrics_df` is never created when the notebook runs.
# scores = []
# for r in n_components:
#     inputs_nmf = NMF(n_components=r, init='random',
#                      random_state=RANDOM_SEED).fit_transform(tfidf_data)
#     preds = km.fit_predict(inputs_nmf)
#     scores.append(get_cluster_metrics(labels, preds)['score'].tolist())
# metrics = get_cluster_metrics(labels, preds)['metric'].tolist()
# nmf_metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
# nmf_metrics_df

## UMAP

### euclidean

In [7]:
# Sweep the UMAP output dimensionality (euclidean input metric) and score
# K-means on each embedding. Relies on `n_components` from the SVD sweep cell.
scores = []
metrics = None
for n in n_components:
    # Seed UMAP so the embedding, and therefore the scores, are reproducible
    # across runs. Per the umap-learn docs, fixing `random_state` trades
    # parallelism for reproducibility.
    umapfit = UMAP(n_components=n, metric='euclidean', random_state=RANDOM_SEED)
    inputs_umap = umapfit.fit_transform(tfidf_data)
    preds = km.fit_predict(inputs_umap)
    # Score once per setting; names and values come from the same frame.
    result = get_cluster_metrics(labels, preds)
    scores.append(result['score'].tolist())
    metrics = result['metric'].tolist()
# Rows are indexed by the UMAP output dimensionality, columns by metric name.
metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.503905,0.532986,0.518038,0.449377,0.516362
2,0.754845,0.754043,0.754443,0.792041,0.753615
3,0.761434,0.762732,0.762083,0.7966,0.761279
5,0.763608,0.764623,0.764115,0.798185,0.763318
10,0.759082,0.76052,0.759801,0.792509,0.758989
20,0.764593,0.765417,0.765005,0.799412,0.764211
50,0.758525,0.759729,0.759127,0.792946,0.758313
100,0.756974,0.758256,0.757614,0.790324,0.756795
300,0.758194,0.759761,0.758977,0.792098,0.758162
500,0.763876,0.764883,0.764379,0.797515,0.763583


### cosine

In [8]:
# Sweep the UMAP output dimensionality (cosine input metric) and score
# K-means on each embedding. Relies on `n_components` from the SVD sweep cell.
scores = []
metrics = None
for n in n_components:
    # Seed UMAP so the embedding, and therefore the scores, are reproducible
    # across runs. Per the umap-learn docs, fixing `random_state` trades
    # parallelism for reproducibility.
    umapfit = UMAP(n_components=n, metric='cosine', random_state=RANDOM_SEED)
    inputs_umap = umapfit.fit_transform(tfidf_data)
    preds = km.fit_predict(inputs_umap)
    # Score once per setting; names and values come from the same frame.
    result = get_cluster_metrics(labels, preds)
    scores.append(result['score'].tolist())
    metrics = result['metric'].tolist()
# Rows are indexed by the UMAP output dimensionality, columns by metric name.
metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.722813,0.722302,0.722557,0.759215,0.721621
2,0.757159,0.758311,0.757735,0.790911,0.756916
3,0.7648,0.765501,0.765151,0.800332,0.764357
5,0.758756,0.759741,0.759248,0.794197,0.758435
10,0.774445,0.775069,0.774757,0.810086,0.773996
20,0.75888,0.760047,0.759463,0.793803,0.75865
50,0.76078,0.761916,0.761348,0.794339,0.760542
100,0.75897,0.760256,0.759612,0.793557,0.7588
300,0.763328,0.764328,0.763828,0.798372,0.76303
500,0.759884,0.760713,0.760298,0.795409,0.759489


# clustering (K-means, Agglomerative, DBSCAN, HDBSCAN)

In [9]:
# 500-component SVD projection of the TF-IDF matrix, used as input by the
# K-means and agglomerative cells below.
svd_train = TruncatedSVD(
    random_state=RANDOM_SEED,
    n_components=500,
).fit_transform(tfidf_data)


## K-means

In [10]:
# Re-fit the shared K-means estimator on the SVD features; the fitted
# estimator's `labels_` are the cluster assignments for the training rows.
preds = km.fit(svd_train).labels_

# Displayed as the cell output.
get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.782276
1,completeness_score,0.799019
2,v_measure_score,0.790559
3,adjusted_rand_score,0.780867
4,adjusted_mutual_info_score,0.789844


## Agglomerative

In [12]:
# Ward-linkage agglomerative clustering on the SVD features, with as many
# clusters as there are non-null categories.
preds = AgglomerativeClustering(
    linkage='ward',
    n_clusters=category_id_df['Category'].count(),
).fit_predict(svd_train)

# Displayed as the cell output.
get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.567887
1,completeness_score,0.609867
2,v_measure_score,0.588129
3,adjusted_rand_score,0.512709
4,adjusted_mutual_info_score,0.586687


## DBSCAN

In [None]:
# Grid-search DBSCAN over `eps` and `min_samples`, scoring every combination.
# NOTE(review): `inputs_umap` is not created in this cell; it is whatever
# embedding the most recently executed UMAP sweep cell left behind. Confirm
# that this is the intended input.
eps_list = [x * 0.05 for x in range(1, 21)]
min_samples_list = list(range(5,500,10))

scores = []
titles = None
for ep in eps_list:
    for min_samp in min_samples_list:
        preds = DBSCAN(eps=ep, min_samples=min_samp, n_jobs=-1).fit_predict(inputs_umap)
        # Score once per setting and append the two hyper-parameters to the row.
        result = get_cluster_metrics(labels, preds)
        scores.append(result['score'].tolist() + [ep, min_samp])
        if titles is None:
            # Column names are the same for every row, so build them once
            # instead of re-scoring the last predictions after the loops.
            titles = result['metric'].tolist() + ["Epsilon", "min_samples"]

# One row per (eps, min_samples) pair; displayed as the cell output.
pd.DataFrame(scores, columns=titles)

## HDBSCAN

In [15]:
# Grid-search HDBSCAN over `cluster_selection_epsilon` and `min_samples`
# (with `min_cluster_size` fixed at 100), scoring every combination.
# NOTE(review): `inputs_umap` is not created in this cell; it is whatever
# embedding the most recently executed UMAP sweep cell left behind. Confirm
# that this is the intended input.
clust_eps_list = [x * 0.05 for x in range(1, 21)]
min_samples_list = list(range(5,500,10))

scores = []
titles = None
for ep in clust_eps_list:
    for min_samp in min_samples_list:
        # The class is imported directly (`from hdbscan import HDBSCAN`), so
        # the module name `hdbscan` is not bound; calling `hdbscan.HDBSCAN`
        # raised NameError.
        preds = HDBSCAN(
            min_cluster_size=100,
            min_samples=min_samp,
            cluster_selection_epsilon=ep,
            core_dist_n_jobs=-1,
        ).fit_predict(inputs_umap)
        # Score once per setting and append the two hyper-parameters to the row.
        result = get_cluster_metrics(labels, preds)
        scores.append(result['score'].tolist() + [ep, min_samp])
        if titles is None:
            # Column names are the same for every row, so build them once
            # instead of re-scoring the last predictions after the loops.
            titles = result['metric'].tolist() + ["Cluster_sel_Epsilon", "min_samples"]

# One row per (cluster_selection_epsilon, min_samples) pair; displayed as the
# cell output.
pd.DataFrame(scores, columns=titles)

Unnamed: 0,metric,score
0,homogeneity_score,0.128242
1,completeness_score,0.216089
2,v_measure_score,0.160959
3,adjusted_rand_score,0.058611
4,adjusted_mutual_info_score,0.159183
