# libraries, helper functions and constants

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import (
    contingency_matrix,
    homogeneity_score,
    v_measure_score,
    completeness_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)
from plotmat import plot_mat

def _metric_label(metric):
    """Return a display name for a metric callable."""
    label = getattr(metric, '__name__', None)
    if label is None:
        # functools.partial objects have no __name__ but expose the wrapped
        # callable as `.func`; fall back to the class name for anything else.
        inner = getattr(metric, 'func', None)
        label = getattr(inner, '__name__', None) or type(metric).__name__
    return label


def get_cluster_metrics(y_true, y_pred, metrics=None):
    """Score a clustering against reference labels with several metrics.

    Parameters
    ----------
    y_true : array-like
        Reference labels; passed as the first argument to every metric.
    y_pred : array-like
        Predicted cluster labels; passed as the second argument.
    metrics : sequence of callables, optional
        Each callable is invoked as ``metric(y_true, y_pred)``. When omitted
        (or empty) the homogeneity, completeness, V-measure, adjusted Rand and
        adjusted mutual information scores are used.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``'metric'`` (the callable's name)
        and ``'score'`` (its return value), indexed 0..n-1 in the order the
        metrics were given.
    """
    if not metrics:
        metrics = [
            homogeneity_score,
            completeness_score,
            v_measure_score,
            adjusted_rand_score,
            adjusted_mutual_info_score
        ]
    # Keyed by name, so two metrics sharing a name collapse into one row
    # holding the later value.
    results = {}
    for metric in metrics:
        results[_metric_label(metric)] = metric(y_true, y_pred)
    return pd.DataFrame({
        'metric': list(results),
        'score': list(results.values()),
    })

# Shared seed passed as `random_state` to the TruncatedSVD and NMF
# decompositions below.
RANDOM_SEED = 42

# load in data and transform to TF-IDF vectors

In [2]:
# Read the BBC news training set; the cells below use its `Text` and
# `Category` columns.
df = pd.read_csv("./BBC_News_Train.csv")
# Encode every category as an integer id. Only the codes are needed, so they
# are taken by index: unpacking the second return value into `_` would shadow
# IPython's last-output variable for the rest of the session.
df['category_id'] = df['Category'].factorize()[0]
# Lookup table with one row per distinct (Category, category_id) pair.
category_id_df = df[['Category', 'category_id']].drop_duplicates()

In [3]:
# Vectorise the article text with TF-IDF, dropping English stop words and
# applying a minimum document frequency of 5.
tfidf = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_data = tfidf.fit_transform(df['Text'])
# The integer category ids are the ground truth for the clustering metrics.
labels = df['category_id']

# dimension reduction (Truncated SVD, NMF and UMAP with K-means)

In [4]:
# Single K-Means estimator that every experiment below re-fits. The number of
# clusters is the number of categories in the lookup table.
# NOTE(review): seeded with 0 rather than RANDOM_SEED; left as-is so the
# reported scores do not shift.
km = KMeans(
    n_clusters=category_id_df['Category'].count(),
    n_init=50,
    max_iter=5000,
    random_state=0,
)

## Truncated SVD

In [5]:
# Sweep the number of SVD components: reduce the TF-IDF matrix, cluster the
# reduced data with the shared K-Means estimator and score it against the
# true categories. `n_components` is reused by the NMF and UMAP sweeps.
n_components = [1, 2, 3, 5, 10, 20, 50, 100, 300, 500, 800]
scores = []
for r in n_components:
    input_svd = TruncatedSVD(
        n_components=r, random_state=RANDOM_SEED).fit_transform(tfidf_data)
    preds = km.fit_predict(input_svd)
    cluster_metrics = get_cluster_metrics(labels, preds)
    scores.append(cluster_metrics['score'].tolist())
# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
metrics = cluster_metrics['metric'].tolist()
svd_metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
svd_metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.094331,0.104362,0.099094,0.049817,0.095885
2,0.303752,0.34661,0.323769,0.193236,0.321323
3,0.448852,0.502422,0.474129,0.355048,0.472247
5,0.636503,0.68509,0.659903,0.572929,0.658711
10,0.633928,0.707318,0.668616,0.543747,0.667433
20,0.616071,0.693878,0.652664,0.514318,0.651419
50,0.635522,0.718284,0.674373,0.536805,0.673204
100,0.620452,0.698453,0.657146,0.517965,0.655918
300,0.715419,0.75744,0.73583,0.662944,0.734912
500,0.782276,0.799019,0.790559,0.780867,0.789844


## NMF

In [6]:
# Same sweep as for Truncated SVD, with NMF as the reduction step. Relies on
# the `n_components` list defined in the Truncated SVD cell.
scores = []
for r in n_components:
    inputs_nmf = NMF(n_components=r, init='random',
                     random_state=RANDOM_SEED).fit_transform(tfidf_data)
    preds = km.fit_predict(inputs_nmf)
    cluster_metrics = get_cluster_metrics(labels, preds)
    scores.append(cluster_metrics['score'].tolist())
# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
metrics = cluster_metrics['metric'].tolist()
nmf_metrics_df = pd.DataFrame(scores, columns=metrics, index=n_components)
nmf_metrics_df



Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.094331,0.104362,0.099094,0.049817,0.095885
2,0.247705,0.308859,0.274922,0.127401,0.272187
3,0.477529,0.493823,0.485539,0.343355,0.483772
5,0.663295,0.708868,0.685325,0.612784,0.684226
10,0.472978,0.627064,0.539229,0.319264,0.537447
20,0.254254,0.456657,0.326642,0.074724,0.323695
50,0.143036,0.427838,0.214395,0.049344,0.210228
100,0.068191,0.370604,0.115188,0.011561,0.109439
300,0.084514,0.349946,0.136147,0.015954,0.131116
500,0.032743,0.290324,0.058849,0.005426,0.052556


## UMAP

### Euclidean

In [7]:
# Sweep the UMAP embedding dimension with the euclidean metric, clustering
# each embedding with the shared K-Means estimator. Relies on the
# `n_components` list defined in the Truncated SVD cell.
scores = []
for n in n_components:
    # UMAP is stochastic; seed it so the table below is reproducible on a
    # fresh Restart & Run All.
    umapfit = UMAP(n_components=n, metric='euclidean',
                   random_state=RANDOM_SEED)
    inputs_umap = umapfit.fit_transform(tfidf_data)
    preds = km.fit_predict(inputs_umap)
    cluster_metrics = get_cluster_metrics(labels, preds)
    scores.append(cluster_metrics['score'].tolist())
# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
metrics = cluster_metrics['metric'].tolist()
# Stored under its own name so later cells do not overwrite these results.
umap_euclidean_metrics_df = pd.DataFrame(
    scores, columns=metrics, index=n_components)
umap_euclidean_metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.712346,0.711114,0.71173,0.73843,0.710757
2,0.732026,0.730112,0.731068,0.757938,0.730161
3,0.760397,0.761329,0.760863,0.794122,0.760055
5,0.7663,0.767033,0.766667,0.799045,0.765878
10,0.771429,0.771964,0.771696,0.805458,0.770925
20,0.769716,0.770296,0.770006,0.802935,0.769229
50,0.770349,0.771133,0.770741,0.80729,0.769967
100,0.772771,0.773233,0.773002,0.806672,0.772235
300,0.766811,0.767555,0.767183,0.802041,0.766396
500,0.763399,0.764254,0.763826,0.795116,0.763028


### cosine

In [8]:
# Sweep the UMAP embedding dimension with the cosine metric, clustering each
# embedding with the shared K-Means estimator. Relies on the `n_components`
# list defined in the Truncated SVD cell.
# NOTE: the DBSCAN and HDBSCAN cells read `inputs_umap`, i.e. the embedding
# left over from the final iteration of this loop.
scores = []
for n in n_components:
    # UMAP is stochastic; seed it so the table below (and the embedding the
    # density-based cells consume) is reproducible on a fresh run.
    umapfit = UMAP(n_components=n, metric='cosine',
                   random_state=RANDOM_SEED)
    inputs_umap = umapfit.fit_transform(tfidf_data)
    preds = km.fit_predict(inputs_umap)
    cluster_metrics = get_cluster_metrics(labels, preds)
    scores.append(cluster_metrics['score'].tolist())
# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
metrics = cluster_metrics['metric'].tolist()
# Stored under its own name so later cells do not overwrite these results.
umap_cosine_metrics_df = pd.DataFrame(
    scores, columns=metrics, index=n_components)
umap_cosine_metrics_df

Unnamed: 0,homogeneity_score,completeness_score,v_measure_score,adjusted_rand_score,adjusted_mutual_info_score
1,0.585735,0.608267,0.596788,0.528679,0.5954
2,0.76781,0.768735,0.768273,0.803305,0.76749
3,0.755192,0.756605,0.755898,0.789384,0.755073
5,0.759993,0.761009,0.7605,0.794182,0.759691
10,0.76171,0.762731,0.76222,0.796569,0.761417
20,0.765518,0.766405,0.765961,0.800771,0.765171
50,0.760465,0.76157,0.761017,0.794474,0.76021
100,0.760029,0.761138,0.760583,0.794369,0.759774
300,0.761019,0.762092,0.761556,0.794362,0.76075
500,0.7651,0.765959,0.76553,0.800604,0.764738


# clustering (K-means, Agglomerative, DBSCAN, HDBSCAN)

In [9]:
# 500-component SVD embedding of the TF-IDF matrix; this is the input to the
# K-Means and agglomerative clustering cells below.
svd_reducer = TruncatedSVD(n_components=500, random_state=RANDOM_SEED)
svd_train = svd_reducer.fit_transform(tfidf_data)


## K-means

In [10]:
# Re-fit the shared K-Means estimator on the SVD embedding and score the
# cluster labels it assigned to the training samples.
preds = km.fit(svd_train).labels_

get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.782276
1,completeness_score,0.799019
2,v_measure_score,0.790559
3,adjusted_rand_score,0.780867
4,adjusted_mutual_info_score,0.789844


## Agglomerative

In [11]:
# Ward-linkage agglomerative clustering on the SVD embedding, with one cluster
# per category in the lookup table.
agglo = AgglomerativeClustering(
    n_clusters=category_id_df['Category'].count(),
    linkage='ward',
)
preds = agglo.fit_predict(svd_train)

get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.567887
1,completeness_score,0.609867
2,v_measure_score,0.588129
3,adjusted_rand_score,0.512709
4,adjusted_mutual_info_score,0.586687


## DBSCAN

In [12]:
# Grid-search DBSCAN's `eps` and `min_samples` on the UMAP embedding and save
# every score to DBSCAN.xlsx.
# NOTE: `inputs_umap` is the embedding left behind by the final iteration of
# the most recently executed UMAP sweep cell; it is not recomputed here.
# NOTE(review): DBSCAN marks noise points with the label -1, which the metrics
# below treat as just another cluster - confirm that is intended.

# Round so the grid holds exact two-decimal values (0.15, 0.95, ...) instead
# of float artefacts such as 0.15000000000000002.
eps_list = [round(step * 0.05, 2) for step in range(1, 21)]
min_samples_list = list(range(5, 200, 10))

scores = []
for ep in eps_list:
    for min_samp in min_samples_list:
        preds = DBSCAN(eps=ep, min_samples=min_samp,
                       n_jobs=-1).fit_predict(inputs_umap)
        cluster_metrics = get_cluster_metrics(labels, preds)
        scores.append(cluster_metrics['score'].tolist() + [ep, min_samp])

# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
titles = cluster_metrics['metric'].tolist() + ["Epsilon", "min_samples"]
metrics_df = pd.DataFrame(scores, columns=titles)
metrics_df.to_excel('DBSCAN.xlsx')

Detailed results can be found in DBSCAN.xlsx. Here I picked the configuration with the highest score and show it below.

In [13]:
# DBSCAN with the hand-picked setting from the grid search, applied to the
# same UMAP embedding.
best_dbscan = DBSCAN(eps=0.95, min_samples=75, n_jobs=-1)
preds = best_dbscan.fit_predict(inputs_umap)

get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.616435
1,completeness_score,0.641266
2,v_measure_score,0.628606
3,adjusted_rand_score,0.56265
4,adjusted_mutual_info_score,0.627326


## HDBSCAN

In [14]:
# Grid-search HDBSCAN's `cluster_selection_epsilon` and `min_samples` (with
# min_cluster_size fixed at 100) on the UMAP embedding and save every score
# to HDBSCAN.xlsx.
# NOTE: `inputs_umap` is the embedding left behind by the final iteration of
# the most recently executed UMAP sweep cell; it is not recomputed here.

# Round so the grid holds exact two-decimal values (0.05, 0.15, ...) instead
# of float artefacts such as 0.15000000000000002.
clust_eps_list = [round(step * 0.05, 2) for step in range(1, 21)]
min_samples_list = list(range(5, 500, 10))

scores = []
for ep in clust_eps_list:
    for min_samp in min_samples_list:
        preds = HDBSCAN(
            min_cluster_size=100,
            min_samples=min_samp,
            cluster_selection_epsilon=ep,
            core_dist_n_jobs=-1,
        ).fit_predict(inputs_umap)
        cluster_metrics = get_cluster_metrics(labels, preds)
        scores.append(cluster_metrics['score'].tolist() + [ep, min_samp])

# The metric names are identical on every iteration, so read them from the
# last result instead of scoring the final clustering a second time.
titles = cluster_metrics['metric'].tolist() + ["Cluster_sel_Epsilon", "min_samples"]
metrics_df = pd.DataFrame(scores, columns=titles)
metrics_df.to_excel('HDBSCAN.xlsx')

Detailed results can be found in HDBSCAN.xlsx. Here I picked the configuration with the highest score and show it below.

In [15]:
# HDBSCAN with the hand-picked setting from the grid search, applied to the
# same UMAP embedding.
best_hdbscan = HDBSCAN(
    min_cluster_size=100,
    min_samples=5,
    cluster_selection_epsilon=0.05,
    core_dist_n_jobs=-1,
)
preds = best_hdbscan.fit_predict(inputs_umap)

get_cluster_metrics(labels, preds)

Unnamed: 0,metric,score
0,homogeneity_score,0.557144
1,completeness_score,0.791975
2,v_measure_score,0.654122
3,adjusted_rand_score,0.566095
4,adjusted_mutual_info_score,0.653065
