In [1]:
import math
import numpy as np
import sklearn as sk
import sklearn.cluster as skc
import sklearn.preprocessing as skp
import sklearn.metrics as skm
import sklearn.decomposition as skd
import sklearn.feature_extraction as skf
import scipy as sy
import pandas as pd

import seaborn as sb
import altair as alt

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Dataset 1

In [2]:
# Load the first dataset (path is relative to the notebook's working directory)
# and preview the first rows; the rendered preview shows an `id` column, a
# `class` column and feature columns named f1 ... f7000.
X_ = pd.read_csv('./Datasets/genedata.csv')
X_.head()

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,1,5,8.6822,8.3607,7.1078,11.3974,6.7685,9.7552,7.9122,8.5827,...,9.2469,8.992,7.417,10.9942,12.4621,7.4424,12.3991,4.6921,6.21,4.1068
1,2,4,10.2448,9.9164,7.1761,12.0328,7.697,8.2918,10.7112,9.5767,...,9.6918,10.8325,4.2937,10.4636,8.5862,8.3805,12.356,8.6235,7.2895,6.8542
2,3,5,9.6114,5.2214,9.4089,11.7573,8.0103,7.9034,8.5927,9.8662,...,10.3011,9.7244,5.8517,10.0763,9.9599,6.135,11.8595,5.4752,6.7775,8.0328
3,4,5,7.5504,6.5741,5.9008,11.3855,7.7801,8.5287,8.4307,7.9783,...,10.3311,10.1674,6.2599,10.9045,8.7745,7.2404,12.3557,4.0114,4.4892,7.8933
4,5,1,9.628,7.1911,8.7755,11.9096,5.9766,7.7992,8.7471,9.4115,...,10.417,10.0443,5.4938,9.8659,9.2349,7.5788,10.9341,4.8222,7.3389,7.0542


## t-SNE

In [3]:
# Ground-truth labels come from the `class` column; every column from the
# third one onwards is used as the feature matrix.
X_labels_gt = X_['class'].values
X = X_.iloc[:, 2:]

# Rescale the features with sklearn.preprocessing.normalize (default arguments).
# The original cell also kept a commented-out StandardScaler alternative.
Xu_train = skp.normalize(X)
y_train = pd.DataFrame({'y': X_labels_gt})

# Seeded, PCA-initialised 2-D t-SNE embedding of the rescaled features.
# NOTE(review): the import cell never imports `sklearn.manifold` explicitly, so
# `sk.manifold` presumably resolves only because another sklearn import loads
# it as a side effect -- confirm, or import it explicitly.
man = sk.manifold.TSNE(random_state=0, method='exact', init='pca', n_components=2)
Xu_tsne = man.fit_transform(Xu_train)

### Plot

In [4]:
# Put the two embedding coordinates next to the ground-truth label column.
embedding = pd.DataFrame(Xu_tsne, columns=["x1", "x2"])
dtsne = pd.concat([embedding, y_train], axis=1)

# Colour encodes the (ordinal) class label on the viridis scheme.
class_colour = alt.Color('y:O', scale=alt.Scale(scheme='viridis'))

# Interactive scatter plot of the embedding; the chart is the cell's output.
(
    alt.Chart(dtsne)
    .mark_circle()
    .encode(
        x='x1',
        y='x2',
        color=class_colour,
        size=alt.value(50),
        tooltip=['x1', 'x2', 'y:O'],
    )
    .properties(title="t-SNE")
    .interactive()
)

### PCA

In [5]:
# Project the rescaled features onto the first two principal components.
# The solver is pinned to the exact ("full") SVD, as the later PCA cells in
# this notebook do: with the default solver selection and no random_state the
# projection is not guaranteed to be reproducible across runs.
pca = skd.PCA(2, svd_solver="full")
Xu_pca = pca.fit_transform(Xu_train)

# Coordinates plus the ground-truth label column, ready for plotting.
dpca = pd.concat([pd.DataFrame(Xu_pca, columns=["x1", "x2"]), y_train], axis=1)

# Interactive scatter plot coloured by class; the chart is the cell's output.
alt.Chart(dpca).mark_circle().encode(
    x='x1',
    y='x2',
    color=alt.Color('y:O',
                    scale=alt.Scale(scheme='viridis')),
    size=alt.value(50),
    tooltip=['x1','x2','y:O']
).properties(title = "PCA").interactive()

## K-Means

In [6]:
def goodness_eval(X, K, labels, labels_gt=None):
  """Score a clustering with two internal metrics and one external metric.

  Parameters
  ----------
  X : data the clustering was computed on; passed to the silhouette and
      Davies-Bouldin scorers.
  K : number of clusters; only used to name the result column (``K=<K>``).
  labels : cluster assignment to evaluate, one label per row of ``X``.
  labels_gt : ground-truth labels for the NMI score, or ``None`` when no
      ground truth is available.

  Returns
  -------
  pandas.DataFrame
      A single column named ``K=<K>`` indexed by ``'Silhouette'``,
      ``'Davies-Bouldin'`` and ``'NMI'``. The NMI entry is NaN when
      ``labels_gt`` is ``None``.
  """
  # Internal metrics: they only need the data and the predicted labels.
  sil_score = skm.silhouette_score(X, labels, random_state=0)
  dav_score = skm.davies_bouldin_score(X, labels)

  # External metric: needs ground truth. kmeans_eval defaults labels_gt to None,
  # so report NaN in that case instead of failing inside the NMI scorer.
  if labels_gt is None:
    nmi_score = np.nan
  else:
    nmi_score = skm.normalized_mutual_info_score(labels_gt, labels,
                                                 average_method='geometric')

  return pd.DataFrame([sil_score, dav_score, nmi_score],
                      index=['Silhouette', 'Davies-Bouldin', 'NMI'],
                      columns=[f"K={K}"])
  
def kmeans_eval(X, K, labels_gt=None, n_init=10):
  """Fit K-Means on ``X`` and score the resulting clustering.

  Parameters
  ----------
  X : data to cluster.
  K : number of clusters.
  labels_gt : ground-truth labels, forwarded unchanged to ``goodness_eval``.
  n_init : number of K-Means initialisations. Passed explicitly because the
      library default changed from 10 to ``'auto'`` in scikit-learn 1.4;
      pinning it keeps the scores comparable across library versions.

  Returns
  -------
  tuple
      ``(km, G)``: the fitted ``KMeans`` estimator and the score table
      returned by ``goodness_eval``.
  """
  km = skc.KMeans(n_clusters=K, n_init=n_init, random_state=0).fit(X)

  G = goodness_eval(X, K, km.labels_, labels_gt)

  return km, G

In [7]:
# Cluster the 2-D t-SNE embedding (`Xu_tsne`) computed in the t-SNE cell above.
# The original cell re-ran the exact t-SNE with the same input, parameters and
# seed, which only repeats an expensive computation to obtain the same array.
# Guard against stale kernel state (e.g. an embedding left over from another
# dataset) before scoring against the ground truth.
assert len(Xu_tsne) == len(X_labels_gt), "Xu_tsne does not match the current dataset"

km, G = kmeans_eval(Xu_tsne, 5, X_labels_gt)
G

Unnamed: 0,K=5
Silhouette,0.738015
Davies-Bouldin,0.328265
NMI,0.9848


## Agglomerative Clustering

In [8]:
# Complete-linkage agglomerative clustering of the t-SNE embedding into 5 groups.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# Euclidean distance is the default, so omitting the argument keeps the same
# behaviour on old and new versions alike.
agc = skc.AgglomerativeClustering(n_clusters=5, linkage="complete")
agc_labels = agc.fit_predict(Xu_tsne)

# Score the clustering against the ground truth; the table is the cell's output.
goodness_eval(Xu_tsne, 5, agc_labels, X_labels_gt)

Unnamed: 0,K=5
Silhouette,0.739308
Davies-Bouldin,0.325233
NMI,0.990635


In [9]:
# Write the agglomerative cluster labels to disk, one label per line
# (lines are newline-joined, so the file has no trailing newline).
label_lines = map(str, agc_labels.tolist())
with open("solution1.txt", "w") as f:
    f.write("\n".join(label_lines))

# Dataset 2

## Pre-processing

In [10]:
# Load the second dataset (path is relative to the notebook's working directory)
# and preview the first rows; the rendered preview shows an `id` column, a
# `class` column and feature columns named f1 ... f5000.
# Note: this rebinds `X_`, which previously held the first dataset.
X_ = pd.read_csv('./Datasets/msdata.csv')
X_.head()

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f4991,f4992,f4993,f4994,f4995,f4996,f4997,f4998,f4999,f5000
0,1,1,13.8539,15.7167,14.8533,13.9316,13.0663,13.2494,17.1384,13.2184,...,14.6045,14.0689,15.1286,13.6916,13.4927,12.6904,10.8834,15.3469,21.2647,15.7776
1,2,1,14.628,15.728,15.928,13.9575,13.1097,13.7043,16.8954,14.3559,...,16.5467,14.2894,15.7565,12.5565,13.7318,15.3065,11.8305,16.3588,21.6378,16.1079
2,3,1,14.2135,16.469,15.6417,14.2476,12.6421,13.2018,16.4398,13.6928,...,15.8893,14.0552,15.679,12.9687,13.5958,12.7647,11.8214,16.2765,22.1998,16.2995
3,4,1,13.5922,15.5715,15.3669,13.5531,13.1564,13.2907,17.5997,7.9448,...,14.3924,14.946,14.6135,13.1794,15.8272,13.7444,10.2937,15.0915,21.1928,15.8555
4,5,1,13.9646,14.1683,16.2035,13.4925,12.7097,13.0803,16.6297,11.6088,...,15.5511,13.9978,15.34,12.9142,13.751,15.0571,10.2703,16.1585,22.1276,15.8248


In [11]:
# Ground-truth labels come from the `class` column; every column from the
# third one onwards is used as the feature matrix.
X_labels_gt = X_['class'].values
X = X_.iloc[:, 2:]

# All-zero array with the same shape and dtype as the ground-truth labels.
# NOTE(review): not referenced by any other cell visible here -- confirm it is
# still needed.
X_labels = np.zeros_like(X_labels_gt)

# Rescale the features with sklearn.preprocessing.normalize (default arguments).
Xu_train = skp.normalize(X)
y_train = pd.DataFrame({'y': X_labels_gt})

# Preview the rescaled feature matrix.
pd.DataFrame(Xu_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.01297,0.014714,0.013906,0.013043,0.012233,0.012404,0.016045,0.012375,0.012966,0.023112,...,0.013673,0.013172,0.014164,0.012818,0.012632,0.011881,0.010189,0.014368,0.019908,0.014771
1,0.013104,0.01409,0.014269,0.012504,0.011744,0.012277,0.015135,0.01286,0.012395,0.021547,...,0.014823,0.012801,0.014115,0.011248,0.012301,0.013712,0.010598,0.014655,0.019384,0.01443
2,0.012992,0.015053,0.014297,0.013023,0.011555,0.012067,0.015027,0.012516,0.012776,0.021985,...,0.014523,0.012847,0.014331,0.011854,0.012427,0.011667,0.010805,0.014877,0.020291,0.014898
3,0.01264,0.014481,0.01429,0.012604,0.012235,0.01236,0.016367,0.007388,0.012623,0.02338,...,0.013384,0.013899,0.01359,0.012256,0.014718,0.012782,0.009573,0.014034,0.019708,0.014745
4,0.012696,0.012881,0.014732,0.012267,0.011555,0.011892,0.015119,0.010554,0.013384,0.021647,...,0.014139,0.012726,0.013947,0.011741,0.012502,0.01369,0.009337,0.014691,0.020118,0.014387


## EDA

### Plot

**t-SNE**

In [12]:
# Seeded, PCA-initialised 2-D t-SNE embedding of the rescaled features.
tsne = sk.manifold.TSNE(random_state=0, method='exact', init='pca', n_components=2)
Xu_tsne = tsne.fit_transform(Xu_train)

# Put the two embedding coordinates next to the ground-truth label column.
embedding = pd.DataFrame(Xu_tsne, columns=["x1", "x2"])
dtsne = pd.concat([embedding, y_train], axis=1)

# Colour encodes the (ordinal) class label on the viridis scheme.
class_colour = alt.Color('y:O', scale=alt.Scale(scheme='viridis'))

# Interactive scatter plot of the embedding; the chart is the cell's output.
(
    alt.Chart(dtsne)
    .mark_circle()
    .encode(
        x='x1',
        y='x2',
        color=class_colour,
        size=alt.value(50),
        tooltip=['x1', 'x2', 'y:O'],
    )
    .properties(title="t-SNE")
    .interactive()
)

**PCA**

In [13]:
# Project the rescaled features onto the first two principal components.
# The solver is pinned to the exact ("full") SVD so that `dpca` holds the same
# projection the clustering cells below compute (they also use
# svd_solver="full") and so the result is reproducible across runs.
pca = skd.PCA(2, svd_solver="full")
Xu_pca = pca.fit_transform(Xu_train)

# Coordinates plus the ground-truth label column, ready for plotting.
dpca = pd.concat([pd.DataFrame(Xu_pca, columns=["x1", "x2"]), y_train], axis=1)

# Interactive scatter plot coloured by class; the chart is the cell's output.
alt.Chart(dpca).mark_circle().encode(
    x='x1',
    y='x2',
    color=alt.Color('y:O',
                    scale=alt.Scale(scheme='viridis')),
    size=alt.value(50),
    tooltip=['x1','x2','y:O']
).properties(title = "PCA").interactive()

## Clustering

### PCA(2) + Agglomerative Clustering (ward linkage)

In [14]:
# Reduce to two principal components with the exact SVD solver.
pca = skd.PCA(2, svd_solver="full")
Xu_pca = pca.fit_transform(Xu_train)

# Ward-linkage agglomerative clustering of the projection into 3 groups.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# Euclidean distance is the default (and the only metric ward linkage accepts),
# so omitting the argument keeps the same behaviour on old and new versions.
agc = skc.AgglomerativeClustering(n_clusters=3, linkage="ward")
agc_labels = agc.fit_predict(Xu_pca)

# Score the clustering against the ground truth; the table is the cell's output.
goodness_eval(Xu_pca, 3, agc_labels, X_labels_gt)

Unnamed: 0,K=3
Silhouette,0.624004
Davies-Bouldin,0.501754
NMI,0.950785


In [15]:
# Write the agglomerative cluster labels to disk, one label per line
# (lines are newline-joined, so the file has no trailing newline).
label_lines = map(str, agc_labels.tolist())
with open("solution2.txt", "w") as f:
    f.write("\n".join(label_lines))

In [16]:
# Express the ground truth in cluster-id space with the fixed relabelling
# cluster = 3 - class.
# NOTE(review): this mapping is hard-coded; confirm it matches the ids that the
# agglomerative clustering actually assigned.
X_labels_gt_df = pd.DataFrame((3 - X_labels_gt), columns=["y"])

# Row positions where the predicted cluster differs from the relabelled truth.
agc_pca_labels = pd.DataFrame(agc_labels, columns=["y"])
agc_pca_misses = np.flatnonzero((agc_pca_labels != X_labels_gt_df).values)
# Optional: flag the misses in the plot with a sentinel label.
# agc_pca_labels.loc[agc_pca_misses, "y"] = -1

# Plot the same projection the clustering was fitted on (`Xu_pca`), rather than
# the coordinates of the separate PCA fit stored in `dpca`.
agc_pca_plot = pd.DataFrame(Xu_pca, columns=["x1", "x2"])
agc_pca_plot["y"] = agc_pca_labels["y"].values

# Title fixed to PCA(2): the projection above has two components, not three.
alt.Chart(agc_pca_plot).mark_circle().encode(
    x='x1',
    y='x2',
    color=alt.Color('y:O',
                    scale=alt.Scale(scheme='viridis')),
    size=alt.value(50),
    tooltip=['x1','x2','y:O']
).properties(title = "PCA(2) + Agglomerative(3)").interactive()

### PCA(2) + K-Means(3)

In [17]:
# Reduce to two principal components with the exact SVD solver.
pca = skd.PCA(2, svd_solver="full")
Xu_pca = pca.fit_transform(Xu_train)

# Fit and score K-Means (K=3) through the shared helper instead of repeating
# its body here; it builds KMeans(n_clusters=K, random_state=0) and calls
# goodness_eval exactly as this cell used to.
km, G = kmeans_eval(Xu_pca, 3, X_labels_gt)
km_labels = km.labels_

G

Unnamed: 0,K=3
Silhouette,0.625858
Davies-Bouldin,0.498606
NMI,0.922516


In [18]:
# Express the ground truth in k-means cluster-id space: each class is mapped to
# the cluster that contains most of its samples (majority vote over the
# class x cluster contingency table). The original negated the labels, which
# can never equal a non-negative cluster id, so every sample counted as a miss.
# Note: a majority vote is not guaranteed to be one-to-one if two classes are
# dominated by the same cluster.
class_to_cluster = pd.crosstab(X_labels_gt, km_labels).idxmax(axis=1)
X_labels_gt_df = pd.DataFrame(
    pd.Series(X_labels_gt).map(class_to_cluster).values, columns=["y"])

# Row positions where the predicted cluster differs from the relabelled truth.
km_pca_labels = pd.DataFrame(km_labels, columns=["y"])
km_pca_misses = np.flatnonzero((km_pca_labels != X_labels_gt_df).values)
# Optional: flag the misses in the plot with a sentinel label.
# km_pca_labels.loc[km_pca_misses, "y"] = -1

# Plot the same projection the clustering was fitted on (`Xu_pca`), rather than
# the coordinates of the separate PCA fit stored in `dpca`.
km_pca_plot = pd.DataFrame(Xu_pca, columns=["x1", "x2"])
km_pca_plot["y"] = km_pca_labels["y"].values

# Title fixed to PCA(2): the projection above has two components, not three.
alt.Chart(km_pca_plot).mark_circle().encode(
    x='x1',
    y='x2',
    color=alt.Color('y:O',
                    scale=alt.Scale(scheme='viridis')),
    size=alt.value(50),
    tooltip=['x1','x2','y:O']
).properties(title = "PCA(2) + K-Means(3)").interactive()