In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

### Problem definition
Perform PCA on data set to reduce dimensionality, and perform K-Means Clustering to see if the clustering segregates the data into CA diagnosed and non diagnosed.

# Principal Component Analysis

In [2]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, MaxAbsScaler, RobustScaler
from sklearn import metrics

from sklearn.cluster import KMeans

In [3]:
#final_df = get_lapp(red_flags+medical_history)
#final_df['PHDIAGCD'] = final_df['PHDIAGCD'].fillna(0) 

In [4]:
final_df = get_lapp(red_flags+medical_history)
#final_df['PHDIAGCD'] = final_df['PHDIAGCD'].fillna(0) 
final_df = final_df[final_df['PHDIAGCD'].notna()]

In [5]:
final_df['K/L Ratio'] = final_df['HEKAPRE'] / final_df['HELAMRE']
final_df = final_df.drop(columns=['HEKAPRE', 'HELAMRE'])

In [6]:
# One-hot encode
final_df = pd.get_dummies(final_df, columns=ohe_data)

In [7]:
# Standardize
final_df[standardized_data] = QuantileTransformer(n_quantiles='None', output_distribution='uniform', random_state=0).fit_transform(final_df[standardized_data])
#final_df[standardized_data] = RobustScaler().fit_transform(final_df[standardized_data])

TypeError: '<=' not supported between instances of 'NoneType' and 'int'

In [None]:
final_df = drop_missing_cols(final_df, 30)

In [None]:
plot_missing_percentages(final_df)

In [None]:
male, female = get_male_female(final_df)

In [None]:
colors = {1:'red', 97:'green', 2:'blue'}

## PCA on Male data set

In [None]:
def pca_pipe(dataset):
    pca = PCA(30)
    X = dataset
    
    # Impute NaN value data
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = IterativeImputer(n_nearest_features=10, max_iter=50, initial_strategy='median', random_state=0)
    #imp = KNNImputer(n_neighbors=5, weights="distance")
    imp.fit(X)
    X_new = imp.transform(X)
    
    # Project PCA
    X_proj = pca.fit_transform(X_new)
    t_sne = TSNE(n_components=2, init='pca', perplexity=50, n_iter=5000, n_iter_without_progress=1000)
    X_proj = t_sne.fit_transform(X_proj)
    return X_proj

def tts(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size)
    return X_train, X_test, y_train, y_test

In [None]:
male_X = male.drop(columns=['PHDIAGCD', 'DMSEX', 'PHMH1', 'PHMH3','PHMH4','PHMH5','PHMH10','PHMH11','PHMH12','PHMH13','PHMH14','PHMH15','PHMH16','PHMH17','PHMH18'])
male_y = male['PHDIAGCD']
X_proj = pca_pipe(male_X)

In [None]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_proj)
pred = kmeans.predict(X_proj)
centroids = kmeans.cluster_centers_
plt.figure(figsize=(18, 7))

plt.subplot(1, 2, 1)
plt.scatter(X_proj[:,0], X_proj[:,1], c=male_y)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.subplot(1, 2, 2)
plt.scatter(X_proj[:,0], X_proj[:,1], c=pred)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red') 
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

#plt.savefig('result_pca_tsne_kmeans_male')
plt.show()

## PCA on Female data set

In [None]:
female_X = female.drop(columns=['PHDIAGCD', 'DMSEX', 'PHMH1', 'PHMH3','PHMH4','PHMH5','PHMH10','PHMH11','PHMH12','PHMH13','PHMH14','PHMH15','PHMH16','PHMH17','PHMH18'])
female_y = female['PHDIAGCD']
X_proj = pca_pipe(female_X)

In [None]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_proj)
pred = kmeans.predict(X_proj)
centroids = kmeans.cluster_centers_
plt.figure(figsize=(18, 7))

plt.subplot(1, 2, 1)
plt.scatter(X_proj[:,0], X_proj[:,1], c=female_y)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.subplot(1, 2, 2)
plt.scatter(X_proj[:,0], X_proj[:,1], c=pred)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red') 
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

#plt.savefig('result_pca_tsne_kmeans_male')
plt.show()

## PCA on both gender

In [None]:
X = final_df.drop(columns=['PHDIAGCD', 'DMSEX', 'PHMH1', 'PHMH3','PHMH4','PHMH5','PHMH10','PHMH11','PHMH12','PHMH13','PHMH14','PHMH15','PHMH16','PHMH17','PHMH18'])
y = final_df['PHDIAGCD']
X_proj = pca_pipe(X)

In [None]:
plt.scatter(X_proj[:,0], X_proj[:,1], c=y.map(colors).fillna('yellow'))
plt.show()

# K-Means Clustering

In [None]:
X = final_df.drop(columns=['PHDIAGCD', 'DMSEX', 'PHMH1', 'PHMH3','PHMH4','PHMH5','PHMH10','PHMH11','PHMH12','PHMH13','PHMH14','PHMH15','PHMH16','PHMH17','PHMH18'])
y = final_df['PHDIAGCD']
X_train, X_test, y_train, y_test = tts(X, y, 0.3)

In [None]:
Xt_proj = pca_pipe(X_train)
#Xtest_proj = pca_pipe(X_test)

### How to determine K value?
By using Elbow method, which is a heuristic used in determining the number of clusters in a data set.

In [None]:
inertia_list = []
for num_clusters in np.arange(1, 21):
    km = KMeans(n_clusters=num_clusters)
    km.fit(Xt_proj)
    inertia_list.append(km.inertia_)

In [None]:
#Plotting the Elbow Curve
plt.figure(figsize=(10, 5))
plt.plot(np.arange(1, 21), inertia_list)
plt.grid(True)
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('Elbow Curve')
plt.show()

From above, we select the optimum value of k by determining the Elbow Point - a point after which the inertia starts decreasing linearly. In this case, we can select the value of k as 5.

### Main

In [None]:
X_pca = pca_pipe(X)
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_pca)
pred = kmeans.predict(X_pca)

In [None]:
plt.figure(figsize=(18, 7))

plt.subplot(1, 2, 1)
plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.subplot(1, 2, 2)
plt.scatter(X_pca[:,0], X_pca[:,1], c=pred)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red') 
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.savefig('result_pca_tsne_kmeans')
plt.show()

In [None]:
plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'red')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.show()

In [None]:
print(metrics.silhouette_score(X_pca, pred, metric='euclidean'))
print(compute_jaccard(y, pred))
cont = metrics.cluster.contingency_matrix(y.replace({1:1, 2:1, 97: 0}), pred)
print(np.sum(np.amax(cont, axis=0)) / np.sum(cont))

### Test

### ignore

# Hierarchical Clustering
https://www.analyticsvidhya.com/blog/2019/05/beginners-guide-hierarchical-clustering/

In [None]:
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

In [None]:
X = final_df.drop(columns=['PHDIAGCD', 'DMSEX', 'PHMH1', 'PHMH3','PHMH4','PHMH5','PHMH10','PHMH11','PHMH12','PHMH13','PHMH14','PHMH15','PHMH16','PHMH17','PHMH18'])
y = final_df['PHDIAGCD']
X_train, X_test, y_train, y_test = tts(X, y, 0.3)

X_proj = pca_pipe(X) 
Xt_proj = pca_pipe(X_train)
#Xtest_proj = pca_pipe(X_test)

In [None]:
plt.figure(figsize=(10, 7))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(X_proj, method='ward'))

In [None]:
plt.figure(figsize=(10, 7))  
plt.scatter(X_proj[:,0], X_proj[:,1], c=y) 

# Agglomerative Hierarchical Clustering

In [None]:
cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')  
preds = cluster.fit_predict(X_pca)

#plt.figure(figsize=(10, 7))
#plt.scatter(X_pca[:,0], X_pca[:,1], c=preds)

plt.figure(figsize=(18, 7))

plt.subplot(1, 2, 1)
plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.subplot(1, 2, 2)
plt.scatter(X_pca[:,0], X_pca[:,1], c=preds)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

plt.savefig('result_pca_tsne_agglo')
plt.show()

In [None]:
plt.figure(figsize=(10, 7))  
plt.scatter(X_pca[:,0], X_pca[:,1], c=y)

In [None]:
print(metrics.silhouette_score(X_pca, preds, metric='euclidean'))
print(compute_jaccard(y, preds))