In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
from sklearn.cluster import KMeans 
from kmodes.kmodes import KModes
from sklearn.preprocessing import StandardScaler, normalize 
from sklearn.metrics import silhouette_score 
from sklearn.metrics import silhouette_samples
from sklearn.decomposition import PCA
import collections
import scipy.cluster.hierarchy as hier
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.express as px
from plotly.subplots import make_subplots
from plotly import tools
import plotly.figure_factory as ff
from sklearn_extra.cluster import KMedoids
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three per-condition datasets and the combined cleaned dataset.
# Paths are relative to the notebook's working directory.
df_dep, df_anx, df_str, df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/Depression.csv",
        "../Datasets/Anxiety.csv",
        "../Datasets/Stress.csv",
        "../Datasets/CleanData.csv",
    )
)

In [None]:
# Draw a random subsample of 2500 rows from each dataset.
# A fixed random_state makes the subsample, and every clustering result
# derived from it, reproducible across Restart & Run All.
SAMPLE_SIZE = 2500
SAMPLE_SEED = 1
df_dep_samp = df_dep.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_anx_samp = df_anx.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_str_samp = df_str.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [None]:
# Feature matrices: the first seven columns of each sampled frame.
X_dep, X_anx, X_str = (
    sampled.iloc[:, :7]
    for sampled in (df_dep_samp, df_anx_samp, df_str_samp)
)

### Each dataset has 7 features, which is hard to visualize, so we reduce the dimensionality to 3 components using PCA

In [None]:
# Project the depression features onto their first three principal components.
pca = PCA(n_components=3, random_state=1)
X_pca_dep = pca.fit_transform(X_dep)
# Cumulative explained variance of ALL retained components (last entry of the
# running sum). Index 1 would only cover the first two of the three components
# that are kept and plotted.
pca.explained_variance_ratio_.cumsum()[-1]

In [None]:
# Project the anxiety features onto their first three principal components.
pca = PCA(n_components=3, random_state=1)
X_pca_anx = pca.fit_transform(X_anx)
# Cumulative explained variance of ALL retained components (last entry of the
# running sum). Index 1 would only cover the first two of the three components
# that are kept and plotted.
pca.explained_variance_ratio_.cumsum()[-1]

In [None]:
# Project the stress features onto their first three principal components.
pca = PCA(n_components=3, random_state=1)
X_pca_str = pca.fit_transform(X_str)
# Cumulative explained variance of ALL retained components (last entry of the
# running sum). Index 1 would only cover the first two of the three components
# that are kept and plotted.
pca.explained_variance_ratio_.cumsum()[-1]

# Kmeans

### Elbow Method

In [None]:
# Elbow method for K-Means: record the inertia (within-cluster sum of squares)
# for k = 1..clusters on each of the three feature matrices.
clusters = 7
wcss_dep, wcss_anx, wcss_str = [], [], []
for i in range(1, clusters + 1):
    # Same fit order as before: depression, anxiety, stress.
    for features, wcss in ((X_dep, wcss_dep), (X_anx, wcss_anx), (X_str, wcss_str)):
        kmeans = KMeans(
            n_clusters=i,
            init='k-means++',
            max_iter=300,
            n_init=10,
            random_state=66,
        )
        kmeans.fit(features)
        wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: K-Means inertia against the number of clusters, one line per dataset.
k_values = list(range(1, clusters + 1))
fig = go.Figure()
for trace_name, inertias in (
    ('Depression Dataset', wcss_dep),
    ('Anxiety Dataset', wcss_anx),
    ('Stress Dataset', wcss_str),
):
    fig.add_trace(
        go.Scatter(x=k_values, y=inertias, mode='lines+markers', name=trace_name)
    )

fig.show()

### Silhouette Method

In [None]:
# Silhouette analysis of K-Means on the depression features for k = 2..clusters.
# Each iteration prints the mean silhouette score and draws one figure with a
# horizontal band of sorted per-sample silhouette values for every cluster.
for cluster in range(2, clusters + 1):
    km = KMeans(n_clusters=cluster, random_state=1)
    y = km.fit_predict(X_dep)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax1.set_xlim([-0.1, 1])
    # Room for all samples plus a 10-row gap around each cluster band.
    ax1.set_ylim([0, len(X_dep) + (cluster + 1) * 10])

    score = silhouette_score(X_dep, y)
    print('score :', score)

    sil_sample = silhouette_samples(X_dep, y)
    y_lower = 10
    for i in range(cluster):
        band = np.sort(sil_sample[y == i])
        y_upper = y_lower + band.shape[0]
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, band)
        # Start the next band 10 rows above this one.
        y_lower = y_upper + 10

score : 0.39232409588781264
score : 0.27720286655299414
score : 0.22110603157855677
score : 0.2136019593398565
score : 0.21452871640188279
score : 0.1970685583541331

## Kmeans visualization

In [None]:
# Fit a 3-cluster K-Means on the depression features and report cluster sizes.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=1,
)
ymeans = kmeans.fit_predict(X_dep)
print("Depression", collections.Counter(ymeans))

In [None]:
# 3-D scatter of the depression sample in PCA space, coloured by K-Means label.
x, y, z = X_pca_dep.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymeans, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Fit a 3-cluster K-Means on the anxiety features and report cluster sizes.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=1,
)
ymeans = kmeans.fit_predict(X_anx)
print("Anxiety", collections.Counter(ymeans))

In [None]:
# 3-D scatter of the anxiety sample in PCA space, coloured by K-Means label.
x, y, z = X_pca_anx.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymeans, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Fit a 3-cluster K-Means on the stress features and report cluster sizes.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=1,
)
ymeans = kmeans.fit_predict(X_str)
print("Stress", collections.Counter(ymeans))

In [None]:
# 3-D scatter of the stress sample in PCA space, coloured by K-Means label.
x, y, z = X_pca_str.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymeans, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

# K-Medoids<a href="#Kmediods" class="anchor-link">¶</a>

### Elbow method<a href="#Elbow-method" class="anchor-link">¶</a>

In [None]:
# Elbow method for K-Medoids: record the inertia for k = 1..clusters on each
# of the three feature matrices.
clusters = 7
wcss_dep, wcss_anx, wcss_str = [], [], []
for i in range(1, clusters + 1):
    # Same fit order as before: depression, anxiety, stress.
    for features, wcss in ((X_dep, wcss_dep), (X_anx, wcss_anx), (X_str, wcss_str)):
        kmed = KMedoids(metric="euclidean", n_clusters=i, init="heuristic", max_iter=7)
        kmed.fit_predict(features)
        wcss.append(kmed.inertia_)

In [None]:
# Elbow plot: K-Medoids inertia against the number of clusters, one line per dataset.
k_values = list(range(1, clusters + 1))
fig = go.Figure()
for trace_name, inertias in (
    ('Depression Dataset', wcss_dep),
    ('Anxiety Dataset', wcss_anx),
    ('Stress Dataset', wcss_str),
):
    fig.add_trace(
        go.Scatter(x=k_values, y=inertias, mode='lines+markers', name=trace_name)
    )

fig.show()

### Silhouette Method<a href="#Silhouette-Method" class="anchor-link">¶</a>

In [None]:
# Silhouette analysis of K-Medoids on the depression features for k = 2..clusters.
# The labels scored and plotted here must come from the K-Medoids model itself;
# previously the K-Medoids labels were discarded and K-Means labels were scored
# instead, which reproduced the K-Means silhouette results verbatim.
for cluster in range(2, clusters + 1):
    kmed = KMedoids(metric="euclidean", n_clusters=cluster, init="heuristic", max_iter=7)
    y = kmed.fit_predict(X_dep)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax1.set_xlim([-0.1, 1])
    # Room for all samples plus a 10-row gap around each cluster band.
    ax1.set_ylim([0, len(X_dep) + (cluster + 1) * 10])

    score = silhouette_score(X_dep, y)
    print('score :', score)

    sil_sample = silhouette_samples(X_dep, y)
    y_lower = 10
    for i in range(cluster):
        # Sorted per-sample silhouette values of cluster i form one band.
        i_sil_sample = np.sort(sil_sample[y == i])
        y_upper = y_lower + i_sil_sample.shape[0]
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, i_sil_sample)
        # Start the next band 10 rows above this one.
        y_lower = y_upper + 10

score : 0.39232409588781264
score : 0.27720286655299414
score : 0.22110603157855677
score : 0.2136019593398565
score : 0.21452871640188279
score : 0.1970685583541331

### K-Medoids visualization<a href="#Kmediods-visualization" class="anchor-link">¶</a>

In [None]:
# Cluster the depression features into three groups with K-Medoids (Manhattan distance)
# and report cluster sizes.
# 'k-medoids++' initialisation is randomised, so random_state is pinned to make
# the cluster assignment reproducible between runs.
kmed = KMedoids(
    metric="manhattan",
    n_clusters=3,
    init='k-medoids++',
    max_iter=7,
    random_state=1,
)
ymed = kmed.fit_predict(X_dep)
print("Depression", collections.Counter(ymed))

Depression Counter({0: 1147, 1: 678, 2: 675})

In [None]:
# 3-D scatter of the depression sample in PCA space, coloured by K-Medoids label.
x, y, z = X_pca_dep.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymed, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Cluster the anxiety features into three groups with K-Medoids (Manhattan distance)
# and report cluster sizes.
# 'k-medoids++' initialisation is randomised, so random_state is pinned to make
# the cluster assignment reproducible between runs.
kmed = KMedoids(
    metric="manhattan",
    n_clusters=3,
    init='k-medoids++',
    max_iter=7,
    random_state=1,
)
ymed = kmed.fit_predict(X_anx)
print("Anxiety", collections.Counter(ymed))

Anxiety Counter({1: 1041, 0: 878, 2: 581})

In [None]:
# 3-D scatter of the anxiety sample in PCA space, coloured by K-Medoids label.
x, y, z = X_pca_anx.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymed, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Cluster the stress features into three groups with K-Medoids (Manhattan distance)
# and report cluster sizes.
# 'k-medoids++' initialisation is randomised, so random_state is pinned to make
# the cluster assignment reproducible between runs.
kmed = KMedoids(
    metric="manhattan",
    n_clusters=3,
    init='k-medoids++',
    max_iter=7,
    random_state=1,
)
ymed = kmed.fit_predict(X_str)
print("Stress", collections.Counter(ymed))

stress_Counter({0: 1570, 1: 606, 2: 324})

# Kmodes<a href="#Kmodes" class="anchor-link">¶</a>

### Elbow method<a href="#Elbow-method" class="anchor-link">¶</a>

# Hierarchical Clustering<a href="#Hierarchical-Clustering" class="anchor-link">¶</a>

### Dendrogram to find the ideal number of clusters<a href="#dendogram-to-find-out-ideal-no.-of-clusters" class="anchor-link">¶</a>

In [None]:
# Ward-linkage dendrogram of the PCA-reduced depression sample.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
ward_links = sch.linkage(X_pca_dep, method='ward')
Dendrogram = sch.dendrogram(ward_links)

In [None]:
# Ward-linkage dendrogram of the PCA-reduced anxiety sample.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
ward_links = sch.linkage(X_pca_anx, method='ward')
Dendrogram = sch.dendrogram(ward_links)

In [None]:
# Ward-linkage dendrogram of the PCA-reduced stress sample.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
ward_links = sch.linkage(X_pca_str, method='ward')
Dendrogram = sch.dendrogram(ward_links)

In [None]:
# Agglomerative clustering (3 clusters) on the depression features; report cluster sizes.
# Note: this fits on the full feature matrix, whereas the dendrogram uses the PCA projection.
agg = AgglomerativeClustering(n_clusters=3)
yagg = agg.fit_predict(X_dep)
cluster_sizes = collections.Counter(yagg)
print("Depression", cluster_sizes)

Depression Counter({0: 1019, 2: 848, 1: 633})

In [None]:
# 3-D scatter of the depression sample in PCA space, coloured by agglomerative label.
x, y, z = X_pca_dep.T
fig = px.scatter_3d(x=x, y=y, z=z, color=yagg, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Agglomerative clustering (3 clusters) on the anxiety features; report cluster sizes.
# Note: this fits on the full feature matrix, whereas the dendrogram uses the PCA projection.
agg = AgglomerativeClustering(n_clusters=3)
yagg = agg.fit_predict(X_anx)
cluster_sizes = collections.Counter(yagg)
print("Anxiety", cluster_sizes)

Anxiety Counter({0: 1044, 2: 902, 1: 554})

In [None]:
# 3-D scatter of the anxiety sample in PCA space, coloured by agglomerative label.
x, y, z = X_pca_anx.T
fig = px.scatter_3d(x=x, y=y, z=z, color=yagg, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Agglomerative clustering (3 clusters) on the stress features; report cluster sizes.
# Note: this fits on the full feature matrix, whereas the dendrogram uses the PCA projection.
agg = AgglomerativeClustering(n_clusters=3)
yagg = agg.fit_predict(X_str)
cluster_sizes = collections.Counter(yagg)
print("Stress", cluster_sizes)

Stress Counter({0: 1034, 2: 772, 1: 694})

In [None]:
# 3-D scatter of the stress sample in PCA space, coloured by agglomerative label.
x, y, z = X_pca_str.T
fig = px.scatter_3d(x=x, y=y, z=z, color=yagg, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Elbow method for K-Modes: record the clustering cost for k = 2..clusters
# on each of the three feature matrices (note the range starts at 2, not 1).
clusters = 7
cost_dep, cost_anx, cost_str = [], [], []
for cluster in range(2, clusters + 1):
    # Same fit order as before: depression, anxiety, stress.
    for features, costs in ((X_dep, cost_dep), (X_anx, cost_anx), (X_str, cost_str)):
        km2 = KModes(n_clusters=cluster, init='Cao', random_state=1, n_jobs=-1)
        km2.fit_predict(features)
        costs.append(km2.cost_)

In [None]:
# Elbow plot: K-Modes cost against the number of clusters, one line per dataset.
# The cost lists are filled for a range of k that ends at `clusters` but starts
# at 2, so plotting them against 1..clusters mislabels every point by one.
# The starting k is derived from each list's length so that x and y always align.
fig = go.Figure()
for trace_name, costs in (
    ('Depression Dataset', cost_dep),
    ('Anxiety Dataset', cost_anx),
    ('Stress Dataset', cost_str),
):
    k_values = list(range(clusters + 1 - len(costs), clusters + 1))
    fig.add_trace(
        go.Scatter(x=k_values, y=costs, mode='lines+markers', name=trace_name)
    )

fig.show()

In [None]:
# Silhouette analysis of K-Modes on the depression features for k = 2..clusters.
# Each iteration prints the mean silhouette score and draws one figure with a
# horizontal band of sorted per-sample silhouette values for every cluster.
# NOTE(review): the silhouette functions are called with their default distance
# metric while K-Modes clusters by category mismatches; confirm this is intended.
clusters = 7
for cluster in range(2, clusters + 1):
    km2 = KModes(n_clusters=cluster, init='Cao', random_state=3, n_jobs=-1)
    y = km2.fit_predict(X_dep)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax1.set_xlim([-0.1, 1])
    # Room for all samples plus a 10-row gap around each cluster band.
    ax1.set_ylim([0, len(X_dep) + (cluster + 1) * 10])

    score = silhouette_score(X_dep, y)
    print('score :', score)

    sil_sample = silhouette_samples(X_dep, y)
    y_lower = 10
    for i in range(cluster):
        band = np.sort(sil_sample[y == i])
        y_upper = y_lower + band.shape[0]
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, band)
        # Start the next band 10 rows above this one.
        y_lower = y_upper + 10

score : 0.26597458474860103
score : 0.2236947342213366
score : 0.14033351564375293
score : 0.14310833113922744
score : 0.10851150188863369
score : 0.10280108604364291

### Kmodes visualization<a href="#Kmodes-visualization" class="anchor-link">¶</a>

In [None]:
# Fit a 3-cluster K-Modes (Cao initialisation) on the depression features and report cluster sizes.
kmod = KModes(n_clusters=3, init='Cao', random_state=1, n_jobs=-1)
ymod = kmod.fit_predict(X_dep)
cluster_sizes = collections.Counter(ymod)
print("Depression", cluster_sizes)

Depression Counter({0: 1098, 2: 798, 1: 604})

In [None]:
# 3-D scatter of the depression sample in PCA space, coloured by K-Modes label.
x, y, z = X_pca_dep.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymod, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Fit a 3-cluster K-Modes (Cao initialisation) on the anxiety features and report cluster sizes.
kmod = KModes(n_clusters=3, init='Cao', random_state=1, n_jobs=-1)
ymod = kmod.fit_predict(X_anx)
cluster_sizes = collections.Counter(ymod)
print("Anxiety", cluster_sizes)

Anxiety Counter({0: 1241, 1: 923, 2: 336})

In [None]:
# 3-D scatter of the anxiety sample in PCA space, coloured by K-Modes label.
x, y, z = X_pca_anx.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymod, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# Fit a 3-cluster K-Modes (Cao initialisation) on the stress features and report cluster sizes.
kmod = KModes(n_clusters=3, init='Cao', random_state=1, n_jobs=-1)
ymod = kmod.fit_predict(X_str)
cluster_sizes = collections.Counter(ymod)
print("Stress", cluster_sizes)

Stress Counter({0: 1570, 1: 606, 2: 324})

In [None]:
# 3-D scatter of the stress sample in PCA space, coloured by K-Modes label.
x, y, z = X_pca_str.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymod, opacity=0.70)
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()

In [None]:
# 3-D scatter of the stress sample in PCA space, coloured by K-Medoids label.
# When the notebook is run top to bottom, `ymed` still holds the labels from
# the K-Medoids fit on the stress features, the last place it was assigned.
x, y, z = X_pca_str.T
fig = px.scatter_3d(x=x, y=y, z=z, color=ymed, opacity=0.70, template='ggplot2')
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
fig.show()