In [1]:
# Imports
import pandas as pd
import numpy as np
from scipy.stats import zscore
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import cdist, pdist

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

%matplotlib inline

ModuleNotFoundError: No module named 'yellowbrick'

In [None]:
# Read the final cleaned dataframe (path is relative to the notebook's working directory).
# Later cells rely on it having at least 'Entity', 'Year' and 'Deaths %' columns.
df = pd.read_csv('../data/cleaned_data/kahuna.csv')

In [None]:
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Draw a boxplot stacked above a histogram of one column, sharing the x-axis.

    data: dataframe
    feature: name of the dataframe column to plot
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, which leaves the
        binning to seaborn)

    Returns None; the plot is drawn on a newly created figure.
    """
    # Two stacked axes: a short one for the boxplot, a tall one for the histogram.
    _, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # showmeans=True adds a marker at the column mean on the boxplot.
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")

    # Only forward `bins` when the caller supplied one. The old `palette`
    # argument is dropped: without a `hue` variable seaborn ignores it, and it
    # was only passed on one of the two branches anyway.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist, **hist_kwargs)

    # Reference lines on the histogram: dashed green = mean, solid black = median.
    ax_hist.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist.axvline(data[feature].median(), color="black", linestyle="-")

-------------------------

## KMeans Clustering

In [None]:
# Feature matrix: every column of df except 'Entity' and 'Year'.
# NOTE(review): later cells add 'cluster', 'clusterx10', 'color' and 'labels' to df,
# so re-running this cell out of order would pull those in as features.
X = df.drop(columns=['Entity', 'Year'])
# `sc` is kept as a name because a later cell calls sc.inverse_transform on the centroids.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [None]:
# Fit a 4-cluster KMeans model on the standardized features (seeded via random_state).
km = KMeans(n_clusters=4, random_state=42)
km.fit(X_sc)

In [None]:
# Centroids in standardized feature space; the trailing semicolon suppresses the cell output.
km.cluster_centers_;

In [None]:
# Attach each row's label from the 4-cluster fit to df, then preview the first rows.
df['cluster'] = km.labels_
df.head()

In [None]:
# Comparison of per-cluster feature means, transposed so each column is a cluster.
# numeric_only=True skips the string 'Entity' column; without it, mean() raises a
# TypeError on pandas >= 2.0 instead of silently dropping non-numeric columns.
df.groupby('cluster').mean(numeric_only=True).T

In [None]:
# Feature names, in the exact order the scaler and KMeans saw them. Taking them
# from X (rather than subtracting a hard-coded list from df.columns) keeps the
# names aligned with km.cluster_centers_ even after more columns are added to df.
columns = X.columns

In [None]:
# Repeat of the earlier centroid inspection; the semicolon suppresses the output.
km.cluster_centers_;

In [None]:
# (rows, columns) of df, which now includes the added 'cluster' column.
df.shape

In [None]:
# Undo the standardization so the centroids read in the original feature units,
# and label each column with its feature name.
centroids = pd.DataFrame(data=sc.inverse_transform(km.cluster_centers_), columns=columns)
centroids

In [None]:
# Sweep k from 2 to 30, recording inertia and silhouette score for each fit.
# random_state is fixed (as in the other KMeans fits in this notebook) so the
# curves are reproducible after a kernel restart; an unseeded KMeans gives
# different scores on every run.
scores = []
for k in range(2, 31):
    cl = KMeans(n_clusters=k, random_state=42)
    cl.fit(X_sc)
    inertia = cl.inertia_
    sil = silhouette_score(X_sc, cl.labels_)
    scores.append([k, inertia, sil])

score_df = pd.DataFrame(scores, columns=['k', 'inertia', 'silhouette'])

In [None]:
# First 11 rows of the sweep, i.e. the scores for k = 2 through 12.
score_df.head(11)

In [None]:
# Side-by-side diagnostics from the k sweep: inertia on the left, silhouette on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
axes[0].plot(score_df['k'], score_df['inertia'])
axes[0].set_title('Inertia over k')
axes[1].plot(score_df['k'], score_df['silhouette'])
axes[1].set_title('Silhouette Score over k');

In [None]:
# Number of rows in each cluster of the 4-cluster fit.
# The original note here says one group is the largest by far; which label that
# is was not recorded — TODO confirm from the output.
df['cluster'].value_counts()

In [None]:
# Second KMeans fit on the same standardized features, this time with 10 clusters.
km_10 = KMeans(n_clusters=10, random_state=42)
km_10.fit(X_sc)

In [None]:
# Store the 10-cluster labels alongside the existing 4-cluster 'cluster' column.
df['clusterx10'] = km_10.labels_

In [None]:
# Number of rows in each of the 10 clusters.
df['clusterx10'].value_counts()

In [None]:
# Horizontal bars of the per-cluster feature means for the 10-cluster fit.
# numeric_only=True skips the string 'Entity' column, which makes mean() raise
# on pandas >= 2.0.
df.groupby(by='clusterx10').mean(numeric_only=True).T.plot(kind='barh');

In [None]:
# Number of rows recorded for each entity.
df['Entity'].value_counts()

In [None]:
# Horizontal bars of the per-cluster feature means for the 4-cluster fit.
# The size is passed to .plot() directly: calling plt.figure(figsize=...) after
# plotting only opened a second, empty figure and left the chart at default size.
# numeric_only=True skips the string 'Entity' column (mean() raises on pandas >= 2.0).
df.groupby('cluster').mean(numeric_only=True).T.plot(kind='barh', figsize=(40, 20));


In [None]:
# Scratch check of list.reverse(): it reverses the list in place, so the list
# itself is displayed on the last line. `lis` is not used by any later cell shown here.
lis = [1,2,3,4,4]
lis.reverse()
lis

In [None]:
# Feature names as plain strings in reverse order; the scatter loop pairs each
# feature with the one at the mirrored position in this list.
rev_columns = list(columns.astype(str))[::-1]

In [None]:
# Rows for China only. This must run before 'Entity' is label-encoded further
# down, after which the string comparison no longer matches any row.
china = df[df['Entity'] == 'China']

In [None]:
# Bar chart of China's 'Deaths %' column by year.
# NOTE(review): the axis label and title say 'Death %' while the column is 'Deaths %'.
plt.bar(china['Year'], china['Deaths %'])
plt.xlabel('Year')
plt.ylabel('Death %')
plt.title('China: Death % over Time')

In [None]:
# Same data as the bar chart in the previous cell, drawn as a scatter plot
# (labels and title are identical).
plt.scatter(china['Year'], china['Deaths %'])
plt.xlabel('Year')
plt.ylabel('Death %')
plt.title('China: Death % over Time')

In [None]:
# Scatter every feature against its partner from the reversed column list,
# coloring points by KMeans cluster and overlaying the centroids as large stars.
#
# The palettes are sized from the number of centroids. With only three
# hard-coded colors, the 4-cluster fit raised IndexError for label 3 and the
# centroid color list did not match the four centroid rows. The first three
# entries of each palette are the original colors.
colors = ["red", "green", "blue", "orange", "purple", "brown"]
centroid_colors = ["maroon", "darkgreen", "navy", "darkorange", "indigo", "saddlebrown"]

n_found = len(centroids)
if n_found > len(colors):
    raise ValueError(
        "{} clusters but only {} plot colors defined; extend the palettes".format(
            n_found, len(colors)
        )
    )

# The per-row color depends only on the cluster label, so compute it once
# instead of on every loop iteration.
df['color'] = df['cluster'].map(lambda p: colors[p])

for x, y in zip(columns, rev_columns):
    ax = df.plot(
        kind="scatter",
        x=x, y=y,
        figsize=(10, 8),
        c=df['color'],
    )
    centroids.plot(
        kind="scatter",
        x=x, y=y,
        marker="*", c=centroid_colors[:n_found], s=550,
        ax=ax,
    );

### 

In [None]:
# Correlation heatmap over the numeric columns only. At this point df still
# holds string columns ('Entity', 'color'), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so they are filtered out explicitly.
plt.figure(figsize=(15, 7))
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral",
)
plt.show()

In [None]:
# Build the standardized table used by the second round of KMeans below.
# NOTE(review): `subset` copies the whole dataframe, so the earlier outputs
# ('cluster', 'clusterx10', 'color') and the label-encoded 'Entity' are all fed
# back in as clustering features — confirm this is intended.
le = LabelEncoder()
# 'Entity' is encoded in place on df because a later cell scales df directly.
df['Entity'] = le.fit_transform(df['Entity'])
scaler = StandardScaler()
subset = df.copy()
# 'color' was assigned as colors[cluster] in the scatter-plot cell, so the old
# {'red': 1, 'green': 2, 'blue': 3} mapping is exactly cluster + 1. Deriving the
# code from the label gives the same values, also covers clusters beyond the
# third, and stays correct if this cell is re-run after 'color' has already
# been made numeric (the .map() version turned it into NaN on a second run).
subset['color'] = subset['cluster'] + 1
subset_scaled = scaler.fit_transform(subset)
subset_scaled_df = pd.DataFrame(subset_scaled, columns=subset.columns)
k_means_df = subset_scaled_df.copy()

In [None]:
# Elbow-style sweep scored by silhouette, using yellowbrick's KElbowVisualizer
# on the scaled table built in the previous cell.
# NOTE(review): k=(2, 30) is presumably treated like a range — confirm in the
# yellowbrick docs whether k=30 itself is included.
model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(2, 30), metric="silhouette", timings=True)
visualizer.fit(k_means_df)  # fit the data to the visualizer
visualizer.show()  # finalize and render figure

In [None]:
# Silhouette score for k = 2..9 on the scaled table. The model is fitted and
# scored on the same object (k_means_df, a copy of subset_scaled_df) rather
# than two differently named copies.
sil_score = []
cluster_list = range(2, 10)
for n_clusters in cluster_list:
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    preds = clusterer.fit_predict(k_means_df)
    score = silhouette_score(k_means_df, preds)
    sil_score.append(score)
    # The original message ended with a stray ")" after the score.
    print("For n_clusters = {}, the silhouette score is {}".format(n_clusters, score))

In [None]:
# Silhouette plots for the candidate cluster counts, used to pick the number of
# clusters. This replaces four copy-pasted cells that differed only in k; the
# order (5, 4, 3, 2) is kept, so `visualizer` still ends up bound to the k=2 plot.
for n_clusters in (5, 4, 3, 2):
    visualizer = SilhouetteVisualizer(KMeans(n_clusters=n_clusters, random_state=1))
    visualizer.fit(k_means_df)
    visualizer.show()

**Observations**

- The silhouette coefficient for 3 clusters is the highest.
- In the SilhouetteVisualizer plot for 3 clusters, each cluster's score is close to the average score and the cluster shapes are very uniform, even though their magnitudes may differ.
- So, we will proceed with 3 clusters.

## Hierarchical Clustering

In [None]:
# Make 'color' numeric so df can be standardized below. 'color' was assigned as
# colors[cluster] in the scatter-plot cell, so the old red/green/blue -> 1/2/3
# mapping equals cluster + 1. Deriving it from the label is safe to re-run
# (a second .map() pass turned the column into NaN) and covers any cluster count.
df['color'] = df['cluster'] + 1

In [None]:
# Average-linkage agglomerative clustering on the standardized dataframe.
ss = StandardScaler()
# Drop a 'labels' column left over from a previous run of this cell so the
# cluster output is never fed back in as a feature; a no-op on the first run.
df_sc = ss.fit_transform(df.drop(columns=['labels'], errors='ignore'))

# The `affinity` keyword was removed from AgglomerativeClustering in newer
# scikit-learn releases (renamed to `metric`). Euclidean distance is the
# default under either name, so omitting it works on old and new versions.
# NOTE(review): distance_threshold=100 on standardized features looks large and
# may leave very few clusters — confirm against model.n_clusters_.
model = AgglomerativeClustering(distance_threshold=100, linkage='average', n_clusters=None)
model.fit(df_sc)
df['labels'] = model.labels_
df_clust = df.groupby(['labels'])

In [None]:
# Linkage matrix for the dendrogram: average linkage on Euclidean distances,
# matching the linkage setting used for AgglomerativeClustering above.
Z = linkage(df_sc, metric='euclidean', method='average')
# Per the SciPy docs, cophenet(Z, Y) returns the cophenetic correlation
# coefficient and the condensed cophenetic distance matrix; neither is
# displayed in this cell.
c, coph_dists = cophenet(Z , pdist(df_sc))

In [None]:
%%time
# Dendrogram of the average-linkage hierarchy in Z. truncate_mode='level'
# limits how many levels of the tree are drawn; links below a distance of 40
# are colored per cluster (color_threshold).
plt.figure(figsize=(25, 5))
# Title spelling fixed: "Dendogram" -> "Dendrogram".
plt.title('Agglomerative Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90., color_threshold=40, truncate_mode='level', leaf_font_size=8.)
plt.tight_layout()