## 0) Import

In [91]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline

## 1) Data

In [92]:
# Load the raw dataset; the path is relative to the notebook's working directory.
csv_path = 'data/Country-data.csv'
df = pd.read_csv(csv_path)

In [93]:
# NOTE(review): disabled cell, kept for reference only. It would define a
# country-name remapping consumed by the (also disabled) cell that follows.
# Presumably intended to align names with another data source (geopandas is
# imported but unused here) — confirm the mapping direction before re-enabling.
# The 'United Kingdom' entry maps the name to itself, i.e. it is a no-op.
# country_mapping = {
#     'Dem. Rep. Congo': 'Congo, Dem. Rep.',
#     'Congo': 'Congo, Rep.',
#     'Côte d\'Ivoire': "Cote d'Ivoire",
#     'South Korea': 'Korea, South',
#     'United States': 'United States of America',
#     'United Kingdom': 'United Kingdom', 
# }

In [94]:
# NOTE(review): disabled cell. It rewrites df['country'] through `country_mapping`,
# keeping the original name wherever the mapping has no entry (the .fillna part).
# If re-enabled it has to run before `country` is moved into the index, because it
# reads `country` as a regular column.
# df['country'] = df['country'].map(country_mapping).fillna(df['country'])
# df

In [95]:
# Use the country name as the row index so that only the indicator columns
# remain as features.
# The guard makes the cell safe to re-run: once `country` is already the index,
# a second set_index("country") would raise KeyError. Rebinding `df` (rather
# than inplace=True) avoids mutating the frame an earlier cell produced.
if "country" in df.columns:
    df = df.set_index("country")

# Feature matrix fed to every clustering pipeline below: one row per country,
# one column per remaining DataFrame column, in DataFrame order.
X = df.to_numpy()

## 2) Clustering

In [96]:
# Summary table: one row per country (same order as `df`); each clustering
# section below appends its own label column to it.
df_final = pd.DataFrame({"country": df.index})
df_final

Unnamed: 0,country
0,Afghanistan
1,Albania
2,Algeria
3,Angola
4,Antigua and Barbuda
...,...
162,Vanuatu
163,Venezuela
164,Vietnam
165,Yemen


### 2.1) KMeans

In [97]:
# K-Means on standardised features: StandardScaler -> KMeans with 3 clusters.
# n_init is pinned explicitly. Leaving it unset relies on a deprecated default
# (scikit-learn emits a FutureWarning and later versions switch to 'auto'),
# which would silently change the clustering when the library is upgraded.
# 10 is the value the unset default resolved to, so labels are unchanged.
pipeline_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=3, n_init=10, random_state=42)),
])

In [98]:
# Fit scaler + K-Means on X, then assign every row of X to its nearest centroid.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
cluster_labels = pipeline_kmeans.fit(X).predict(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [99]:
# Record the K-Means assignment in the cross-model summary table.
df_final['Cluster_kmeans'] = cluster_labels

# Feature table plus the K-Means label, with `country` turned back into a
# regular column. `assign` works on a copy, so `df` itself is left untouched.
df_clustered_kmeans = df.assign(Cluster_kmeans=cluster_labels).reset_index()
df_clustered_kmeans.head()

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,Cluster_kmeans
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553,1
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090,2
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460,2
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530,1
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200,2


### 2.2) Agglomerative Clustering

In [100]:
# Agglomerative (hierarchical) clustering on standardised features, cut at 3 clusters.
agglomerative_steps = [
    ('scaler', StandardScaler()),
    ('agglomerative', AgglomerativeClustering(n_clusters=3)),
]
pipeline_agglomerative = Pipeline(steps=agglomerative_steps)

In [101]:
# Scale X, fit the agglomerative model and return one cluster label per row of X.
# The integer labels are arbitrary identifiers: they are not comparable with the
# label numbers produced by the other clustering methods in this notebook.
cluster_labels_agglomerative = pipeline_agglomerative.fit_predict(X)

In [102]:
# Record the agglomerative assignment in the cross-model summary table.
df_final['Cluster_AggClu'] = cluster_labels_agglomerative

# Feature table plus the agglomerative label, with `country` turned back into a
# regular column. `assign` works on a copy, so `df` itself is left untouched.
df_clustered_agglomerative = (
    df.assign(Cluster_AggClu=cluster_labels_agglomerative)
    .reset_index()
)

df_clustered_agglomerative.head()

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,Cluster_AggClu
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553,2
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090,1
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460,1
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530,1
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200,1


### 2.3) DBSCAN

In [103]:
# DBSCAN
pipeline_dbscan = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('dbscan', DBSCAN(eps=0.5, min_samples=10))
])

In [104]:
# Scale X, project it with PCA, run DBSCAN and return one label per row of X.
# DBSCAN uses the label -1 for points it treats as noise (not assigned to any
# cluster), so -1 must not be read as a regular cluster id.
cluster_labels_dbscan = pipeline_dbscan.fit_predict(X)

In [105]:
# Record the DBSCAN assignment (-1 = noise) in the cross-model summary table.
df_final['Cluster_dbscan'] = cluster_labels_dbscan

# Feature table plus the DBSCAN label, with `country` turned back into a
# regular column. `assign` works on a copy, so `df` itself is left untouched.
df_clustered_dbscan = (
    df.assign(Cluster_dbscan=cluster_labels_dbscan)
    .reset_index()
)

df_clustered_dbscan.head()

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,Cluster_dbscan
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553,-1
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090,0
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460,0
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530,-1
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200,0


### 2.4) Gaussian Mixture

In [106]:
# Gaussian mixture with 3 components on standardised features; the seed is
# fixed so the fitted components are reproducible.
gmm_steps = [
    ('scaler', StandardScaler()),
    ('gmm', GaussianMixture(n_components=3, random_state=42)),
]
pipeline_gmm = Pipeline(steps=gmm_steps)

In [107]:
# Scale X, fit the Gaussian mixture and return one component label per row of X
# (hard assignment; the per-component probabilities are not kept here).
cluster_labels_gmm = pipeline_gmm.fit_predict(X)



In [108]:
# Record the Gaussian-mixture assignment in the cross-model summary table.
df_final['Cluster_GM'] = cluster_labels_gmm

# Feature table plus the mixture label, with `country` turned back into a
# regular column. `assign` works on a copy, so `df` itself is left untouched.
df_clustered_gmm = df.assign(Cluster_GM=cluster_labels_gmm).reset_index()

df_clustered_gmm.head()

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp,Cluster_GM
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553,2
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090,0
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460,0
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530,2
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200,0


## 3) Save results

In [117]:
# Writing the CSV is opt-in so that a plain "Run All" never overwrites an
# existing results file by accident; set SAVE_RESULTS = True to (re)export.
# This replaces a commented-out to_csv call, which left the "Save results"
# section with no way of saving other than editing the code.
# NOTE(review): this cell's execution count is not sequential with the cells
# above — re-run the notebook top to bottom before trusting an exported file.
SAVE_RESULTS = False

if SAVE_RESULTS:
    # index=False: the row index is only a positional counter, `country` is a column.
    df_final.to_csv("./data/clusters_results.csv", index=False)

# One row per country, one label column per clustering method.
df_final

Unnamed: 0,country,Cluster_kmeans,Cluster_AggClu,Cluster_dbscan,Cluster_GM
0,Afghanistan,1,2,-1,2
1,Albania,2,1,0,0
2,Algeria,2,1,0,0
3,Angola,1,1,-1,2
4,Antigua and Barbuda,2,1,0,0
...,...,...,...,...,...
162,Vanuatu,2,1,-1,2
163,Venezuela,2,1,0,0
164,Vietnam,2,1,-1,2
165,Yemen,1,1,-1,2
