## 0) Import

In [99]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MiniBatchKMeans
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline

## 1) Data

### 1.1) Get data

In [100]:
# Read the raw country-level indicators from disk and preview the table.
df = pd.read_csv("data/Country-data.csv")
df

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.10,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.40,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200
...,...,...,...,...,...,...,...,...,...,...
162,Vanuatu,29.2,46.6,5.25,52.7,2950,2.62,63.0,3.50,2970
163,Venezuela,17.1,28.5,4.91,17.6,16500,45.90,75.4,2.47,13500
164,Vietnam,23.3,72.0,6.84,80.2,4490,12.10,73.1,1.95,1310
165,Yemen,56.3,30.0,5.18,34.4,4480,23.60,67.5,4.67,1310


In [101]:
# NOTE(review): disabled cell, nothing here is executed.
# The dictionary below looks like a country-name harmonisation table
# (presumably for matching names against another dataset) — confirm whether
# it is still needed, otherwise remove the cell.
# country_mapping = {
#     'Dem. Rep. Congo': 'Congo, Dem. Rep.',
#     'Congo': 'Congo, Rep.',
#     'Côte d\'Ivoire': "Cote d'Ivoire",
#     'South Korea': 'Korea, South',
#     'United States': 'United States of America',
#     'United Kingdom': 'United Kingdom', 
# }

In [16]:
# NOTE(review): disabled cell, nothing here is executed.
# If re-enabled it would translate df['country'] through `country_mapping`
# (which is itself commented out in this notebook) and keep the original name
# wherever the mapping has no entry.
# df['country'] = df['country'].map(country_mapping).fillna(df['country'])
# df

### 1.2) Select Columns
We decided to drop the columns "life_expec", "total_fer" and "gdpp" because of the strong correlation between them and "child_mort" and "income".

In [17]:
# Columns available in the raw file: country, child_mort, exports, health,
# imports, income, inflation, life_expec, total_fer, gdpp.
# Keep the country identifier plus the six indicators used for clustering.
df = df.loc[
    :,
    ["country", "exports", "health", "imports", "income", "inflation", "child_mort"],
]

In [18]:
# Index the frame by country so that only the numeric indicators end up in X.
# Reassigning (instead of inplace=True) behind an index-name guard keeps this
# cell re-runnable: executing it a second time no longer raises KeyError
# because "country" has already been moved into the index. A frame that truly
# lacks a "country" column still fails loudly.
if df.index.name != "country":
    df = df.set_index("country")

# Feature matrix handed to every clustering pipeline: one row per country,
# one column per remaining indicator.
X = df.to_numpy()
X

array([[1.00e+01, 7.58e+00, 4.49e+01, 1.61e+03, 9.44e+00, 9.02e+01],
       [2.80e+01, 6.55e+00, 4.86e+01, 9.93e+03, 4.49e+00, 1.66e+01],
       [3.84e+01, 4.17e+00, 3.14e+01, 1.29e+04, 1.61e+01, 2.73e+01],
       ...,
       [7.20e+01, 6.84e+00, 8.02e+01, 4.49e+03, 1.21e+01, 2.33e+01],
       [3.00e+01, 5.18e+00, 3.44e+01, 4.48e+03, 2.36e+01, 5.63e+01],
       [3.70e+01, 5.89e+00, 3.09e+01, 3.28e+03, 1.40e+01, 8.31e+01]])

## 2) Clustering

In [19]:
# Summary table that collects one label column per clustering method,
# starting with just the country names taken from df's index.
df_final = pd.DataFrame({"country": df.index})

### 2.1) KMeans (MiniBatchKMeans)

In [20]:
# K-Means pipeline: min-max scale the indicators, reduce them with a PCA that
# keeps 95% of the variance, then split the countries into 3 clusters.
pipeline_kmeans = Pipeline([
    ('scaler', MinMaxScaler()),
    ('feature_selection', PCA(n_components=0.95)),
    # n_init is set explicitly to the value the estimator was already using
    # (the fit warning reports default_n_init=3). This silences that warning
    # and keeps the clustering stable when the library default changes.
    ('kmeans', MiniBatchKMeans(n_clusters=3, n_init=3, random_state=42))
])

In [107]:
# Train the pipeline on X, then ask the fitted pipeline for the cluster of
# each row of the same matrix.
cluster_labels = pipeline_kmeans.fit(X).predict(X)

  super()._check_params_vs_input(X, default_n_init=3)




In [108]:
# Record the K-Means labels in the shared summary table.
df_final['Cluster_kmeans'] = cluster_labels

# Per-method view: the indicators plus the label, with country restored as a
# regular column.
df_clustered_kmeans = (
    df.assign(Cluster_kmeans=cluster_labels)
      .reset_index()
)
df_clustered_kmeans.head()

Unnamed: 0,country,exports,health,imports,income,inflation,child_mort,Cluster_kmeans
0,Afghanistan,10.0,7.58,44.9,1610,9.44,90.2,1
1,Albania,28.0,6.55,48.6,9930,4.49,16.6,0
2,Algeria,38.4,4.17,31.4,12900,16.1,27.3,0
3,Angola,62.3,2.85,42.9,5900,22.4,119.0,1
4,Antigua and Barbuda,45.5,6.03,58.9,19100,1.44,10.3,0


### 2.2) Agglomerative Clustering

In [109]:
# Agglomerative clustering into 4 groups, applied after min-max scaling and a
# PCA that keeps 95% of the variance.
pipeline_agglomerative = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("feature_selection", PCA(n_components=0.95)),
        ("agglomerative", AgglomerativeClustering(n_clusters=4)),
    ]
)

In [110]:
# Fit the pipeline and obtain one cluster label per row of X in a single call.
cluster_labels_agglomerative = pipeline_agglomerative.fit_predict(X)

In [111]:
# Record the agglomerative labels in the shared summary table.
df_final['Cluster_AggClu'] = cluster_labels_agglomerative

# Per-method view: the indicators plus the label, with country restored as a
# regular column.
df_clustered_agglomerative = (
    df.assign(Cluster_AggClu=cluster_labels_agglomerative)
      .reset_index()
)
df_clustered_agglomerative.head()

Unnamed: 0,country,exports,health,imports,income,inflation,child_mort,Cluster_AggClu
0,Afghanistan,10.0,7.58,44.9,1610,9.44,90.2,1
1,Albania,28.0,6.55,48.6,9930,4.49,16.6,0
2,Algeria,38.4,4.17,31.4,12900,16.1,27.3,0
3,Angola,62.3,2.85,42.9,5900,22.4,119.0,1
4,Antigua and Barbuda,45.5,6.03,58.9,19100,1.44,10.3,0


### 2.3) DBSCAN

In [112]:
# DBSCAN pipeline: density-based clustering (eps=0.1, min_samples=4) applied
# after min-max scaling and a PCA that keeps 95% of the variance.
pipeline_dbscan = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("pca", PCA(n_components=0.95)),
        ("dbscan", DBSCAN(eps=0.1, min_samples=4)),
    ]
)

In [113]:
# Fit the pipeline and obtain one cluster label per row of X in a single call.
cluster_labels_dbscan = pipeline_dbscan.fit_predict(X)

In [114]:
# Record the DBSCAN labels in the shared summary table.
df_final['Cluster_dbscan'] = cluster_labels_dbscan

# Per-method view: the indicators plus the label, with country restored as a
# regular column.
df_clustered_dbscan = (
    df.assign(Cluster_dbscan=cluster_labels_dbscan)
      .reset_index()
)
df_clustered_dbscan.head()

Unnamed: 0,country,exports,health,imports,income,inflation,child_mort,Cluster_dbscan
0,Afghanistan,10.0,7.58,44.9,1610,9.44,90.2,-1
1,Albania,28.0,6.55,48.6,9930,4.49,16.6,0
2,Algeria,38.4,4.17,31.4,12900,16.1,27.3,-1
3,Angola,62.3,2.85,42.9,5900,22.4,119.0,-1
4,Antigua and Barbuda,45.5,6.03,58.9,19100,1.44,10.3,0


### 2.4) Gaussian Mixture

In [115]:
# Gaussian mixture with 3 components on the min-max scaled indicators.
# Unlike the other pipelines in this notebook, there is no PCA step here.
pipeline_gmm = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("gmm", GaussianMixture(n_components=3, random_state=42)),
    ]
)

In [116]:
# Fit the pipeline and obtain one component label per row of X in a single call.
cluster_labels_gmm = pipeline_gmm.fit_predict(X)



In [117]:
# Record the Gaussian-mixture labels in the shared summary table.
df_final['Cluster_GM'] = cluster_labels_gmm

# Per-method view: the indicators plus the label, with country restored as a
# regular column.
df_clustered_gmm = (
    df.assign(Cluster_GM=cluster_labels_gmm)
      .reset_index()
)
df_clustered_gmm.head()

Unnamed: 0,country,exports,health,imports,income,inflation,child_mort,Cluster_GM
0,Afghanistan,10.0,7.58,44.9,1610,9.44,90.2,2
1,Albania,28.0,6.55,48.6,9930,4.49,16.6,0
2,Algeria,38.4,4.17,31.4,12900,16.1,27.3,0
3,Angola,62.3,2.85,42.9,5900,22.4,119.0,2
4,Antigua and Barbuda,45.5,6.03,58.9,19100,1.44,10.3,0


## 3) Save results

In [118]:
# Attach the cluster labels of every method to the indicator table.
# "country" is the index name of `df` and a regular column of `df_final`;
# `on='country'` matches the two.
# validate='one_to_one' makes a duplicated country name raise a MergeError
# instead of silently multiplying rows in the saved results.
merged_df = pd.merge(df, df_final, on='country', validate='one_to_one')

In [119]:
# Persist the combined table (row index omitted from the file) and show it.
merged_df.to_csv(path_or_buf="./data/clusters_results.csv", index=False)
merged_df

Unnamed: 0,country,exports,health,imports,income,inflation,child_mort,Cluster_kmeans,Cluster_AggClu,Cluster_dbscan,Cluster_GM
0,Afghanistan,10.0,7.58,44.9,1610,9.44,90.2,1,1,-1,2
1,Albania,28.0,6.55,48.6,9930,4.49,16.6,0,0,0,0
2,Algeria,38.4,4.17,31.4,12900,16.10,27.3,0,0,-1,0
3,Angola,62.3,2.85,42.9,5900,22.40,119.0,1,1,-1,2
4,Antigua and Barbuda,45.5,6.03,58.9,19100,1.44,10.3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
162,Vanuatu,46.6,5.25,52.7,2950,2.62,29.2,0,0,0,0
163,Venezuela,28.5,4.91,17.6,16500,45.90,17.1,0,0,-1,2
164,Vietnam,72.0,6.84,80.2,4490,12.10,23.3,0,0,-1,0
165,Yemen,30.0,5.18,34.4,4480,23.60,56.3,1,1,-1,2
