# Clustering of countries

In [None]:
# imports
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import pickle
import cluster_vis
import cluster_methods
import pandas as pd

import dataprep
import decisiontree_help

%load_ext autoreload
%autoreload 2

In [None]:
# Load everything the notebook needs from disk: the main data frame, the two
# lookup dictionaries keyed by country number, and the column metadata frame.

def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# main data frame (a pickled pandas object)
source = "gwp_data/clean_per_year/full_clean"
df = pd.read_pickle(source)

# country number -> country name (used when printing cluster members)
countrynum_to_name_dict = _unpickle('meta/countrynum_to_name_dict.pickle')

# country number -> ISO code (used to build the frame handed to the visualization)
countrynum_to_ISO_dict = _unpickle('meta/countrynum_to_ISO_dict.pickle')

# column metadata, consumed by dataprep.remove_unwanted
source = "meta/columns"
df_meta = pd.read_pickle(source)

In [None]:
# Pass the frame through both dataprep filters (remove_unwanted first, using the
# column metadata, then remove_notallcountry) and preview the first rows.
# The filtering rules themselves live in the dataprep module.
df = dataprep.remove_notallcountry(dataprep.remove_unwanted(df, df_meta))
df.head()


### Cleaning
- remove meaningless columns (e.g. "Unique Case ID")
- remove columns with NaN values 
This is reasonable because, in most cases, more than just a few cells were missing.

In [None]:
# Drop the wave-year and ISO-code columns, then let pandas convert every
# remaining column to the best dtype that supports pd.NA.
df = df.drop(
    columns=["YEAR_WAVE: Wave Year", "COUNTRY_ISO3: Country ISO alpha-3 code"]
).convert_dtypes()

### Dummy clustering with mean aggregation

In [None]:
# Run the elbow diagnostic on the mean-aggregated data, then fix the number of
# clusters by hand.
cluster_methods.elbow_method(
    cluster_methods.mean_aggregation(df),
    2,
    20,
    'kmeans',
)
K = 6

clusters = cluster_methods.dummy_cluster(df, "mean", K)

# Per-country means; below only the index (the "WP5: Country" values) is used.
# NOTE(review): this assumes `clusters` is ordered like that index - confirm
# against cluster_methods.dummy_cluster.
df_mean = df.groupby("WP5: Country").mean()

# country number -> cluster label
countrynum_to_cluster_dict = dict(zip(df_mean.index, clusters))

# ISO code -> cluster label
ISO_to_cluster_dict = {
    countrynum_to_ISO_dict[num]: label
    for num, label in countrynum_to_cluster_dict.items()
}

# Frame for the visualization: one row per ISO code. Both identifier columns
# are filled from the index, i.e. with the ISO code.
df_result = pd.DataFrame.from_dict(ISO_to_cluster_dict, orient='index', columns=["cluster"])
for id_column in ("WP5: Country", "COUNTRY_ISO3: Country ISO alpha-3 code"):
    df_result[id_column] = df_result.index

# country name -> cluster label, for the printed overview
countryname_cluster_dict = {
    countrynum_to_name_dict[num]: label
    for num, label in zip(df_mean.index, clusters)
}

# Print the members of each cluster under its own header line.
for c in range(K):
    print(f"========= cluster: {c} ============")
    members = [name for name, label in countryname_cluster_dict.items() if label == c]
    for name in members:
        print(name)

cluster_vis.cluster_visualization(df_result, clusters, "cluster_origin_mean")


### Dummy clustering with mode aggregation

In [None]:
# Run the elbow diagnostic on the mode-aggregated data, then fix the number of
# clusters by hand.
cluster_methods.elbow_method(
    cluster_methods.mode_aggregation(df),
    2,
    20,
    'kmeans',
)
K = 5

clusters = cluster_methods.dummy_cluster(df, "mode", K)

# NOTE(review): despite its name, df_mode holds per-country *means*. Only its
# index (the "WP5: Country" values) is used below, so the clustering output is
# not affected, but the frame should not be read as modes.
# NOTE(review): this also assumes `clusters` is ordered like that index -
# confirm against cluster_methods.dummy_cluster.
df_mode = df.groupby("WP5: Country").mean()

# country number -> cluster label
countrynum_to_cluster_dict = dict(zip(df_mode.index, clusters))

# ISO code -> cluster label
ISO_to_cluster_dict = {
    countrynum_to_ISO_dict[num]: label
    for num, label in countrynum_to_cluster_dict.items()
}

# Frame for the visualization: one row per ISO code. Both identifier columns
# are filled from the index, i.e. with the ISO code.
df_result = pd.DataFrame.from_dict(ISO_to_cluster_dict, orient='index', columns=["cluster"])
for id_column in ("WP5: Country", "COUNTRY_ISO3: Country ISO alpha-3 code"):
    df_result[id_column] = df_result.index

# country name -> cluster label, for the printed overview
countryname_cluster_dict = {
    countrynum_to_name_dict[num]: label
    for num, label in zip(df_mode.index, clusters)
}

# Print the members of each cluster under its own header line.
for c in range(K):
    print(f"========= cluster: {c} ============")
    members = [name for name, label in countryname_cluster_dict.items() if label == c]
    for name in members:
        print(name)

cluster_vis.cluster_visualization(df_result, clusters, "cluster_origin_mode")