# Hierarchical clustering - migration aspiration

#### Imports

In [None]:
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn import tree
import os
import math

import hierarchical_help

import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
import pickle

#### Reading the files

In [None]:
# Load the prepared migration-aspiration survey data from its pickle file.
# NOTE(review): the file name suggests survey waves 2018-2022 — confirm.
source = "gwp_data/prepared_aspiration/clean_data_from18to22"
df = pd.read_pickle(source)

In [None]:
# Load the column metadata table. Its 'column' and 'categorical?' fields are
# read further down to decide how each survey column is treated.
source = "meta/columns"
df_meta = pd.read_pickle(source)


In [None]:
# Keep an untouched copy of the answers so they can still be recovered once
# `df` has been filtered and re-encoded.
df_original = df.copy()
# Shift the migration-aspiration answers down by one.
# NOTE(review): presumably recodes a 1/2 coding to 0/1 — confirm against the codebook.
df_original['WP1325: Move Permanently to Another Country'] = (
    df_original['WP1325: Move Permanently to Another Country'].sub(1)
)
df_original

In [None]:
# Country-number lookup tables, stored as pickles under meta/.
# NOTE: pickle executes code on load; only open files produced by this project.
with open('meta/countrynum_to_name_dict', 'rb') as name_file:
    countrynum_to_name_dict = pickle.load(name_file)

with open('meta/countrynum_to_ISO_dict.pickle', 'rb') as iso_file:
    countrynum_to_ISO_dict = pickle.load(iso_file)

#### Data preparation

In [None]:
# Columns that are not used as predictors; they are removed from `df`
# (the untouched values stay available in `df_original`).
non_pred_columns = [
    'Region: Region',
    'YEAR_WAVE: Wave Year',
    'COUNTRY_ISO3: Country ISO alpha-3 code',
    'WP1325: Move Permanently to Another Country',
    'WP3120: Country Would Move To',
    'WP5889: Questionnaire Serial Number',
]

# Boolean mask over the column index: True for every column that is kept.
is_predictive = ~df.columns.isin(non_pred_columns)
df = df.loc[:, is_predictive]

# Replace the birth-country value by a flag telling whether it equals the
# respondent's 'WP5: Country' value.
df['WP9048: Country Where Born'] = df['WP9048: Country Where Born'].eq(df["WP5: Country"])

In [None]:
# Sort every column of `df` into buckets according to the "categorical?"
# entry of the first metadata row whose 'column' text contains the column name.
# A column may land in several buckets (the checks are independent substring
# tests); columns without a metadata match are skipped.
yes_columns = []
yn_columns = []
ordinal_columns = []
no_columns = []

for col in df.columns:
    # regex=False: match the name literally. With the default regex matching,
    # a name containing metacharacters such as "(" or "+" would be parsed as a
    # pattern and could silently miss its own metadata row.
    name_matches = df_meta['column'].str.contains(col, regex=False)
    matches = df_meta.loc[name_matches, "categorical?"]
    if matches.empty:
        continue

    kind = matches.iloc[0]
    if "yes" in kind:
        yes_columns.append(col)
    if "yn" in kind:
        yn_columns.append(col)
    if "ordinal" in kind:
        ordinal_columns.append(col)
    if "no" in kind:
        no_columns.append(col)

In [None]:
# Distinct values of the 'WP5: Country' column, in order of first appearance;
# the clustering helpers are called with this array.
countries = df["WP5: Country"].unique()

In [None]:
# One-hot encode the categorical ("yn" and "yes") columns present in `df`.
cat_columns = set(yn_columns + yes_columns).intersection(df.columns)
# Exclude the target column. The name must be wrapped in a one-element set:
# set('some string') would build a set of its individual characters instead.
cat_columns = cat_columns.difference({'WP1325: Move Permanently to Another Country'})

# get_dummies needs an ordered list-like: a set has no defined order and is
# rejected as a column indexer by pandas >= 2.0. Follow the column order of `df`.
cat_column_list = [col for col in df.columns if col in cat_columns]
df = pd.get_dummies(df, columns=cat_column_list, prefix=cat_column_list)

# (Re)attach the raw country values from the untouched copy, so that
# df["WP5: Country"] is available whether or not it was encoded above.
df["WP5: Country"] = df_original["WP5: Country"]

In [None]:
# Depth values for which the per-country trees and the initial distances are
# prepared and cached in the next cell.
depth = [4, 6, 8, 10, 12]

In [None]:
# Build, or reload from the on-disk cache, everything the clustering needs:
# the per-country dataframes, the initial clusters and, for every depth in
# `depth`, the per-country trees and the initial distances.
# NOTE: pickle executes code on load; only open caches produced by this project.
_PREPARED_DIR = "gwp_data/prepared_hierarchical"


def _load_or_build(path, build):
    """Return the object pickled at *path*.

    If the file does not exist, call ``build()``, pickle its result to *path*
    and return that result instead.
    """
    if os.path.exists(path):
        with open(path, "rb") as fh:
            return pickle.load(fh)
    result = build()
    with open(path, "wb") as fh:
        pickle.dump(result, fh)
    return result


# initialize country to df dictionary
country_df_dict = _load_or_build(
    f"{_PREPARED_DIR}/country_df_dict.p",
    lambda: hierarchical_help.create_country_df_dict(countries, df),
)

# initialize clusters with standalone countries
init_clusters = _load_or_build(
    f"{_PREPARED_DIR}/init_clusters.p",
    lambda: hierarchical_help.init_clusters(countries),
)

# A separate loop variable keeps the `depth` list intact, so the cell can be
# re-run without `depth` having been rebound to an int.
for tree_depth in depth:
    # Always bind the trees of *this* depth (loaded or freshly built); otherwise
    # the distance initialisation below could run with the trees of a previous
    # depth, or with an undefined name, when the tree cache already exists.
    country_tree_dict = _load_or_build(
        f"{_PREPARED_DIR}/country_tree_dict_depth{tree_depth}.p",
        lambda: hierarchical_help.create_country_tree_dict(
            countries, df_original, country_df_dict, tree_depth
        ),
    )

    dist_path = f"{_PREPARED_DIR}/dist_dict_depth{tree_depth}.p"
    names_path = f"{_PREPARED_DIR}/name_to_list_depth{tree_depth}.p"
    # Both files come from one call, so rebuild the pair when either is missing.
    if not (os.path.exists(dist_path) and os.path.exists(names_path)):
        dist_dict, name_to_list = hierarchical_help.init_distances(
            init_clusters, country_df_dict, country_tree_dict, df_original
        )
        with open(dist_path, "wb") as fh:
            pickle.dump(dist_dict, fh)
        with open(names_path, "wb") as fh:
            pickle.dump(name_to_list, fh)

#### Clustering

In [None]:
# Experiment settings.
# NOTE: the clustering cell below iterates over its own DEPTH and ROUND values
# and rebinds both names, so the values set here are overwritten once it runs.
DEPTH = 8 # used depth in the experiment
ROUND = 2 # values: False or number

In [None]:
# Run the hierarchical clustering for every (DEPTH, ROUND) combination,
# caching each result under results/hierarchical/ so finished steps are
# reloaded instead of recomputed.
# NOTE: pickle executes code on load; only open caches produced by this project.


def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


def _load_or_build(path, build):
    """Return the object pickled at *path*.

    If the file does not exist, call ``build()``, pickle its result to *path*
    and return that result instead.
    """
    if os.path.exists(path):
        return _read_pickle(path)
    result = build()
    with open(path, "wb") as fh:
        pickle.dump(result, fh)
    return result


for DEPTH in [8, 10]:
    for ROUND in [3]:
        dist_dict_og = _read_pickle(f'gwp_data/prepared_hierarchical/dist_dict_depth{DEPTH}.p')
        name_to_list_og = _read_pickle(f'gwp_data/prepared_hierarchical/name_to_list_depth{DEPTH}.p')
        country_tree_dict = _read_pickle(f"gwp_data/prepared_hierarchical/country_tree_dict_depth{DEPTH}.p")

        # Work on (shallow) copies so the freshly loaded originals stay bound
        # under their *_og names.
        dist_dict = dist_dict_og.copy()
        name_to_list = name_to_list_og.copy()
        clusters = init_clusters.copy()

        # ROUND is either False (no round-based pre-clustering) or a number.
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(ROUND, int) and not isinstance(ROUND, bool):
            num = len(clusters) // ROUND

            # Bind the round result to `round_clusters` whether it is computed
            # or reloaded: the full clustering below starts from it.
            # NOTE(review): when the round result comes from the cache,
            # `name_to_list` and `dist_dict` are still the initial copies; if
            # clustering_round updates them in place they do not reflect the
            # cached state — confirm in hierarchical_help.
            round_clusters = _load_or_build(
                f"results/hierarchical/depth{DEPTH}_round{ROUND}.p",
                lambda: hierarchical_help.clustering_round(
                    num, ROUND, clusters, name_to_list, dist_dict,
                    country_df_dict, country_tree_dict, df_original, DEPTH
                ),
            )

            # NOTE(review): the leading 10 is presumably the target number of
            # clusters — confirm in hierarchical_help.clustering.
            clusters = _load_or_build(
                f"results/hierarchical/depth{DEPTH}_round{ROUND}_full.p",
                lambda: hierarchical_help.clustering(
                    10, round_clusters, name_to_list, dist_dict,
                    country_df_dict, country_tree_dict, df_original, DEPTH
                ),
            )
        else:
            clusters = _load_or_build(
                f"results/hierarchical/depth{DEPTH}_full.p",
                lambda: hierarchical_help.clustering(
                    10, clusters, name_to_list, dist_dict,
                    country_df_dict, country_tree_dict, df_original, DEPTH
                ),
            )