# Decision tree - migration aspiration

#### Imports

In [None]:
# imports
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn import tree
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

import cluster_methods
import decisiontree_help
import cluster_vis

import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

#### Reading the files 
Choose between data with or without global region variables.

In [None]:
# Load the pickled inputs.
# df_w / df_wo are presumably the survey extract with / without the global
# region variables (going by the file names) -- TODO confirm.
df_w = pd.read_pickle("gwp_data/prepared_aspiration/clean_data_from18to22_")
df_wo = pd.read_pickle("gwp_data/prepared_aspiration/clean_data_from18to22_woregions")

# Country-per-year table.
df_country_per_year = pd.read_pickle("country_data/country_per_year.pickle")

Join dataframes

In [None]:
# Index both tables by the (country ISO3 code, wave year) pair.
join_keys = ["COUNTRY_ISO3: Country ISO alpha-3 code", "YEAR_WAVE: Wave Year"]
df_wo = df_wo.set_index(join_keys)
df_country_per_year = df_country_per_year.set_index(join_keys)

In [None]:
# Join df_wo with df_country_per_year on the country ISO3 code and wave year
# (`on` names the caller's index levels; the other frame is matched on its index).
df_joined = df_wo.join(df_country_per_year, on=['COUNTRY_ISO3: Country ISO alpha-3 code', 'YEAR_WAVE: Wave Year'])
# Replace list-valued cells with their mean. A plain Python list has no
# .mean() method, so np.mean is used; every other cell is passed through as is.
df_joined = df_joined.applymap(lambda x: np.mean(x) if isinstance(x, list) else x)
# df_wo now refers to the joined table, with the join keys back as columns.
df_wo = df_joined.reset_index()
df_joined.head()

In [None]:
# Report the (rows, columns) shape of the joined dataframe.
joined_shape = df_joined.shape
print("Size of the dataframe:", joined_shape)

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables keyed by country number (per their names): one maps to the
# country name, the other to the ISO code.
countrynum_to_name_dict = _load_pickle('meta/countrynum_to_name_dict.pickle')
countrynum_to_ISO_dict = _load_pickle('meta/countrynum_to_ISO_dict.pickle')

In [None]:
# Column metadata table; later cells read its 'column' and 'categorical?' fields.
df_meta = pd.read_pickle("meta/columns")

In [None]:
# Keep untouched copies of both frames so the original answers stay
# available after the working frames are modified.
df_original_w, df_original_wo = df_w.copy(), df_wo.copy()

#### Data preparation for the decision tree

In [None]:
# Columns that are removed before modelling, so only predictive columns remain.
non_pred_columns = [
    'Region: Region',
    'YEAR_WAVE: Wave Year',
    'COUNTRY_ISO3: Country ISO alpha-3 code',
    'WP1325: Move Permanently to Another Country',
    'WP3120: Country Would Move To',
    'WP5889: Questionnaire Serial Number',
]

# errors='ignore': labels that a frame does not contain are skipped, which
# matches filtering by membership.
df_w = df_w.drop(columns=non_pred_columns, errors='ignore')
df_wo = df_wo.drop(columns=non_pred_columns, errors='ignore')

In [None]:
# Turn 'WP9048: Country Where Born' into a boolean that is True where it
# equals 'WP5: Country', then remove 'WP5: Country' from the frame.
# (Presumably "respondent was born in the surveyed country" -- TODO confirm.)
born_col = 'WP9048: Country Where Born'
country_col = 'WP5: Country'
for frame in (df_w, df_wo):
    frame[born_col] = frame[born_col].eq(frame[country_col])
    frame.drop(columns=country_col, inplace=True)

#### Categorical data

In [None]:
# Sort the columns of df_w into buckets according to the "categorical?"
# value of the first metadata row whose 'column' field contains the label.
yes_columns = []
yn_columns = []
ordinal_columns = []
no_columns = []

# (tag, bucket) pairs. Each tag is tested as a substring, independently of
# the others, so a column can end up in more than one bucket.
tag_buckets = (
    ("yes", yes_columns),
    ("yn", yn_columns),
    ("ordinal", ordinal_columns),
    ("no", no_columns),
)

for col in df_w.columns:
    # regex=False: the label is matched literally. Labels may contain regex
    # metacharacters such as '?', '(' or '+', which would otherwise change
    # the match or raise. na=False keeps missing metadata names out of the mask.
    is_match = df_meta['column'].str.contains(col, regex=False, na=False)
    categories = df_meta.loc[is_match, "categorical?"]
    if categories.empty:
        # No metadata for this column: leave it unclassified.
        continue
    category = categories.iloc[0]
    for tag, bucket in tag_buckets:
        if tag in category:
            bucket.append(col)

In [None]:
# One-hot encode the categorical ("yn" and "yes") columns of both frames.
target_column = 'WP1325: Move Permanently to Another Country'
# {target_column} is a one-element set. set(target_column) would be a set of
# single characters and would exclude nothing.
categorical = set(yn_columns + yes_columns) - {target_column}

# Build the column list in the frame's own column order: iterating a set of
# strings gives an order that can differ between interpreter sessions, and
# pandas expects a list (not a set) when selecting columns.
cat_columns = [c for c in df_wo.columns if c in categorical]
df_wo = pd.get_dummies(df_wo, columns=cat_columns, prefix=cat_columns)

cat_columns = [c for c in df_w.columns if c in categorical]
df_w = pd.get_dummies(df_w, columns=cat_columns, prefix=cat_columns)

# NOTE(review): convert_dtypes() returns a new frame and the result is not
# assigned, so df_w keeps its dtypes; this line only renders the cell output.
df_w.convert_dtypes()

In [None]:
# Feature matrices: every column except the target, keeping only columns
# that contain no missing values. Built as one chained expression so no
# in-place mutation is applied to a selection of another frame.
target_column = 'WP1325: Move Permanently to Another Country'

X_wo = df_wo.drop(columns=[target_column], errors='ignore').dropna(axis=1)
X_w = df_w.drop(columns=[target_column], errors='ignore').dropna(axis=1)

In [None]:
# Shift the target column down by one in the saved originals and use it,
# cast to int, as the label vector.
# NOTE(review): the subtraction is written back into df_original_*, so running
# this cell twice shifts the labels twice.
target_column = 'WP1325: Move Permanently to Another Country'

df_original_w[target_column] = df_original_w[target_column].sub(1)
Y_w = df_original_w[target_column].astype(int)

df_original_wo[target_column] = df_original_wo[target_column].sub(1)
Y_wo = df_original_wo[target_column].astype(int)

# Without region

#### Training the decision tree

In [None]:
# Fit a depth-3 decision tree on the data without region variables.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits can otherwise be chosen differently between
# runs, which would change the tree the clusterings below are derived from.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
clf = clf.fit(X_wo, Y_wo)

# Large, high-dpi canvas so the small node labels stay legible.
plt.figure(figsize=(12,12), dpi=400)
tree.plot_tree(clf, fontsize=3, filled=True)

In [None]:
# Training accuracy: share of rows whose predicted label equals the true label.
is_correct = clf.predict(X_wo) == Y_wo
is_correct.mean()

In [None]:
# Pass the columns of X_wo, the frame the tree was fitted on. The tree's
# feature indices refer to X_wo's columns, and X_wo can have fewer columns
# than df_wo (columns with missing values were dropped), so df_wo.columns
# could attach the wrong names to the split features.
decisiontree_help.tree_to_code(clf, X_wo.columns)

In [None]:
# Low-level tree structure of the fitted classifier.
tree_ = clf.tree_
# Display the per-node feature array of the fitted tree.
tree_.feature

### Clustering 1 - MAX Leaves
Countries are in the same cluster if they are most represented in the same leaves.

In [None]:
# Country identifier columns taken from the saved original frame.
# Explicit column selection raises KeyError if a column is missing, whereas
# DataFrame.get would silently return None and fail later in the join.
name_iso = df_original_wo[["WP5: Country", "COUNTRY_ISO3: Country ISO alpha-3 code"]]

In [None]:
# Add the country identifier columns to the feature matrix, aligned on the
# row index (DataFrame.join is a left join by default).
df_with_country = X_wo.join(name_iso)
df_with_country

In [None]:
# Cluster the countries with the depth-3 tree (logic lives in decisiontree_help).
df_cluster1 = decisiontree_help.clustering1_depth3(clf, X_wo, df_with_country, countrynum_to_name_dict)

# The index values are used as keys into the country-number lookup tables
# to add the country name and ISO3 code columns.
country_codes = list(df_cluster1.index)
df_cluster1["WP5: Country"] = [countrynum_to_name_dict[code] for code in country_codes]
df_cluster1["COUNTRY_ISO3: Country ISO alpha-3 code"] = [countrynum_to_ISO_dict[code] for code in country_codes]

# Visualize the "cluster_1" assignment.
cluster_labels = df_cluster1["cluster_1"]
cluster_vis.cluster_visualization(df_cluster1, cluster_labels, "D1")

In [None]:
# Display the clustering-1 table.
df_cluster1


### Clustering 2 - MAX 2 leaves
Countries are in the same cluster if their two most represented leaves are the same.

In [None]:
# Derive the second clustering from the clustering-1 table (logic lives in
# decisiontree_help; the 3 is presumably the tree depth -- TODO confirm).
df_cluster2 = decisiontree_help.clustering2(df_cluster1, 3, countrynum_to_name_dict)

# The index values are used as keys into the country-number lookup tables
# to add the country name and ISO3 code columns.
country_codes = list(df_cluster2.index)
df_cluster2["WP5: Country"] = [countrynum_to_name_dict[code] for code in country_codes]
df_cluster2["COUNTRY_ISO3: Country ISO alpha-3 code"] = [countrynum_to_ISO_dict[code] for code in country_codes]

# Visualize the "cluster_2" assignment.
cluster_labels = df_cluster2["cluster_2"]
cluster_vis.cluster_visualization(df_cluster2, cluster_labels, "D2")

In [None]:
# Display the clustering-2 table.
df_cluster2

### Clustering 3 - DBSCAN

In [None]:
# DBSCAN run. The first argument is presumably the tree depth and the last
# one the DBSCAN parameter (e.g. eps) -- TODO confirm against decisiontree_help.
dbscan_depth = 8
dbscan_param = 0.040
decisiontree_help.run_clusters_distribution(
    dbscan_depth, X_wo, Y_wo, df_original_wo, 'dbscan', dbscan_param
)

### Clustering 4 - K-Means

#### depth = 8

In [None]:
# Build the helper frame for depth 8 and run the elbow method for k-means.
# The 2 and 10 are presumably the range of cluster counts tried -- TODO confirm.
kmeans_depth = 8
df_help_8_kmeans = decisiontree_help.create_df(kmeans_depth, X_wo, Y_wo, df_original_wo)

k_min, k_max = 2, 10
cluster_methods.elbow_method(df_help_8_kmeans, k_min, k_max, "kmeans")

In [None]:
# K-means run for depth 8. The last argument is presumably the number of
# clusters -- TODO confirm against decisiontree_help.
kmeans_depth = 8
kmeans_param = 8
decisiontree_help.run_clusters_distribution(
    kmeans_depth, X_wo, Y_wo, df_original_wo, 'kmeans', kmeans_param
)

#### depth = max

In [None]:
# df_help_max_kmeans = decisiontree_help.create_df('max', X_wo, Y_wo, df_original_wo)
# cluster_methods.elbow_method(df_help_max_kmeans, 2, 10, "kmeans")

# decisiontree_help.run_clusters_distribution('max', X_wo, Y_wo, df_original_wo, 'kmeans', 8) 

### Clustering 5 - Agglomerative clustering

#### depth = 8

In [None]:
# Build the helper frame for depth 8 and run the elbow method for
# agglomerative clustering. The 2 and 20 are presumably the range of cluster
# counts tried -- TODO confirm.
agglo_depth = 8
df_help_8_agglo = decisiontree_help.create_df(agglo_depth, X_wo, Y_wo, df_original_wo)

k_min, k_max = 2, 20
cluster_methods.elbow_method(df_help_8_agglo, k_min, k_max, "agglo")

In [None]:
# Agglomerative clustering run for depth 8. The last argument is presumably
# the number of clusters -- TODO confirm against decisiontree_help.
agglo_depth = 8
agglo_param = 6
decisiontree_help.run_clusters_distribution(
    agglo_depth, X_wo, Y_wo, df_original_wo, 'agglo', agglo_param
)

#### depth = max

In [None]:
# df_help_max_agglo = decisiontree_help.create_df('max', X_wo, Y_wo, df_original_wo)
# cluster_methods.elbow_method(df_help_max_agglo, 2, 20, "agglo")

# decisiontree_help.run_clusters_distribution('max', X_wo, Y_wo, df_original_wo, 'agglo', 10) 