# 4 - Destination choice

#### Imports

In [None]:
import pandas as pd
import pickle
import numpy as np
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

import cluster_methods
import decisiontree_help
import cluster_vis

import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

#### Data preparation

Columns data

In [None]:
# Load the column metadata table from the project-local pickle; later cells
# read its "column" and "categorical?" fields.
source = "meta/columns"
df_meta = pd.read_pickle(filepath_or_buffer=source)

Prepare country_per_year dataframe

In [None]:
# Load the per-country, per-year table from its pickle file.
source = "country_data/country_per_year.pickle"
df_country_per_year = pd.read_pickle(filepath_or_buffer=source)

Read number to ISO code

In [None]:
# Load the lookup used below to translate country numbers into ISO codes.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open('meta/countrynum_to_ISO_dict.pickle', 'rb') as mapping_file:
    num_to_ISO = pickle.load(mapping_file)

Prepare Gallup data

In [None]:
# Load the prepared Gallup respondent data.
df_gwp = pd.read_pickle("gwp_data/prepared_destination/clean_data_from18to22_")

# Keep only respondents whose "move permanently" answer (WP1325) equals 1.
wants_to_move = df_gwp['WP1325: Move Permanently to Another Country'] == 1
df_gwp = df_gwp[wants_to_move]

# The filter column and the respondent/origin identifiers are not needed below.
df_gwp = df_gwp.drop(
    columns=[
        "WP1325: Move Permanently to Another Country",
        "COUNTRY_ISO3: Country ISO alpha-3 code",
        "WP5889: Questionnaire Serial Number",
        "WP5: Country",
    ]
)

# Destination codes that are removed from the data.
# NOTE(review): presumably non-country / non-answer codes -- confirm against
# the Gallup codebook.
excluded_destination_codes = [900, 901, 902, 903, 997, 998, 999, 200, 207, 133, 203, 199, 0]
has_excluded_code = df_gwp["WP3120: Country Would Move To"].isin(excluded_destination_codes)
not_needed_index = df_gwp[has_excluded_code].index
# Rows are dropped by index label, as before.
df_gwp = df_gwp.drop(index=not_needed_index)

# Translate destination numbers to ISO codes; numbers without a mapping are
# kept unchanged.
destination = df_gwp["WP3120: Country Would Move To"]
df_gwp["WP3120: Country Would Move To"] = destination.map(num_to_ISO).fillna(destination)

Adding Gallup data to the country_per_year dataframe

In [None]:
# Sort the Gallup columns into buckets according to the "categorical?" label
# that df_meta holds for them. A label may contain several keywords, in which
# case the column is added to every matching bucket.
yes_columns = []
yn_columns = []
ordinal_columns = []
no_columns = []
buckets = {
    "yes": yes_columns,
    "yn": yn_columns,
    "ordinal": ordinal_columns,
    "no": no_columns,
}

for col in df_gwp.columns:
    # Match the column name as a literal substring. With the default
    # regex=True, characters such as "(", "?" or "+" in a column name would be
    # interpreted as a pattern and mis-match or raise. na=False keeps the mask
    # boolean when the metadata contains missing names.
    is_match = df_meta["column"].str.contains(col, regex=False, na=False)
    labels = df_meta.loc[is_match, "categorical?"]
    if labels.empty:
        # No metadata for this column: it is left out of every bucket.
        continue
    # Only the first matching metadata row decides the bucket(s).
    first_label = labels.iloc[0]
    for keyword, bucket in buckets.items():
        if keyword in first_label:
            bucket.append(col)

# Bookkeeping columns that must never be aggregated as features.
excluded_columns = {
    "YEAR_WAVE: Wave Year",
    "WP3120: Country Would Move To",
    "WP5889: Questionnaire Serial Number",
}

# "yes"/"yn" columns are aggregated by mode later on.
cat_columns = set(yn_columns + yes_columns).intersection(df_gwp.columns)
cat_columns = cat_columns.difference(excluded_columns)

# "ordinal"/"no" columns are aggregated by mean later on.
count_columns = set(ordinal_columns + no_columns).intersection(df_gwp.columns)
count_columns = count_columns.difference(excluded_columns)

# Country/year indexed table; rows with identical values are dropped.
df = df_country_per_year.set_index(["COUNTRY_ISO3: Country ISO alpha-3 code", "YEAR_WAVE: Wave Year"]).drop_duplicates()

In [None]:
# Aggregate the respondent-level Gallup data to one row per
# (destination country, year) for the years 2016-2021.
dest_col = "WP3120: Country Would Move To"
year_col = "YEAR_WAVE: Wave Year"


def _mean_of_modes(values):
    """Return the mean of the most frequent value(s) of *values*, cast to int.

    Ties between several modes are averaged; a group without any non-missing
    value yields NaN.
    """
    return np.mean(values.mode().astype(int))


yearly_frames = []
for year in range(2016, 2022):
    year_rows = df_gwp[df_gwp[year_col] == year]
    if year_rows.empty:
        # Nothing to aggregate for this year.
        continue
    grouped = year_rows.groupby(dest_col)

    # Every aggregate is a Series keyed by destination, so building the frame
    # from them keeps each value on its own country's row. (Pairing
    # `.unique()` labels with groupby results does not: the former is in order
    # of appearance, the latter in sorted key order.)
    # Sorting the column names makes the column order reproducible, because
    # set iteration order can differ between interpreter sessions.
    aggregates = {col: grouped[col].mean() for col in sorted(count_columns)}
    aggregates.update(
        {col: grouped[col].apply(_mean_of_modes) for col in sorted(cat_columns)}
    )
    df_help = pd.DataFrame(aggregates, index=grouped.size().index)
    df_help.insert(0, year_col, year)
    yearly_frames.append(df_help.reset_index())

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
if yearly_frames:
    df_help_full = pd.concat(yearly_frames, ignore_index=True)
else:
    df_help_full = pd.DataFrame(columns=[dest_col, year_col])

# Map any destination still stored as a number to its ISO code; values
# without a mapping are kept. Everything is then cast to str.
df_help_full[dest_col] = df_help_full[dest_col].map(num_to_ISO).fillna(df_help_full[dest_col]).astype(str)

# Index by (destination country, year).
df_help_full = df_help_full.set_index([dest_col, year_col])

Calculate popularity score for each country

In [None]:
# Popularity of a destination in a given year: the share of that year's rows
# in df_gwp that name the country, scaled to "per 1,000,000".
dest_col = "WP3120: Country Would Move To"
year_col = "YEAR_WAVE: Wave Year"

# Maps (destination, year) -> popularity.
dict_help = dict()

for year in range(2007, 2022):
    year_rows = df_gwp[df_gwp[year_col] == year]
    n_respondents = len(year_rows)
    if n_respondents == 0:
        # No data for this year, so there is nothing to score.
        continue
    # Divide by the number of respondents. Summing the year column instead
    # would give (row count * year) and deflate every score by the year value.
    popularity = year_rows.groupby(dest_col)[dest_col].count() / n_respondents * 1000000
    for country, value in popularity.items():
        dict_help[(country, year)] = value

# (country, year) keys present both in the country table and in the scores.
index_set = set(df.index)
dict_set = set(dict_help.keys())
intersect = index_set.intersection(dict_set)

# Keep only the country/year rows that have a popularity score. A boolean
# mask is used because `.loc` no longer accepts a set (pandas >= 2.0). It also
# keeps the rows in their original, deterministic order. `.copy()` makes the
# column assignment below act on an independent frame.
df_unique = df.drop_duplicates()
has_score = np.array([key in intersect for key in df_unique.index], dtype=bool)
df_filtered = df_unique[has_score].copy()

# Scores restricted to the rows that were kept.
filtered_dict = {k: v for (k, v) in dict_help.items() if k in intersect}

# Attach the score to every remaining (country, year) row.
df_filtered["popularity"] = [filtered_dict[key] for key in df_filtered.index]

Join dataframes for regression

In [None]:
# Give both frames the same name for the country index level so they can be
# joined on (ISO, year).
df_filtered = df_filtered.rename_axis(index={'COUNTRY_ISO3: Country ISO alpha-3 code': 'ISO'})
df_help_full = df_help_full.rename_axis(index={'WP3120: Country Would Move To': 'ISO'})

# Left join: every scored country/year row is kept and the aggregated Gallup
# columns are added where available.
df_joined = df_filtered.join(df_help_full, on=['ISO', 'YEAR_WAVE: Wave Year'])

# Collapse any list-valued cell to its mean. np.mean is used because a Python
# list has no .mean() method.
df_joined = df_joined.applymap(lambda x: np.mean(x) if isinstance(x, list) else x)

# Build the regression table with 'popularity' as the LAST column. The
# regression cell splits features and target positionally (all columns but the
# last vs. the last column). After the join 'popularity' would otherwise sit
# in the middle and an arbitrary Gallup column would become the target.
feature_columns = [c for c in df_joined.columns if c != "popularity"]
df_lasso = df_joined[feature_columns + ["popularity"]].copy()

# Impute missing values with the column mean. numeric_only=True keeps this
# working on pandas versions that no longer skip non-numeric columns silently.
df_lasso = df_lasso.fillna(df_lasso.mean(numeric_only=True))

# Display the regression table
df_lasso

Regression to determine coefficients

In [None]:
# Work on the raw value matrix of df_lasso.
data = df_lasso.values

# Positional split: every column except the last is a feature, the last
# column is the regression target.
# NOTE(review): this relies on the target being the last column of df_lasso --
# confirm the column order.
y = data[:, -1]
# add_constant prepends a column of ones by default, so column 0 of X is the
# constant and feature i of df_lasso sits at column i + 1.
X = sm.add_constant(data[:, :-1])

# Recursive feature elimination with a linear model, keeping 35 columns of X.
LR = LinearRegression()
rfe = RFE(LR, n_features_to_select=35).fit(X, y)

# Reduce X to the selected columns.
X_rfe = rfe.transform(X)

# Ordinary least squares of y on the selected columns.
model = sm.OLS(y, X_rfe)
results = model.fit()

# Show the fit summary
results.summary()

Weight dataframe with the coefficients

In [None]:
# Names of the columns of X, in X's own order. X was built from every
# df_lasso column except the last, and sm.add_constant prepends the constant.
# Indexing df_lasso.columns directly with rfe.support_ is therefore shifted by
# one position and labels each coefficient with the wrong column.
feature_names = list(df_lasso.columns[:-1])
if len(rfe.support_) == len(feature_names) + 1:
    feature_names.insert(0, "const")

# Names of the columns kept by the feature selection
predictors = pd.Index(feature_names)[rfe.support_]
print(predictors)

# Fitted coefficients, one per selected column
params = results.params

# Pair each coefficient with its column name, smallest first
coef = pd.Series(np.asarray(params), predictors).sort_values()

coef.plot(kind='bar', title='Model Coefficients')

In [None]:
# Names of the columns of X, in X's own order. X was built from every
# df_lasso column except the last, and sm.add_constant prepends the constant.
# Indexing df_lasso.columns directly with rfe.support_ is therefore shifted by
# one position and labels each p-value with the wrong column.
feature_names = list(df_lasso.columns[:-1])
if len(rfe.support_) == len(feature_names) + 1:
    feature_names.insert(0, "const")

# Names of the columns kept by the feature selection
predictors = pd.Index(feature_names)[rfe.support_]
print(predictors)

# Get the p-values, one per selected column
p_values = results.pvalues

# Pair each p-value with its column name
coef = pd.Series(np.asarray(p_values), predictors)

coef.plot(kind='bar', title='Model P-values')
# Mark the 0.05 threshold with a horizontal line
plt.axhline(y=0.05, color='r', linestyle='-')

In [None]:
# Recover the names of the selected columns in the order of X. X holds every
# df_lasso column except the last, preceded by the constant that
# sm.add_constant prepends.
feature_names = list(df_lasso.columns[:-1])
if len(rfe.support_) == len(feature_names) + 1:
    feature_names.insert(0, "const")
selected_names = pd.Index(feature_names)[rfe.support_]

# One weight per selected column, keyed by column name. The constant is not a
# df_lasso column, so it is left out if it was selected.
weights = pd.Series(np.asarray(results.params), index=selected_names)
weights = weights.drop("const", errors="ignore")

# Keep only the selected columns and scale each by its own coefficient.
# Multiplying by a Series matches weights to columns by name.
df_lasso_small = df_lasso[weights.index] * weights

# Turn the (ISO, year) index back into ordinary columns
df_lasso_small = df_lasso_small.reset_index()

# Show the first 5 rows of the dataframe
df_lasso_small.head()

#### Clustering in year 2021

In [None]:
# Restrict the weighted table to the 2021 wave.
is_2021 = df_lasso_small["YEAR_WAVE: Wave Year"] == 2021
data_wyear = df_lasso_small[is_2021]
# Same rows without the year column.
data = data_wyear.drop(columns=["YEAR_WAVE: Wave Year"])

In [None]:
# Run the project's elbow-method helper on the 2021 rows, indexed by ISO code,
# with the arguments 2, 20 and 'agglo'.
# NOTE(review): data_wyear still contains the "YEAR_WAVE: Wave Year" column
# (2021 on every row); `data` has it removed -- confirm which one is intended.
features_2021 = data_wyear.set_index('ISO')
cluster_methods.elbow_method(features_2021, 2, 20, 'agglo')

In [None]:
# Number of clusters passed to the clustering helper
K = 5
# Cluster the 2021 rows (indexed by ISO code) with the project helper.
clusters = cluster_methods.simple_cluster(data_wyear.set_index('ISO'), K, "agglo")
# Distinct cluster labels
c_names = list(set(clusters))
# Reverse lookup: ISO code -> country number
ISO_to_num = {v: k for k, v in num_to_ISO.items()}

decisiontree_help.print_clusters(num_to_ISO, c_names, clusters, list(data_wyear["ISO"]))

# Table handed to the visualisation helper; both country columns carry the
# ISO code.
df_vis = pd.DataFrame()
df_vis["WP5: Country"] = data_wyear["ISO"]
df_vis["COUNTRY_ISO3: Country ISO alpha-3 code"] = data_wyear["ISO"]
df_vis["cluster"] = clusters

# Derive the label from K so it always states the number of clusters actually
# used (the literal said "K6" while K is 5).
cluster_vis.cluster_visualization(df_vis, clusters, f"decision_K{K}_agglo")