# Exploratory Data Analysis

In [None]:
# imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

### Read from pickle

In [None]:
# Locations of the two prepared survey extracts (pickled objects loaded with pandas).
aspiration_path = "gwp_data/prepared_aspiration/full_aspiration"
destination_path = "gwp_data/prepared_destination/full_destination"

# df_asp is used below for the migration-aspiration question,
# df_des for the "country would move to" question.
df_asp = pd.read_pickle(aspiration_path)
df_des = pd.read_pickle(destination_path)

#### Basic statistics

In [None]:
# Headline figures for the aspiration dataset: shape, covered survey years, number of countries.
wave_years = df_asp["YEAR_WAVE: Wave Year"]
n_countries = df_asp["COUNTRY_ISO3: Country ISO alpha-3 code"].nunique()

print("Size of the dataframe: ", df_asp.shape)
print("Year range: ", wave_years.min(), " - ", wave_years.max())
print("Number of countries: ", n_countries)

### Age

In [None]:
# Histogram of the raw age column (all recorded values, 20 bins).
age_col = df_asp['WP1220: Age']
sns.histplot(data = age_col, bins = 20)

# The summary statistics only use values below 100; everything else is blanked out and dropped.
# NOTE(review): presumably values of 100 and above are non-response codes -- confirm with the codebook.
age = [age_col.mask(age_col >= 100).dropna().values]

youngest, oldest = np.min(age), np.max(age)
print("Age range: ", youngest, " - ", oldest)
print("Age mean: ", np.mean(age))
print("Age median: ", np.median(age))
print("Age standard deviation: ", np.std(age))

#### Gender

In [None]:
# Bar-style histogram of the gender codes, followed by the relative frequency of each code.
gender_col = df_asp['WP1219: Gender']
sns.histplot(data = gender_col)

gender_shares = gender_col.value_counts(normalize = True)
print(gender_shares)

#### Country

In [None]:
# Summary statistics of the respondents' country column (displayed as the cell output).
country_col = df_asp['WP5: Country']
country_col.describe()

#### Number of respondents per country

In [None]:
# Set the font scale before drawing so that it applies to this figure
# (setting it after the plot only affects figures drawn later).
sns.set(font_scale = 1)

plt.figure(figsize=(20,8))

# Count respondents per ISO3 code directly.
# The previous version attached `unique()` ISO codes (order of first appearance) to
# `value_counts()` results (ordered by frequency), which pairs counts with the wrong
# country labels. It also relied on the counts column being named 'WP5: Country',
# which is no longer the case in pandas >= 2.0 (the column is called 'count').
mini_df = (
    df_asp.groupby('COUNTRY_ISO3: Country ISO alpha-3 code')['WP5: Country']
    .count()                            # non-missing 'WP5: Country' answers per ISO3 code
    .sort_values(ascending = False)
    .rename_axis('ISO')
    .to_frame('WP5: Country')           # keep the column name used by the original frame
)

g = sns.barplot(x = mini_df.index, y = mini_df['WP5: Country'].to_numpy())
plt.xticks(rotation=90)

plt.xlabel('Country ISO3 code')
plt.ylabel('Number of respondents')
plt.title('Number of respondents per country')

plt.show()

In [None]:
# Respondent counts per country, smallest first.
# `sort_values` must be called; without the parentheses the cell only shows the bound-method repr.
df_asp['WP5: Country'].value_counts().sort_values()

#### Countries and migration aspiration

In [None]:
# Number of rows with a recorded wave year for every (country, answer) combination
# of the "move permanently to another country" question.
country_and_answer = ['WP5: Country', 'WP1325: Move Permanently to Another Country']
df_asp.groupby(country_and_answer)['YEAR_WAVE: Wave Year'].count()

In [None]:
# Set the font scale before drawing so that it applies to this figure.
sns.set(font_scale = 0.6)

iso_col = 'COUNTRY_ISO3: Country ISO alpha-3 code'
move_col = 'WP1325: Move Permanently to Another Country'

# Aggregate per ISO3 code directly.
# The previous version grouped by 'WP5: Country' (sorted by group key) and then attached
# `unique()` ISO codes (order of first appearance), which mislabels the rows whenever the
# two orders differ.
answers_by_iso = df_asp.groupby(iso_col)[move_col]
help_df_decision = pd.DataFrame({
    # number of non-missing answers to the question
    'all_answers': answers_by_iso.count(),
    # number of answers equal to 1; countries without any such answer get 0 instead of NaN
    'yes_answers': df_asp[move_col].eq(1).groupby(df_asp[iso_col]).sum(),
})
# Stored as a fraction (0-1); it is multiplied by 100 where it is printed.
help_df_decision['percent_want_to_go'] = help_df_decision['yes_answers']/help_df_decision['all_answers']
help_df_decision = (
    help_df_decision
    .rename_axis('ISO')
    .sort_values(by='percent_want_to_go', ascending = False)
)

plt.figure(figsize=(18,8))
g = sns.barplot(x = help_df_decision.index, y=help_df_decision['percent_want_to_go'])
plt.xticks(rotation=90)
plt.xlabel('Country ISO3 code')
plt.ylabel('Percent of respondents who want to move permanently to another country')
plt.title('Percent of respondents who want to move permanently to another country per country')
plt.show()

In [None]:
# Sort locally instead of relying on the current row order of help_df_decision:
# the frame is re-sorted by 'volume' in a later cell, so re-running this cell afterwards
# would otherwise print the wrong "top" and "bottom" countries.
# A stable sort keeps the existing order of tied values.
share_ranked = help_df_decision['percent_want_to_go'].sort_values(ascending = False, kind = 'mergesort')

print("Percent of respondents who want to move permanently to another country range: ", share_ranked.min()*100, " - ", share_ranked.max()*100)
print("Percent of respondents who want to move permanently to another country mean: ", share_ranked.mean()*100)
print("Top 10 countries with the highest percent of respondents who want to move permanently to another country: ")
print(share_ranked.head(10)*100)
print("Top 10 countries with the lowest percent of respondents who want to move permanently to another country: ")
# Missing shares (NaN) are sorted to the end and therefore show up here.
print(share_ranked.tail(10)*100)

In [None]:
# Set the font scale before drawing so that it applies to this figure.
sns.set(font_scale = 0.6)

country_data_path = "country_data/country_per_year.pickle"
df_country = pd.read_pickle(country_data_path)

# Mean of the 'POP' column per ISO3 code over all rows of the country table.
# (The original computed this twice and discarded the first result.)
mean_pop_by_iso = df_country.groupby(['COUNTRY_ISO3: Country ISO alpha-3 code'])['POP'].mean()

# Assignment aligns on the ISO3 index; codes missing from df_country end up as NaN.
help_df_decision['POP'] = mean_pop_by_iso
# 'percent_want_to_go' is a fraction, so this is POP scaled by the share of "yes" answers.
help_df_decision['volume'] = help_df_decision['POP']*help_df_decision['percent_want_to_go']

# Later cells read the top rows of this frame, so keep it ordered by volume.
help_df_decision = help_df_decision.sort_values(by='volume', ascending = False)


plt.figure(figsize=(18,8))
# log scale
g = sns.barplot(x = help_df_decision.index, y=help_df_decision['volume'], log = True)
plt.xticks(rotation=90)
plt.xlabel('Country ISO3 code')
plt.ylabel('Logarithm of the volume of people who want to move permanently to another country')
plt.title('Log-volume of people who want to move permanently to another country per country')
plt.show()


In [None]:
# First ten rows of the 'volume' column; this is the top 10 only while
# help_df_decision is ordered by descending volume.
top_volume = help_df_decision['volume'].iloc[:10]
print("Top 10 countries with the highest volume of people who want to move permanently to another country: ")
print(top_volume)

#### Country to move to

In [None]:
# import num to ISO3 from pickle file
source = "meta/countrynum_to_ISO_dict.pickle"
num_to_ISO = pd.read_pickle(source)

In [None]:
# Number of respondents for every (origin country, desired destination) pair.
help_df_where = (
    df_des.groupby(["WP5: Country", "WP3120: Country Would Move To"])["YEAR_WAVE: Wave Year"]
    .count()
    .to_frame('number_data')
)

# Share of each destination among all respondents of the same origin country.
# Changes compared with the previous loop-based version:
#   * the denominator is the origin's total number of respondents (sum), not the
#     number of distinct destinations (count), so the values are real shares in [0, 1];
#   * no use of `Series.iteritems`, which was removed in pandas 2.0;
#   * no variables named `list` / `dict`, which shadowed the builtins for the rest
#     of the notebook.
origin_totals = help_df_where.groupby(level = "WP5: Country")['number_data'].transform('sum')
help_df_where['percent'] = help_df_where['number_data'] / origin_totals

help_df_where1 = help_df_where.drop(columns = 'number_data')

help_df_where1

In [None]:
# Set the font scale before drawing so that it applies to this figure.
sns.set(font_scale = 0.6)

# Pivot the (origin, destination) -> value table into an origin x destination matrix;
# pairs that do not occur are filled with 0.
# The previous version used the raw country codes as array positions and silently
# dropped every pair whose code was >= the number of distinct countries. It also stored
# the matrix in a variable called `map`, shadowing the builtin.
flow_matrix = help_df_where1['percent'].unstack(level = 1, fill_value = 0)
start_num, end_num = flow_matrix.shape  # number of origin / destination countries

plt.figure(figsize=(15,15))
# Tick labels are the index values of help_df_where1 (the raw country codes).
ax = sns.heatmap(flow_matrix, cmap="crest", vmax = 1)#, vmax=50)#, annot=True)

# Axis labels are set after drawing because heatmap assigns its own axis labels.
ax.set_xlabel('Country ISO3 code of destination')
ax.set_ylabel('Country ISO3 code of origin')
ax.set_title('Percent of respondents who want to move permanently to another country per country')
plt.show()

Country Would Move To:
(Asked only of those who would like to move to another country.)

In [None]:
# read pickle file country number to name
source = "meta/countrynum_to_name_dict.pickle"
countrynum_to_name_dict = pd.read_pickle(source)

In [None]:
# Fraction of all counted answers that name each destination country.
destination_share = (
    help_df_where.groupby('WP3120: Country Would Move To')['number_data'].sum()
    / help_df_where["number_data"].sum()
)

help_df2 = (
    destination_share.to_frame()
    # Translate the destination codes with countrynum_to_name_dict; the column is called 'ISO'
    # although the lookup's name suggests it holds country names.
    .assign(ISO = lambda frame: frame.index.map(countrynum_to_name_dict))
    # Destinations without an entry in the lookup are removed here.
    .dropna()
    .set_index('ISO')
    .sort_values(by='number_data', ascending=False)
    # Convert the fractions to percentages.
    .mul(100)
)

In [None]:
# top 10 countries where people want to move
print("Top 10 countries where people want to move:")
print(help_df2.head(10))

# bottom 10 countries where people want to move if not nan
print("Bottom 10 countries where people want to move:")
print(help_df2.tail(10))
