In [1]:
import pandas as pd
from IPython.display import display

## Process global refugee data
Source: UNHCR data from 1990 onward for all countries; only the years 2000–2020 are kept below.

In [2]:
# Read the UNHCR export; the first 14 lines of the file are skipped before the header row.
refugees = pd.read_csv('../refugee_data/1990_onward_unhcr.csv', skiprows=14)

# Keep the first six columns and the years 2000-2020 (inclusive). The explicit
# .copy() makes `refugees` an independent frame, so adding columns to it later
# does not write into a slice of the raw data (SettingWithCopyWarning).
refugees = (
    refugees.iloc[:, :6]
    .loc[lambda df: (df['Year'] >= 2000) & (df['Year'] <= 2020)]
    .copy()
)
display(refugees.head())

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate
14274,2000,Afghanistan,AFG,Afghanistan,AFG,0
14275,2000,Afghanistan,AFG,Egypt,EGY,60
14276,2000,Afghanistan,AFG,Australia,AUS,4358
14277,2000,Afghanistan,AFG,Austria,AUT,679
14278,2000,Afghanistan,AFG,Azerbaijan,AZE,172


## Bordering analysis

Determine whether the origin and asylum countries border each other

In [3]:
# Load the country border pairs and the ISO code table used to translate
# alpha-2 codes into alpha-3 codes.
# NOTE(review): pandas' default NA parsing treats the literal string "NA" as
# missing, and "NA" is also Namibia's alpha-2 code - confirm both files are
# parsed as intended.
borders = pd.read_csv('../refugee_data/GEODATASOURCE-COUNTRY-BORDERS.CSV')
iso_codes = pd.read_csv('../refugee_data/wikipedia-iso-country-codes.csv')

# alpha-2 -> alpha-3 mapping; codes absent from the table translate to None.
iso_lookup = dict(zip(iso_codes['Alpha-2 code'], iso_codes['Alpha-3 code']))
borders['iso3'] = borders['country_code'].apply(iso_lookup.get)
borders['iso3_border'] = borders['country_border_code'].apply(iso_lookup.get)

# For every alpha-3 code, collect the set of alpha-3 codes listed as its neighbours.
borders_lookup = {
    code: set(borders.loc[borders['iso3'] == code, 'iso3_border'].to_list())
    for code in borders['iso3'].unique()
}

print(f"Borders for USA are: {borders_lookup['USA']}")

Borders for USA are: {'MEX', 'CAN'}


In [4]:
# Origin ISO codes that have no entry in `borders_lookup`; filled in by `is_bordering`.
missing = set()


def is_bordering(row):
    """Report whether a row's asylum country borders its origin country.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Country of origin (ISO)' and
        'Country of asylum (ISO)'.

    Returns
    -------
    bool or None
        True if the asylum code is in the origin's neighbour set, False if it
        is not, and None when the origin code is not a key of `borders_lookup`.
        In the None case the origin code is also added to the module-level
        `missing` set.
    """
    origin = row['Country of origin (ISO)']
    try:
        neighbours = borders_lookup[origin]
    except KeyError:
        # Only an unknown origin code is expected here; a bare `except` would
        # also hide unrelated failures (typos, interrupts, ...).
        missing.add(origin)
        return None
    return row['Country of asylum (ISO)'] in neighbours

In [5]:
# Flag every refugee flow: True / False, or None when the origin code is unknown.
refugees['is_bordering'] = refugees.apply(is_bordering, axis=1)

# Translate the unmatched origin ISO codes back into their country names for reporting.
missed = list(
    refugees.loc[refugees['Country of origin (ISO)'].isin(missing), 'Country of origin'].unique()
)
print(f"Unable to match the following countries: {missed}")

Unable to match the following countries: ['Stateless', 'Tibetan', 'Unknown ', 'Curacao ']


In [6]:
# Only rows flagged exactly True are counted; False and None rows are excluded.
print(
    f"Of the total {len(refugees)} observations, only {int((refugees['is_bordering'] == True).sum())}"
    " were between bordering countries"
)

Of the total 89468 observations, only 5661 were between bordering countries


## Process historic GDP
Data is from [World Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.CD?most_recent_year_desc=false)

In [7]:
# World Bank GDP export; the first 4 lines of the file are skipped before the header row.
gdp = pd.read_csv('../refugee_data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_4019306.csv', skiprows=4)

# Reshape wide -> long: the first four columns identify the row, columns 4..64
# are unpivoted into one (Year, value) pair per row, and Year is cast to int.
gdp = (
    gdp.melt(
        id_vars=gdp.columns[:4],
        value_vars=gdp.columns[4:65],
        var_name='Year',
        value_name='GDP (current US$)',
    )
    .loc[:, ['Country Name', 'Country Code', 'Year', 'GDP (current US$)']]
    .assign(Year=lambda frame: frame['Year'].map(int))
)
display(gdp.head())

Unnamed: 0,Country Name,Country Code,Year,GDP (current US$)
0,Aruba,ABW,1960,
1,Africa Eastern and Southern,AFE,1960,20082720000.0
2,Afghanistan,AFG,1960,537777800.0
3,Africa Western and Central,AFW,1960,10404280000.0
4,Angola,AGO,1960,


## Process historic population
Data is from [World Bank](https://data.worldbank.org/indicator/SP.POP.TOTL?most_recent_year_desc=false)

In [8]:
# World Bank population export; the first 4 lines of the file are skipped before
# the header row, and rows without a country code are discarded.
pop = pd.read_csv('../refugee_data/API_SP.POP.TOTL_DS2_en_csv_v2_4019998.csv', skiprows=4)
pop = pop.dropna(subset=['Country Code'])

# Reshape wide -> long: the first four columns identify the row, columns 4..64
# are unpivoted into one (Year, value) pair per row, and Year is cast to int.
pop = (
    pop.melt(
        id_vars=pop.columns[:4],
        value_vars=pop.columns[4:65],
        var_name='Year',
        value_name='Population, total',
    )
    .loc[:, ['Country Name', 'Country Code', 'Year', 'Population, total']]
    .assign(Year=lambda frame: frame['Year'].map(int))
)
display(pop.head())

Unnamed: 0,Country Name,Country Code,Year,"Population, total"
0,Aruba,ABW,1960,54208.0
1,Africa Eastern and Southern,AFE,1960,130836765.0
2,Afghanistan,AFG,1960,8996967.0
3,Africa Western and Central,AFW,1960,96396419.0
4,Angola,AGO,1960,5454938.0


## V-Dem
Process the March 2022 release of the V-Dem data ([V-Dem core](https://www.v-dem.net/vdemds.html)).

In [9]:
# Load the V-Dem extract, keep the identifier columns plus v2xeg_eqdr and
# v2x_libdem, and rename the identifiers to match the World Bank tables.
vdem = (
    pd.read_csv('../refugee_data/vdem_2000_2020.csv')
    .loc[:, ['country_name', 'country_text_id', 'year', 'v2xeg_eqdr', 'v2x_libdem']]
    .rename(
        columns={
            'country_name': 'Country Name',
            'country_text_id': 'Country Code',
            'year': 'Year',
        }
    )
)
display(vdem.head())

Unnamed: 0,Country Name,Country Code,Year,v2xeg_eqdr,v2x_libdem
0,Mexico,MEX,2001,0.317,0.487
1,Mexico,MEX,2002,0.317,0.494
2,Mexico,MEX,2003,0.317,0.501
3,Mexico,MEX,2004,0.317,0.508
4,Mexico,MEX,2005,0.317,0.508


### Combining Data

In [10]:
# Remove the human-readable name so that the only columns the three tables have
# in common are the join keys ('Year', 'Country Code'). Re-binding with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run once
# the column has already been dropped.
pop = pop.drop(columns=['Country Name'], errors='ignore')
gdp = gdp.drop(columns=['Country Name'], errors='ignore')
vdem = vdem.drop(columns=['Country Name'], errors='ignore')

# Outer joins keep every (Year, Country Code) pair present in any of the tables.
pop_gdp = pd.merge(gdp, pop, how='outer', on=['Year', 'Country Code'])
vdem_pop_gdp = pd.merge(vdem, pop_gdp, how='outer', on=['Year', 'Country Code'])
display(vdem_pop_gdp.head())

Unnamed: 0,Country Code,Year,v2xeg_eqdr,v2x_libdem,GDP (current US$),"Population, total"
0,MEX,2001,0.317,0.487,756706300000.0,100298152.0
1,MEX,2002,0.317,0.494,772106400000.0,101684764.0
2,MEX,2003,0.317,0.501,729336300000.0,103081020.0
3,MEX,2004,0.317,0.508,782240600000.0,104514934.0
4,MEX,2005,0.317,0.508,877476200000.0,106005199.0


In [11]:
# Left join: every refugee row is kept and receives the V-Dem, GDP and
# population values of its asylum country for the same year (where available).
merged = refugees.merge(
    vdem_pop_gdp,
    how='left',
    left_on=['Year', 'Country of asylum (ISO)'],
    right_on=['Year', 'Country Code'],
)

In [14]:
# Write the combined table to disk, leaving out the DataFrame index.
merged.to_csv(path_or_buf='../refugee_data/merged_refugee_data.csv', index=False)

### Ukraine-only data

In [16]:
# Display the set of alpha-3 codes recorded as neighbours of 'UKR' in the border lookup.
borders_lookup['UKR']

{'BLR', 'HUN', 'MDA', 'POL', 'ROU', 'RUS', 'SVK'}

In [28]:
# Export the 2020 rows of the combined country table for the countries listed
# as neighbours of 'UKR'.
(
    vdem_pop_gdp
    .loc[lambda frame: frame['Country Code'].isin(borders_lookup['UKR'])
         & (frame['Year'] == 2020)]
    .to_csv('../refugee_data/ukr_pred_inputs.csv', index=False)
)