In [1]:
import pandas as pd
from IPython.display import display

## Process global refugee data
Source: UNHCR data from 1990 onward for all countries; the cell below keeps only the years 2000–2020.

In [2]:
# Load the UNHCR export. skiprows=14 skips the first 14 lines of the file
# (presumably export metadata ahead of the header row -- verify against the CSV).
refugees = pd.read_csv('../refugee_data/1990_onward_unhcr.csv', skiprows=14)
# Keep only the first six columns.
refugees = refugees.iloc[:, :6]
# Restrict to 2000-2020 inclusive. The explicit .copy() makes this an independent
# frame, so the later cell that adds an 'is_bordering' column does not trigger
# pandas' SettingWithCopyWarning.
refugees = refugees[refugees['Year'].between(2000, 2020)].copy()
display(refugees.head())

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate
14274,2000,Afghanistan,AFG,Afghanistan,AFG,0
14275,2000,Afghanistan,AFG,Egypt,EGY,60
14276,2000,Afghanistan,AFG,Australia,AUS,4358
14277,2000,Afghanistan,AFG,Austria,AUT,679
14278,2000,Afghanistan,AFG,Azerbaijan,AZE,172


## Bordering analysis

Determine whether the origin and asylum countries border each other

In [3]:
# Only treat empty fields as missing. With pandas' default NA handling the
# literal string "NA" -- which is also the ISO alpha-2 code for Namibia -- is
# parsed as NaN, which corrupts the alpha-2 -> alpha-3 mapping built below.
borders = pd.read_csv('../refugee_data/GEODATASOURCE-COUNTRY-BORDERS.CSV',
                      keep_default_na=False, na_values=[''])
iso_codes = pd.read_csv('../refugee_data/wikipedia-iso-country-codes.csv',
                        keep_default_na=False, na_values=[''])

# alpha-2 -> alpha-3; codes that are not in the ISO table map to None.
iso_lookup = dict(zip(iso_codes['Alpha-2 code'], iso_codes['Alpha-3 code']))
borders['iso3'] = borders['country_code'].apply(iso_lookup.get)
borders['iso3_border'] = borders['country_border_code'].apply(iso_lookup.get)

# alpha-3 code -> set of alpha-3 codes of the countries listed as bordering it.
borders_lookup = {
    code: set(borders.loc[borders['iso3'] == code, 'iso3_border'])
    for code in borders['iso3'].unique()
}

print(f"Borders for USA are: {borders_lookup['USA']}")

Borders for USA are: {'MEX', 'CAN'}


In [4]:
# Origin ISO codes that had no entry in `borders_lookup`; filled in by is_bordering().
missing = set()
def is_bordering(row):
    """Tell whether a refugee row's asylum country borders its origin country.

    Parameters
    ----------
    row : mapping / pandas Series
        Must provide 'Country of origin (ISO)' and 'Country of asylum (ISO)'.

    Returns
    -------
    bool or None
        True/False when the origin code is present in `borders_lookup`;
        None when it is not (the code is then recorded in `missing`).
    """
    origin = row['Country of origin (ISO)']
    # Explicit membership test instead of a bare `except:`, so unrelated errors
    # (e.g. a misspelled column name) are no longer silently swallowed.
    if origin not in borders_lookup:
        missing.add(origin)
        return None
    return row['Country of asylum (ISO)'] in borders_lookup[origin]

In [5]:
# Flag every origin/asylum pair using the border lookup.
refugees['is_bordering'] = refugees.apply(is_bordering, axis=1)

# List the origin names whose ISO code was recorded in `missing` during the apply.
unmatched = refugees['Country of origin (ISO)'].isin(missing)
missed = refugees.loc[unmatched, 'Country of origin'].unique().tolist()
print(f"Unable to match the following countries: {missed}")

Unable to match the following countries: ['Stateless', 'Tibetan', 'Unknown ', 'Curacao ']


In [6]:
# Count all observations and the subset flagged as bordering (None/False are excluded).
total_rows = len(refugees)
bordering_rows = len(refugees[refugees['is_bordering'] == True])
print(f"Of the total {total_rows} observations, only {bordering_rows} were between bordering countries")

Of the total 89468 observations, only 5661 were between bordering countries


## Process historic GDP
Data is from [World Bank](https://data.worldbank.org/indicator/NY.GDP.MKTP.CD?most_recent_year_desc=false)

In [96]:
# Load the World Bank GDP export (wide format: one column per year) and reshape
# it to one row per country and year.
gdp = pd.read_csv('../refugee_data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_4019306.csv', skiprows=4)
gdp = (
    pd.melt(gdp, id_vars=gdp.columns[:4], value_vars=gdp.columns[4:65])
      .rename(columns={'variable': 'Year', 'value': 'GDP (current US$)'})
      [['Country Name', 'Country Code', 'Year', 'GDP (current US$)']]
)
gdp['Year'] = gdp['Year'].astype('int64')
gdp = gdp.sort_values(by=['Country Name', 'Year'])
# Backfill missing GDP *within each country* only. A frame-wide backfill on the
# sorted table would copy the next country's first value into a country's
# trailing gaps. (`DataFrame.backfill` is also a deprecated alias of `bfill`.)
gdp['GDP (current US$)'] = gdp.groupby('Country Name')['GDP (current US$)'].bfill()
display(gdp.head())

Unnamed: 0,Country Name,Country Code,Year,GDP (current US$)
2,Afghanistan,AFG,1960,537777800.0
268,Afghanistan,AFG,1961,548888900.0
534,Afghanistan,AFG,1962,546666700.0
800,Afghanistan,AFG,1963,751111200.0
1066,Afghanistan,AFG,1964,800000000.0


## Process historic population
Data is from [World Bank](https://data.worldbank.org/indicator/SP.POP.TOTL?most_recent_year_desc=false)

In [98]:
# Load the World Bank population export and drop rows without a country code.
pop = (
    pd.read_csv('../refugee_data/API_SP.POP.TOTL_DS2_en_csv_v2_4019998.csv', skiprows=4)
      .dropna(subset=['Country Code'])
)
# Wide (one column per year) -> long (one row per country and year).
pop = (
    pop.melt(id_vars=pop.columns[:4], value_vars=pop.columns[4:65])
       .rename(columns={'variable': 'Year', 'value': 'Population, total'})
       [['Country Name', 'Country Code', 'Year', 'Population, total']]
)
# Year labels come from the column headers, so convert them to integers.
pop['Year'] = pop['Year'].map(int)
display(pop.head())

Unnamed: 0,Country Name,Country Code,Year,"Population, total"
0,Aruba,ABW,1960,54208.0
1,Africa Eastern and Southern,AFE,1960,130836765.0
2,Afghanistan,AFG,1960,8996967.0
3,Africa Western and Central,AFW,1960,96396419.0
4,Angola,AGO,1960,5454938.0


## V-Dem
Process V-Dem data from March 2022 [V-Dem core](https://www.v-dem.net/vdemds.html)

In [99]:
# Keep the identifying columns plus the two V-Dem indices used later, and rename
# the identifiers to match the World Bank tables so they can be merged on them.
vdem = (
    pd.read_csv('../refugee_data/vdem_1990_2020.csv')
      .loc[:, ['country_name', 'country_text_id', 'year', 'v2xeg_eqdr', 'v2x_libdem']]
      .rename(columns={
          'country_name': 'Country Name',
          'country_text_id': 'Country Code',
          'year': 'Year',
      })
)
display(vdem.head())

Unnamed: 0,Country Name,Country Code,Year,v2xeg_eqdr,v2x_libdem
0,Mexico,MEX,1991,0.283,0.204
1,Mexico,MEX,1992,0.283,0.216
2,Mexico,MEX,1993,0.283,0.224
3,Mexico,MEX,1994,0.283,0.266
4,Mexico,MEX,1995,0.294,0.281


### Combining Data

In [100]:
# Drop the name columns so only 'Country Code' + 'Year' identify a row.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError on the missing column.
pop = pop.drop(columns=['Country Name'], errors='ignore')
gdp = gdp.drop(columns=['Country Name'], errors='ignore')
vdem = vdem.drop(columns=['Country Name'], errors='ignore')

# Outer joins keep every (country, year) that appears in any of the sources.
pop_gdp = pd.merge(gdp, pop, how='outer', on=['Year', 'Country Code'])

vdem_pop_gdp = pd.merge(vdem, pop_gdp, how='outer', on=['Year', 'Country Code'])
display(vdem_pop_gdp.head())

Unnamed: 0,Country Code,Year,v2xeg_eqdr,v2x_libdem,GDP (current US$),"Population, total"
0,MEX,1991,0.283,0.204,313142800000.0,85512621.0
1,MEX,1992,0.283,0.216,363157600000.0,87075136.0
2,MEX,1993,0.283,0.224,500736100000.0,88625440.0
3,MEX,1994,0.283,0.266,527813200000.0,90156396.0
4,MEX,1995,0.294,0.281,360073900000.0,91663290.0


In [101]:
# Write the combined V-Dem / GDP / population table to CSV (row index omitted).
vdem_pop_gdp.to_csv('../refugee_data/vdem_pop_gdp.csv', index=False)

In [102]:
# Attach the asylum country's indicators for the same year to every refugee row;
# the left join keeps refugee rows that have no matching indicators.
merged = refugees.merge(
    vdem_pop_gdp,
    how='left',
    left_on=['Year', 'Country of asylum (ISO)'],
    right_on=['Year', 'Country Code'],
)

In [114]:
# Preview the refugee rows joined with the asylum country's indicators.
merged.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate,is_bordering,Country Code,v2xeg_eqdr,v2x_libdem,GDP (current US$),"Population, total"
0,2000,Afghanistan,AFG,Afghanistan,AFG,0,False,AFG,0.04,0.028,4055180000.0,20779957.0
1,2000,Afghanistan,AFG,Egypt,EGY,60,False,EGY,0.167,0.158,99838540000.0,68831561.0
2,2000,Afghanistan,AFG,Australia,AUS,4358,False,AUS,0.94,0.854,415576200000.0,19153000.0
3,2000,Afghanistan,AFG,Austria,AUT,679,False,AUT,0.936,0.791,197289600000.0,8011566.0
4,2000,Afghanistan,AFG,Azerbaijan,AZE,172,False,AZE,0.189,0.083,5272798000.0,8048600.0


In [115]:
# Write the merged refugee/indicator table to CSV (row index omitted).
merged.to_csv('../refugee_data/merged_refugee_data.csv', index=False)

### Ukraine only data

In [116]:
# Inspect the set of alpha-3 codes recorded as bordering Ukraine.
borders_lookup['UKR']

{'BLR', 'HUN', 'MDA', 'POL', 'ROU', 'RUS', 'SVK'}

In [127]:
# Export the 2020 indicators of the countries that border Ukraine.
ukr_neighbours_2020 = (
    vdem_pop_gdp['Country Code'].isin(borders_lookup['UKR'])
    & vdem_pop_gdp['Year'].eq(2020)
)
vdem_pop_gdp[ukr_neighbours_2020].to_csv('../refugee_data/ukr_pred_inputs.csv', index=False)

## Update Collected Data

In [180]:
# Country name -> alpha-3 code, seeded from the ISO table ...
iso_name_lookup = dict(zip(iso_codes['English short name lower case'], iso_codes['Alpha-3 code']))
# ... plus manual entries for additional spellings of country names.
iso_name_lookup.update({
    'Iran': 'IRN',
    'Democratic Republic of the Congo': 'COD',
    'Tanzania': 'TZA',
    'Republic of the Congo': 'COG',
    'Moldova': 'MDA',
    'Syria': 'SYR',
    'Venezuela': 'VEN',
})

In [181]:
# Load the conflict table; the cells below use its `country`, `conflict`,
# `refugees` and `conflict_start_year` columns.
unhcr_conf = pd.read_csv('../refugee_data/unhcr_conflict_data.csv')

def lag_year(x):
    """Return the year whose indicators should be joined for a conflict starting in `x`.

    This is the preceding year (`x - 1`), except that any start year after 2021
    maps to 2020 (presumably the last year covered by the indicator tables --
    confirm against the data).
    """
    return 2020 if x > 2021 else x - 1

# Year used to look up indicators for each conflict (see lag_year).
unhcr_conf['year_t-1'] = unhcr_conf['conflict_start_year'].map(lag_year)

In [182]:
# Translate names to alpha-3 codes. Plain indexing is deliberate: an unknown
# name raises KeyError instead of silently producing a missing code.
unhcr_conf['Country Code'] = [iso_name_lookup[name] for name in unhcr_conf['country']]
unhcr_conf['Conflict Country Code'] = [iso_name_lookup[name] for name in unhcr_conf['conflict']]

In [183]:
# Attach the destination country's indicators for the lagged year.
unhcr_conf = unhcr_conf.merge(
    vdem_pop_gdp,
    how='left',
    left_on=['Country Code', 'year_t-1'],
    right_on=['Country Code', 'Year'],
)

In [184]:
# NOTE: this rebinds `missing` and redefines `is_bordering` for the conflict table
# (different column names). After this cell runs, the earlier definition is shadowed.
missing = set()
def is_bordering(row):
    """Tell whether a destination country borders the conflict (origin) country.

    Parameters
    ----------
    row : mapping / pandas Series
        Must provide 'Conflict Country Code' and 'Country Code'.

    Returns
    -------
    bool or None
        True/False when the conflict country is present in `borders_lookup`;
        None when it is not (its code is then recorded in `missing`).
    """
    conflict_code = row['Conflict Country Code']
    # Explicit membership test instead of a bare `except:`. Record the code that
    # actually failed the lookup (the conflict country), not the destination code.
    if conflict_code not in borders_lookup:
        missing.add(conflict_code)
        return None
    return row['Country Code'] in borders_lookup[conflict_code]

# Flag each destination/conflict pair using the border lookup.
unhcr_conf['is_bordering'] = unhcr_conf.apply(is_bordering, axis=1)

In [185]:
def gen_gradient(row):
    """Add destination-minus-conflict-country differences to a conflict row.

    Looks up the conflict country's indicators for the row's 'year_t-1' in
    `vdem_pop_gdp` and stores, on the row itself:
      * 'GDP.gradient'        = row GDP - conflict-country GDP
      * 'v2x_libdem.gradient' = row v2x_libdem - conflict-country v2x_libdem

    Returns the (modified) row. Raises IndexError if `vdem_pop_gdp` has no entry
    for that country and year.
    """
    same_country = vdem_pop_gdp['Country Code'] == row['Conflict Country Code']
    same_year = vdem_pop_gdp['Year'] == row['year_t-1']
    origin = vdem_pop_gdp[same_country & same_year].iloc[0]
    for target, source in (('GDP.gradient', 'GDP (current US$)'),
                           ('v2x_libdem.gradient', 'v2x_libdem')):
        row[target] = row[source] - origin[source]
    return row

In [186]:
# Compute the GDP and liberal-democracy gradients row by row.
unhcr_conf = unhcr_conf.apply(gen_gradient, axis=1)

In [187]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each column to [0, 1] *within each conflict* and attach the
# result as '<column>_norm'.
cols_to_scale = ['GDP.gradient','GDP (current US$)']
scaler = MinMaxScaler()
for col in cols_to_scale:
    print(f"Normalizing column: {col}")

    # Collect one small frame per conflict and concatenate once, rather than
    # growing a DataFrame with pd.concat inside the loop.
    per_conflict = []
    for _, group in unhcr_conf.groupby('conflict'):
        scaled = scaler.fit_transform(group[col].to_numpy().reshape(-1, 1)).ravel()
        per_conflict.append(pd.DataFrame({
            'country': group['country'].to_numpy(),
            'conflict': group['conflict'].to_numpy(),
            f"{col}_norm": scaled,
        }))
    normed = pd.concat(per_conflict)

    # how='right' makes the result follow the row order of `normed` (grouped by conflict).
    # Assumes each (country, conflict) pair occurs once; duplicates would
    # multiply rows in this join.
    unhcr_conf = pd.merge(unhcr_conf, normed, on=['country','conflict'], how='right')

Normalizing column: GDP.gradient
Normalizing column: GDP (current US$)


In [188]:
# Share of each conflict's total refugees received by each destination country.
unhcr_conf['pct_tot'] = unhcr_conf['refugees'].div(
    unhcr_conf.groupby('conflict')['refugees'].transform('sum')
)

In [189]:
# Display the assembled conflict table (the rendered output elides the middle rows).
unhcr_conf

Unnamed: 0,country,conflict,refugees,conflict_start_year,year_t-1,Country Code,Conflict Country Code,Year,v2xeg_eqdr,v2x_libdem,GDP (current US$),"Population, total",is_bordering,GDP.gradient,v2x_libdem.gradient,GDP.gradient_norm,GDP (current US$)_norm,pct_tot
0,Iran,Afghanistan,780000,2001,2000,IRN,AFG,2000,0.792,0.167,1.095917e+11,65623397.0,True,1.055365e+11,0.139,1.000000,1.000000,0.348385
1,Pakistan,Afghanistan,1448100,2001,2000,PAK,AFG,2000,0.116,0.098,8.201774e+10,142343583.0,True,7.796256e+10,0.070,0.746402,0.746402,0.646791
2,Tajikistan,Afghanistan,10700,2001,2000,TJK,AFG,2000,0.473,0.081,8.605211e+08,6216329.0,True,-3.194658e+09,0.053,0.000000,0.000000,0.004779
3,Uzbekistan,Afghanistan,50,2001,2000,UZB,AFG,2000,0.683,0.035,1.376051e+10,24650400.0,True,9.705334e+09,0.007,0.118641,0.118641,0.000022
4,Turkmenistan,Afghanistan,50,2001,2000,TKM,AFG,2000,0.574,0.017,2.904663e+09,4516128.0,True,-1.150517e+09,-0.011,0.018800,0.018800,0.000022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Ecuador,Venezuela,508935,2014,2013,ECU,VEN,2013,0.669,0.279,9.512966e+10,15707473.0,False,-2.758757e+11,0.166,0.036847,0.036847,0.102881
60,Guyana,Venezuela,24500,2014,2013,GUY,VEN,2013,0.661,0.397,4.167800e+09,759281.0,True,-3.668376e+11,0.284,0.000000,0.000000,0.004953
61,Mexico,Venezuela,82978,2014,2013,MEX,VEN,2013,0.314,0.418,1.274443e+12,118827158.0,False,9.034377e+11,0.305,0.514562,0.514562,0.016774
62,Panama,Venezuela,121598,2014,2013,PAN,VEN,2013,0.697,0.563,4.559999e+10,3835447.0,False,-3.254054e+11,0.450,0.016783,0.016783,0.024581


In [190]:
# Write the conflict table with indicators and derived columns to CSV (row index omitted).
unhcr_conf.to_csv('../refugee_data/refugee_conflict_5.20.22.csv', index=False)

In [191]:
# Map ISO alpha-3 code -> numeric code, from the same ISO table used for the border lookup.
iso3_numeric = dict(zip(iso_codes['Alpha-3 code'], iso_codes['Numeric code']))

In [192]:
# NOTE(review): 9999 is a placeholder, not South Sudan's ISO 3166-1 numeric code (728).
# Presumably the ISO table has no usable numeric entry for SSD -- confirm, and confirm
# that nothing downstream needs the real code.
iso3_numeric['SSD'] = 9999

In [193]:
# Numeric country codes for the conflict (origin) and destination countries.
# Plain indexing raises KeyError for a code missing from `iso3_numeric`.
unhcr_conf['ccode_origin'] = (
    unhcr_conf['Conflict Country Code'].map(lambda code: iso3_numeric[code]).astype(int)
)
unhcr_conf['ccode_dest'] = (
    unhcr_conf['Country Code'].map(lambda code: iso3_numeric[code]).astype(int)
)

In [208]:
# Training set: bordering destinations of every conflict except Ukraine.
training_rows = (unhcr_conf['conflict'] != 'Ukraine') & (unhcr_conf['is_bordering'] == True)
training = unhcr_conf.loc[
    training_rows,
    ['conflict', 'country', 'pct_tot', 'v2x_libdem.gradient', 'GDP.gradient_norm'],
]
training.to_csv('../mr-qap/training.csv', index=False)

In [209]:
# Test set: bordering destinations of the Ukraine conflict only.
test_rows = (unhcr_conf['conflict'] == 'Ukraine') & (unhcr_conf['is_bordering'] == True)
test = unhcr_conf.loc[
    test_rows,
    ['conflict', 'country', 'pct_tot', 'v2x_libdem.gradient', 'GDP.gradient_norm'],
]
test.to_csv('../mr-qap/test.csv', index=False)

In [205]:
# Display the Ukraine test set.
# NOTE(review): the saved output of this cell lists ccode_origin/ccode_dest columns,
# while `test` as built in the preceding cell selects conflict/country. The output
# looks stale (execution counts are out of order) -- re-run the notebook top to bottom.
test

Unnamed: 0,ccode_origin,ccode_dest,pct_tot,v2x_libdem.gradient,GDP.gradient_norm
46,804,348,0.091808,0.044,0.097781
47,804,112,0.004001,-0.241,0.032851
48,804,498,0.068523,0.158,0.0
49,804,616,0.504044,0.15,0.397333
50,804,642,0.13818,0.249,0.160915
51,804,643,0.130068,-0.215,1.0
52,804,703,0.063375,0.435,0.063372


In [204]:
# Show which destination country each numeric code refers to for the Ukraine rows.
unhcr_conf[unhcr_conf['conflict']=='Ukraine'][['country','ccode_dest']]

Unnamed: 0,country,ccode_dest
46,Hungary,348
47,Belarus,112
48,Moldova,498
49,Poland,616
50,Romania,642
51,Russian Federation,643
52,Slovakia,703


In [212]:
# Load the MR-QAP output (presumably produced outside this notebook from
# training.csv / test.csv -- confirm).
results = pd.read_csv('../mr-qap/mr-qap-results.csv')

In [215]:
# Inspect the column names of the raw results file.
results.columns

Index(['Unnamed: 0', 'x'], dtype='object')

In [218]:
# Give the two columns meaningful names: the unnamed first column holds the
# destination country, 'x' holds the model's prediction.
results = results.rename(columns={'Unnamed: 0': 'country', 'x': 'predicted'})

In [221]:
# Rescale the predictions so that they sum to 1 across destinations.
results['predicted'] = results['predicted'] / results['predicted'].sum()

In [222]:
# Sanity check: the rescaled predictions should sum to 1.
results.predicted.sum()

1.0

In [223]:
# Show the predicted shares per destination country.
# NOTE(review): the saved output contains a negative share (Belarus), so the values are
# not valid proportions as-is -- confirm how negative predictions should be handled.
results

Unnamed: 0,country,predicted
0,Hungary,0.060802
1,Belarus,-0.045704
2,Moldova,0.045239
3,Poland,0.229855
4,Romania,0.145133
5,Russian Federation,0.415794
6,Slovakia,0.148879
