In [182]:
import pandas as pd
import re

import numpy as np
import multiprocess as mp
 
cores = mp.cpu_count()-2 #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want

from glob import glob

In [148]:
df1 = pd.read_csv('/Users/brandonrose/Downloads/S1_File/Data_1.csv')
df2 = pd.read_csv('/Users/brandonrose/Downloads/S1_File/Data_2.csv')
df3 = pd.read_csv('/Users/brandonrose/Downloads/S1_File/Data_3.csv')
df4 = pd.read_csv('/Users/brandonrose/Downloads/S1_File/Data_4.csv')
df5 = pd.read_csv('/Users/brandonrose/Downloads/S1_File/Data_5.csv')

df = pd.DataFrame()
df = df.append([df1,df2,df3,df4,df5])

In [149]:
base_features = df.columns[:7]

In [150]:
exp = re.compile(r'(19\d{2}|20\d{2})')

In [151]:
variable_lookup = {}
for i in df.columns[7:]:
    try:
        year = re.findall(exp, i)[0]
        variable_lookup[i] = (int(year), 
                              i.replace(f"{year}", '')\
                              .replace('..','.')\
                              .rstrip('.'))
    except:
        variable_lookup[i] = ('All', i)
    
features = {}
for kk, vv in variable_lookup.items():
    if vv[1] not in features:
        features[vv[1]]=[kk]
    else:
        features[vv[1]].append(kk)

In [152]:
kk = 'ref.flow'
vv = features[kk]

In [153]:
df_ = df[list(base_features) + vv]
df_ = pd.melt(df_, id_vars=base_features, value_vars=vv)

In [170]:
refugee_dyads = df_[df_['value']>1000]['dyad.id'].unique()

In [175]:
print(f"There are {len(refugee_dyads)} dyads with more than 1000 refugees")

There are 315 dyads with more than 1000 refugees


In [176]:
df = df[df['dyad.id'].isin(refugee_dyads)]

In [177]:
df.shape

(1590, 376)

In [178]:
def featurize(row):
    res = variable_lookup[row.variable]
    row['year'] = res[0]
    row['feature'] = res[1]
    return row

def get_feats(df):
    df = df.apply(lambda row: featurize(row), axis=1)
    return df

In [179]:
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = mp.Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [180]:
print(f"There are {len(features.keys())} features to process.\n\nThey are {list(features.keys())}")

There are 24 features to process.

They are ['trade', 'ref.flow', 'alliance.defense', 'riv.strategic', 'rivalry.strategic.positional', 'rivalry.strategic.spatial', 'rivalry.strategic.ideological', 'rivalry.strategic.interv', 'contiguity', 'contiguity.any', 'min.distance', 'immigrant.population', 'pts.gradient', 'gdppc.gradient', 'polyarchy.additive.gradient', 'remittances', 'arms', 'remit', 'trips', 'arms.inverse', 'riv.strategic.positional', 'riv.strategic.ideological', 'riv.strategic.spatial', 'riv.strategic.interv']


In [181]:
for kk, vv in features.items():
    print(f"Trying {kk}")
    df_ = df[list(base_features) + vv]
    df_ = pd.melt(df_, id_vars=base_features, value_vars=vv)
    data = parallelize(df_, get_feats)
    data.rename(columns={'value': kk}, inplace=True)
    data.drop(labels=['variable','feature'], axis=1, inplace=True)
    data.to_csv(f'../refugee_data/s1_cleaned/{kk}.csv',index=False)
    del(data)
    del(df_)

Trying trade
Trying ref.flow
Trying alliance.defense
Trying riv.strategic
Trying rivalry.strategic.positional
Trying rivalry.strategic.spatial
Trying rivalry.strategic.ideological
Trying rivalry.strategic.interv
Trying contiguity
Trying contiguity.any
Trying min.distance
Trying immigrant.population
Trying pts.gradient
Trying gdppc.gradient
Trying polyarchy.additive.gradient
Trying remittances
Trying arms
Trying remit
Trying trips
Trying arms.inverse
Trying riv.strategic.positional
Trying riv.strategic.ideological
Trying riv.strategic.spatial
Trying riv.strategic.interv


In [234]:
files = glob('../refugee_data/s1_cleaned/*.csv')
d = pd.read_csv(files[0]).dropna()

In [235]:
for file in files[1:]:
    print(file)
    d_ = pd.read_csv(file).dropna()
    d = pd.merge(d, d_.replace('All',1990), left_on=list(base_features)+['year'], 
                 right_on=list(base_features)+['year'], how='outer')

../refugee_data/s1_cleaned/rivalry.strategic.positional.csv
../refugee_data/s1_cleaned/pts.gradient.csv
../refugee_data/s1_cleaned/riv.strategic.positional.csv
../refugee_data/s1_cleaned/trade.csv
../refugee_data/s1_cleaned/polyarchy.additive.gradient.csv
../refugee_data/s1_cleaned/alliance.defense.csv
../refugee_data/s1_cleaned/arms.inverse.csv
../refugee_data/s1_cleaned/immigrant.population.csv
../refugee_data/s1_cleaned/gdppc.gradient.csv
../refugee_data/s1_cleaned/ref.flow.csv
../refugee_data/s1_cleaned/riv.strategic.spatial.csv
../refugee_data/s1_cleaned/arms.csv
../refugee_data/s1_cleaned/rivalry.strategic.ideological.csv
../refugee_data/s1_cleaned/riv.strategic.csv
../refugee_data/s1_cleaned/remittances.csv
../refugee_data/s1_cleaned/contiguity.csv
../refugee_data/s1_cleaned/rivalry.strategic.interv.csv
../refugee_data/s1_cleaned/min.distance.csv
../refugee_data/s1_cleaned/remit.csv
../refugee_data/s1_cleaned/trips.csv
../refugee_data/s1_cleaned/rivalry.strategic.spatial.csv
../

In [236]:
d.shape

(9540, 32)

In [237]:
d.head()

Unnamed: 0,ccode1,ccode2,state.destination.name,state.destination.abb,state.origin.name,state.origin.abb,dyad.id,riv.strategic.interv,year,rivalry.strategic.positional,...,riv.strategic,remittances,contiguity,rivalry.strategic.interv,min.distance,remit,trips,rivalry.strategic.spatial,riv.strategic.ideological,contiguity.any
0,2,20,Canada,CAN,United States of America,USA,220,0.0,2016,,...,0.0,,,,,881.0,0.0,,0.0,
1,2,255,Germany,GMY,United States of America,USA,2255,0.0,2016,,...,0.0,,,,,0.0,0.0,,0.0,
2,20,255,Germany,GMY,Canada,CAN,20255,0.0,2016,,...,0.0,,,,,0.0,0.0,,0.0,
3,135,2,United States of America,USA,Peru,PER,1352,0.0,2016,,...,0.0,,,,,0.0,0.0,,0.0,
4,210,255,Germany,GMY,Netherlands,NTH,210255,0.0,2016,,...,0.0,,,,,0.0,0.0,,0.0,


Demonstrate that we have at least _some_ non nulls

In [239]:
d[(d['trade'].notnull()) & \
  (d['ref.flow'].notnull()) & \
  (d['remit'].notnull()) & \
  (d['trips'].notnull()) & \
  (d['arms.inverse'].notnull())] \
    [list(base_features) + ['trade','ref.flow','remit','trips','arms.inverse']].head()

Unnamed: 0,ccode1,ccode2,state.destination.name,state.destination.abb,state.origin.name,state.origin.abb,dyad.id,trade,ref.flow,remit,trips,arms.inverse
636,2,20,Canada,CAN,United States of America,USA,220,346062.59,236.0,0.0,0.0,2.4
637,2,255,Germany,GMY,United States of America,USA,2255,123181.04,0.0,0.0,0.0,0.0
638,20,255,Germany,GMY,Canada,CAN,20255,15889.159,0.0,0.0,0.0,0.0
639,135,2,United States of America,USA,Peru,PER,1352,9759.9238,0.0,0.0,0.0,1.28
640,210,255,Germany,GMY,Netherlands,NTH,210255,84728.898,1.0,0.0,8131865.0,0.0


Fix issues with `contiguity`, `contiguity.any` and `min.distance` since those are fixed over time. We set them to null except for 1990 so let's fill all other years with the values from 1990. 

In [240]:
def impute_dist_cont(row):
    if row.year==1990:
        return row
    else:
        imputer = d[(d['year']==1990)&(d['ccode1']==row.ccode1)&(d['ccode2']==row.ccode2)]
        row['contiguity.any'] = imputer['contiguity.any'].iloc[0]
        row['contiguity'] = imputer['contiguity'].iloc[0]
        row['min.distance'] = imputer['min.distance'].iloc[0]
        return row

In [241]:
d = d.apply(lambda row: impute_dist_cont(row), axis=1)

Let's check that this worked.

In [242]:
d['contiguity'].unique()

array([1., 0., 2., 4., 3., 5.])

In [243]:
d.groupby(['year','min.distance']).ccode1.count()

year  min.distance
1960  0.000000e+00    30
      1.410000e-12     2
      1.520000e-12     1
      1.760000e-12     1
      1.860000e-12     1
                      ..
2016  9.896475e+03     1
      9.911236e+03     1
      1.030244e+04     1
      1.054190e+04     1
      1.223704e+04     2
Name: ccode1, Length: 6420, dtype: int64

In [245]:
d.to_csv('../refugee_data/s1_process.csv', index=False)