## Prepare Time Series Data for Map

* Normalize edge weights: Calculate the share of persons of concern based on the country's population
* Filter out data pre 2008
* Filter out `'Various/Unknown'` and `'Stateless'`

In [3]:
import pandas as pd
import numpy as np

In [10]:
# Input files, relative to the notebook's directory.
population_csv = '../data/world_population_by_year.csv'
timeseries_csv = '../data/unhcr_time_series_iso.csv'

# The population file's first four lines are skipped so that the fifth line
# is read as the header row.
df_population = pd.read_csv(population_csv, skiprows=4)
df_timeseries = pd.read_csv(timeseries_csv)

1. Get population from csv for each country / year
2. Calculate share

The origin codes printed below have no entry at all in the population file (`nan` stands for rows without an ISO origin code). For these countries, we manually substitute a population figure from another data source.

In [11]:
# Print every origin code of the time series that never appears as a
# 'Country Code' in the population table.
population_codes = df_population['Country Code']
for iso in df_timeseries['iso-origin'].unique():
    has_population = population_codes.eq(iso).any()
    if not has_population:
        print(iso)

nan
GUF
ANT
MSR
AIA


Add column for percentages.

In [12]:
# Start with an all-NaN 'share' column; later cells fill it in row by row.
df_timeseries = df_timeseries.assign(share=np.nan)

This is only performed once to create the new csv file, so performance is secondary.

In [13]:
# Compute share = value / population of the origin country in that year.
#
# The population table is wide: one row per country and one column per year,
# with the year as the column name (the same lookup the row-by-row version
# performed with country[str(year)]). It is reshaped to long form so that all
# edges can be matched in a single left merge instead of rescanning the whole
# time series once per country and writing one cell at a time.
year_columns = [col for col in df_population.columns if str(col).isdigit()]

population_long = (
    df_population
    .melt(id_vars=['Country Code'], value_vars=year_columns,
          var_name='year', value_name='population')
    .astype({'year': 'int64'})
    # Should a country code occur twice, the later row wins, exactly as the
    # later iteration of a loop over the population rows would overwrite.
    .drop_duplicates(subset=['Country Code', 'year'], keep='last')
)

# Left merge keeps one output row per edge, in the original edge order.
# Edges whose (origin, year) pair has no population entry get NaN rather than
# raising a KeyError for a year column that does not exist.
edge_population = (
    df_timeseries[['iso-origin', 'year']]
    .merge(population_long, how='left',
           left_on=['iso-origin', 'year'], right_on=['Country Code', 'year'])
    ['population']
    .to_numpy()
)
assert len(edge_population) == len(df_timeseries)

# Only rows from 1960 on whose origin exists in the population table are
# written; all other rows keep whatever 'share' they already had.
has_population_row = (
    (df_timeseries['year'] >= 1960)
    & df_timeseries['iso-origin'].isin(df_population['Country Code'].dropna())
)

edge_share = df_timeseries['value'].to_numpy() / edge_population
df_timeseries.loc[has_population_row, 'share'] = edge_share[has_population_row.to_numpy()]

**Missing shares**:

* SRB: Data only from 1990 on
* GUF: No data
* KWT: Missing data 1992 - 1994
* ANT: No data
* MSR: No data
* ERI: Missing data from 2012 on
* AIA: No data

In [16]:
# Origin codes that still lack a share, restricted to rows from 1960 on
# and excluding the "Various/Unknown" origin.
share_missing = df_timeseries['share'].isnull()
from_1960 = df_timeseries['year'] >= 1960
known_origin = df_timeseries['origin'] != "Various/Unknown"

df_timeseries.loc[share_missing & from_1960 & known_origin, 'iso-origin'].unique()

array(['SRB', 'GUF', 'KWT', 'ANT', 'MSR', 'ERI', 'AIA'], dtype=object)

Fix missing data

In [17]:
def fill_share_with_fixed_population(iso, population, first_year=None, last_year=None):
    """Set `share` to value / population for one origin country's rows.

    Operates in place on the notebook-level `df_timeseries`.

    Parameters
    ----------
    iso : str
        Value of 'iso-origin' selecting the rows to update.
    population : int or float
        Constant population figure used as the denominator for every
        selected row.
    first_year, last_year : int, optional
        Inclusive bounds on 'year'; a bound left as None is not applied.
    """
    rows = df_timeseries['iso-origin'] == iso
    if first_year is not None:
        rows &= df_timeseries['year'] >= first_year
    if last_year is not None:
        rows &= df_timeseries['year'] <= last_year
    df_timeseries.loc[rows, 'share'] = df_timeseries.loc[rows, 'value'] / population


# ANT
# Source: https://en.wikipedia.org/wiki/Netherlands_Antilles (Accessed February 12, 2019)
fill_share_with_fixed_population('ANT', 304759, first_year=1960)

# MSR
# The previous denominator, 296711, was the French Guiana figure (see GUF
# below) copied by mistake, which understated Montserrat's shares ~60-fold.
# 4922 is the 2011 census figure as cited on https://en.wikipedia.org/wiki/Montserrat
# NOTE(review): confirm the figure / preferred reference year against the source.
fill_share_with_fixed_population('MSR', 4922, first_year=1960)

# AIA
# Source: https://en.wikipedia.org/wiki/Anguilla (Accessed February 12, 2019)
fill_share_with_fixed_population('AIA', 14764, first_year=1960)

# GUF
# Source: https://en.wikipedia.org/wiki/French_Guiana (Accessed February 12, 2019)
fill_share_with_fixed_population('GUF', 296711, first_year=1960)

# KWT
# Average of 1991 and 1995 (world_population_by_year.csv)
fill_share_with_fixed_population('KWT', (2035661 + 1610651) / 2, first_year=1992, last_year=1994)

# SRB
# From 1990 (world_population_by_year.csv); applied to every row up to 1989,
# with no lower bound on the year.
fill_share_with_fixed_population('SRB', 7586000, last_year=1989)

# ERI
# From 2011 (world_population_by_year.csv)
fill_share_with_fixed_population('ERI', 4474690, first_year=2012)

In [18]:
# Re-run the missing-share check after the manual fixes: same three
# conditions (share is null, year from 1960 on, origin not "Various/Unknown").
still_missing = (
    df_timeseries['share'].isnull()
    & (df_timeseries['year'] >= 1960)
    & (df_timeseries['origin'] != "Various/Unknown")
)

df_timeseries.loc[still_missing, 'iso-origin'].unique()

array([], dtype=object)

Filter out data from before 2008 and rows that lack an ISO code for either the origin or the destination.

In [20]:
# Keep rows from 2008 on that carry an ISO code on both ends of the edge.
from_2008 = df_timeseries['year'] >= 2008
has_origin_iso = df_timeseries['iso-origin'].notnull()
has_destination_iso = df_timeseries['iso-destination'].notnull()

df_filtered = df_timeseries.loc[from_2008 & has_origin_iso & has_destination_iso]

df_filtered.head()

Unnamed: 0,year,origin,destination,type,value,iso-origin,iso-destination,share
81997,2008,Afghanistan,Afghanistan,Internally displaced persons,230670,AFG,AFG,0.008451298
81998,2008,Afghanistan,Afghanistan,Returned IDPs,6453,AFG,AFG,0.0002364253
81999,2008,Afghanistan,Eritrea,Asylum-seekers,1,AFG,ERI,3.663805e-08
82000,2008,Afghanistan,Iran (Islamic Rep. of),Refugees (incl. refugee-like situations),30,AFG,IRN,1.099141e-06
82001,2008,Afghanistan,Iran (Islamic Rep. of),Asylum-seekers,4,AFG,IRN,1.465522e-07


In [22]:
# Write the filtered, normalized edges; the DataFrame index is not written.
df_filtered.to_csv(
    '../data/unhcr_time_series_normalized.csv',
    index=False,
    encoding='utf-8',
)