## Add ISO Codes

Adds the official ISO 3166-1 alpha-3 country codes to the time series data set.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lookup tables used further down to translate country names into codes.
df_country_list = pd.read_csv('../data/country_list.csv')

df_countries = pd.read_csv('../data/UNHCR_country_names.csv')
df_countries.columns = ["iso", "unhcr", "display_titles", "display_article", "notes"]

# The export starts with three preamble lines before the header row. `Value` is
# read as text because it contains the non-numeric placeholder '*'.
df_timeseries = pd.read_csv(
    '../data/unhcr_popstats_export_time_series_all_data.csv',
    skiprows=3,
    encoding='latin-1',
    dtype={"Value": object},
)

# Substitute the '*' placeholder with 2 so the column can be cast to integers.
# NOTE(review): '*' presumably marks a small redacted count and 2 is a stand-in
# estimate -- confirm against the data provider's documentation.
# The replacement is limited to `Value`; a frame-wide replace would also rewrite
# any other cell that happens to equal '*'.
df_timeseries['Value'] = df_timeseries['Value'].replace('*', '2').astype(np.int64)
df_timeseries.columns = ["year", "destination", "origin", "type", "value"]

Rename countries so that their names match the ones used in `UNHCR_country_names.csv`.

In [3]:
# Harmonise country names with the spelling used by the lookup tables.
#
# The columns are reassigned instead of calling `replace(..., inplace=True)` on
# `df_timeseries[col]`: an in-place call on a column selection is not guaranteed
# to write back to the frame (it warns on recent pandas and is a no-op under
# Copy-on-Write).

# Renames applied to both the destination and the origin column.
# NOTE(review): "Dem. People's Rep. of Korea" is renamed to "Rep. of Korea", so
# both names share a single entry (and therefore a single code) from here on.
# Confirm this merge is intended and not a workaround for a missing lookup row.
shared_renames = {
    "Palestinian": "State of Palestine",
    "Dem. People's Rep. of Korea": "Rep. of Korea",
}

# Renames that only occur in the origin column.
origin_only_renames = {
    "Tibetan": "China",  # No international country code for Tibet
    "Holy See (the)": "Holy See",
}

df_timeseries['destination'] = df_timeseries['destination'].replace(shared_renames)
df_timeseries['origin'] = df_timeseries['origin'].replace(
    {**shared_renames, **origin_only_renames}
)

In [10]:
# Attach country codes to the time series as the new columns `iso-origin` and
# `iso-destination`.
#
# Codes are looked up by country name in two passes:
#   1. UNHCR codes from country_list.csv (`name_en` -> `country_code`)
#   2. codes from UNHCR_country_names.csv (`unhcr` -> `iso`), which overwrite the
#      first pass wherever the name is listed there
# followed by a handful of manual corrections.


def _name_to_code(lookup, name_column, code_column):
    """Return a ``{country name: code}`` dict built from two columns of ``lookup``.

    Rows without a name are skipped. If a name occurs more than once, the last
    row wins (the same outcome as assigning row by row in file order).
    """
    named = lookup.dropna(subset=[name_column])
    return dict(zip(named[name_column], named[code_column]))


# Use UNHCR codes from country_list.csv
unhcr_codes = _name_to_code(df_country_list, 'name_en', 'country_code')
# Overwrite codes with UNHCR_country_names.csv when possible
official_codes = _name_to_code(df_countries, 'unhcr', 'iso')

# Manual fixes: codes from the lookup tables that get replaced by another code.
ISO_CODE_FIXES = {
    "WES": "WSM",  # Samoa
    "SEY": "SYC",  # Seychelles
    "WSH": "ESH",  # Western Sahara
    "PUE": "PRI",  # Puerto Rico
    "SMA": "SMR",  # San Marino
    "BER": "BMU",  # Bermuda
    "FPO": "PYF",  # French Polynesia
    "FNC": "NCL",  # New Caledonia
}

# Manual fixes: origins whose code is set directly from the country name.
ORIGIN_CODE_OVERRIDES = {
    'Guadeloupe': 'GLP',
    'Norfolk Island': 'NFK',
    'Svalbard and Jan Mayen': 'SJM',
    'Saint-Pierre-et-Miquelon': 'SPM',
    'American Samoa': 'ASM',
    'Guam': 'GUM',
}

# Origin is processed first so that `iso-origin` is created before `iso-destination`.
for name_column, iso_column in (('origin', 'iso-origin'), ('destination', 'iso-destination')):
    names = df_timeseries[name_column]
    codes = names.map(unhcr_codes)
    # Every name present in the second table takes that table's code, even an
    # empty one; all other rows keep the code from the first pass.
    in_official = names.isin(list(official_codes))
    codes = codes.where(~in_official, names.map(official_codes))
    # Assign the result back to the frame; an in-place replace on a column
    # selection is not guaranteed to modify `df_timeseries`.
    df_timeseries[iso_column] = codes.replace(ISO_CODE_FIXES)

for country_name, iso_code in ORIGIN_CODE_OVERRIDES.items():
    df_timeseries.loc[df_timeseries['origin'] == country_name, 'iso-origin'] = iso_code

# Matched by prefix rather than by the full name. `na=False` keeps the mask
# boolean even if an origin is missing.
df_timeseries.loc[df_timeseries['origin'].str.startswith('Wallis', na=False), 'iso-origin'] = 'WLF'

df_timeseries.head()

Unnamed: 0,year,destination,origin,type,value,iso-origin,iso-destination
0,1951,Australia,Various/Unknown,Refugees (incl. refugee-like situations),180000,,AUS
1,1951,Austria,Various/Unknown,Refugees (incl. refugee-like situations),282000,,AUT
2,1951,Belgium,Various/Unknown,Refugees (incl. refugee-like situations),55000,,BEL
3,1951,Canada,Various/Unknown,Refugees (incl. refugee-like situations),168511,,CAN
4,1951,"China, Hong Kong SAR",Various/Unknown,Refugees (incl. refugee-like situations),30000,,HKG


Check whether any country is still missing a code (apart from the placeholders `'Various/Unknown'` and `'Stateless'`).

→ Both checks below return no rows, so every country has a code.

In [11]:
# Rows with a named destination that still has no code.
df_timeseries[
    df_timeseries['iso-destination'].isna()
    & df_timeseries['destination'].ne("Various/Unknown")
]

Unnamed: 0,year,destination,origin,type,value,iso-origin,iso-destination


In [12]:
# Rows with a named origin that still has no code; the two placeholder origins are ignored.
df_timeseries[
    df_timeseries['iso-origin'].isna()
    & ~df_timeseries['origin'].isin(["Various/Unknown", "Stateless"])
]

Unnamed: 0,year,destination,origin,type,value,iso-origin,iso-destination


In [13]:
# Write the enriched time series next to the input data (UTF-8, without the row index).
df_timeseries.to_csv(
    path_or_buf='../data/unhcr_time_series_iso.csv',
    index=False,
    encoding='utf-8',
)