# Imports

In [23]:
import pandas as pd
import numpy as np
import pdb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import folium
from folium.plugins import HeatMap

# Mount to Google Drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# saved in Drive can be read by path. force_remount=True asks Colab to redo the
# mount instead of reusing one left over from an earlier run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Save the dataset to Google Drive, then confirm its location with the command below

In [24]:
# Sanity check before loading: `ls` echoes the path back only if the file exists there.
!ls drive/My\ Drive/Save\ the\ Children/unhcr.csv

'drive/My Drive/Save the Children/unhcr.csv'


# Load the dataset from its path (adjust to wherever it was saved) into a pandas DataFrame

## UNHCR Dataset (Afghanistan; change for the location being investigated)

In [4]:
# Relative path: it resolves only when the working directory is the parent of
# the `drive` mount point. Adjust it if the CSV was saved somewhere else.
import_path = 'drive/My Drive/Save the Children/unhcr.csv'
demographics_df = pd.read_csv(import_path)
# Preview the first rows. Row 0 holds hashtag metadata (e.g. '#date+year',
# apparently HXL tags) rather than data.
demographics_df.head()

Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Population Type,location,urbanRural,accommodationType,Female 0-4,Female 5-11,Female 12-17,Female 18-59,Female 60 or more,Female Unknown,Female Total,Male 0-4,Male 5-11,Male 12-17,Male 18-59,Male 60 or more,Male Unknown,Male Total,Total
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+population_type,,,,#affected+f+infants+age_0_4,#affected+f+children+age_5_11,#affected+f+adolescents+age_12_17,#affected+f+adults+age_18_59,#affected+f+elderly+age_60,#affected+f+unknown_age,#affected+f+total,#affected+m+infants+age_0_4,#affected+m+children+age_5_11,#affected+m+adolescents+age_12_17,#affected+m+adults+age_18_59,#affected+m+elderly+age_60,#affected+m+unknown_age,#affected+m+total,#affected+all+total
1,2001,AFG,AFG,Afghanistan,Afghanistan,IDP,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1200000
2,2001,AFG,AFG,Afghanistan,Afghanistan,RET,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26092
3,2001,AFG,EGY,Afghanistan,Egypt,ASY,,,,0,0,0,0,0,5,5,0,0,0,0,0,16,16,21
4,2001,AFG,EGY,Afghanistan,Egypt,REF,,,,5,0,0,9,0,0,14,5,0,0,42,0,5,52,66


In [5]:
# Row 0 of the raw export holds hashtag metadata (e.g. '#date+year'), not data.
# Select it by content rather than by position: a positional drop of index[0]
# removes one more genuine record every time this cell is re-run.
# .copy() keeps later column assignments from raising SettingWithCopyWarning.
demographics_df = demographics_df[
    ~demographics_df["Year"].astype(str).str.startswith("#")
].copy()
demographics_df.head()

Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Population Type,location,urbanRural,accommodationType,Female 0-4,Female 5-11,Female 12-17,Female 18-59,Female 60 or more,Female Unknown,Female Total,Male 0-4,Male 5-11,Male 12-17,Male 18-59,Male 60 or more,Male Unknown,Male Total,Total
1,2001,AFG,AFG,Afghanistan,Afghanistan,IDP,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1200000
2,2001,AFG,AFG,Afghanistan,Afghanistan,RET,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26092
3,2001,AFG,EGY,Afghanistan,Egypt,ASY,,,,0,0,0,0,0,5,5,0,0,0,0,0,16,16,21
4,2001,AFG,EGY,Afghanistan,Egypt,REF,,,,5,0,0,9,0,0,14,5,0,0,42,0,5,52,66
5,2001,AFG,ARG,Afghanistan,Argentina,ASY,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Cleaning data

def clean_convert_unhcr_data(col, df=None):
    '''Coerce one demographic count column to integers, in place.

    Missing values become 0. Values stored as text are parsed numerically, so
    any scientific-notation string (e.g. '1e+05', '2e+05') is handled rather
    than only the single literal '1e+05'.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``demographics_df``,
        which keeps existing one-argument calls working unchanged.

    Raises
    ------
    ValueError
        If the column contains text that cannot be parsed as a number.
    '''
    if df is None:
        df = demographics_df
    # Parse first, then fill: to_numeric leaves missing entries as NaN, and
    # NaN cannot be cast to int.
    df[col] = pd.to_numeric(df[col]).fillna(0).astype(int)
# Count columns to coerce: the female age bands, the male age bands, then the
# overall total.
columns = (
    ['Female 0-4', 'Female 5-11', 'Female 12-17', 'Female 18-59',
     'Female 60 or more', 'Female Unknown', 'Female Total']
    + ['Male 0-4', 'Male 5-11', 'Male 12-17', 'Male 18-59',
       'Male 60 or more', 'Male Unknown', 'Male Total']
    + ['Total']
)
for col in columns:
    clean_convert_unhcr_data(col)

# Give the Year column a numeric dtype as well.
demographics_df["Year"] = pd.to_numeric(demographics_df.Year)


In [11]:
# The country name columns are kept, so the matching code columns are removed.
demographics_df = demographics_df.drop(
    columns=["Country of Origin Code", "Country of Asylum Code"]
)
demographics_df.head()

Unnamed: 0,Year,Country of Origin Name,Country of Asylum Name,Population Type,location,urbanRural,accommodationType,Female 0-4,Female 5-11,Female 12-17,Female 18-59,Female 60 or more,Female Unknown,Female Total,Male 0-4,Male 5-11,Male 12-17,Male 18-59,Male 60 or more,Male Unknown,Male Total,Total
1,2001,Afghanistan,Afghanistan,IDP,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1200000
2,2001,Afghanistan,Afghanistan,RET,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26092
3,2001,Afghanistan,Egypt,ASY,,,,0,0,0,0,0,5,5,0,0,0,0,0,16,16,21
4,2001,Afghanistan,Egypt,REF,,,,5,0,0,9,0,0,14,5,0,0,42,0,5,52,66
5,2001,Afghanistan,Argentina,ASY,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Label the country of origin "Name", the same column label the IDMC data uses.
demographics_df = demographics_df.rename(
    {"Country of Origin Name": "Name"}, axis="columns"
)

## IDMC Dataset

In [7]:
# NOTE: this reuses `import_path`, overwriting the UNHCR path assigned earlier.
import_path = 'drive/My Drive/Save the Children/idmc.csv' # Path might change depending on where the dataset is saved
idmc_df = pd.read_csv(import_path)
# Preview the first rows. As with the UNHCR file, row 0 holds hashtag metadata
# (e.g. '#country+code') rather than data.
idmc_df.head()

Unnamed: 0,ISO3,Name,Year,Conflict Stock Displacement,Conflict New Displacements,Disaster New Displacements,Disaster Stock Displacement
0,#country+code,#country+name,#date+year,#affected+idps+ind+stock+conflict,#affected+idps+ind+newdisp+conflict,#affected+idps+ind+newdisp+disaster,#affected+idps+ind+stock+disaster
1,AB9,Abyei Area,2014,20000,,,
2,AB9,Abyei Area,2015,82000,,,
3,AB9,Abyei Area,2016,20000,,,
4,AB9,Abyei Area,2017,31000,,,


In [8]:
# Data cleaning
# Remove the hashtag metadata row (ISO3 == '#country+code') by content rather
# than by position, so re-running this cell cannot drop a real record.
idmc_df = idmc_df[~idmc_df["ISO3"].astype(str).str.startswith("#")].copy()
idmc_df = idmc_df.fillna(0)
# The metadata row makes these columns load as text, so convert them to numbers.
# Year is included so that it has the same numeric dtype as the UNHCR Year
# column when the two frames are combined.
idmc_numeric_columns = ["Year", "Conflict Stock Displacement", "Conflict New Displacements", "Disaster New Displacements", "Disaster Stock Displacement"]
idmc_df[idmc_numeric_columns] = idmc_df[idmc_numeric_columns].apply(pd.to_numeric)

# Case study --> Afghanistan (Can be changed if investigating different location)

idmc_df_af = idmc_df[idmc_df.Name == "Afghanistan"]


In [15]:
# The country is already identified by the Name column, so drop the ISO3 code.
idmc_df_af = idmc_df_af.drop(columns="ISO3")
idmc_df_af

Unnamed: 0,Name,Year,Conflict Stock Displacement,Conflict New Displacements,Disaster New Displacements,Disaster Stock Displacement
7,Afghanistan,2008,0,0,3400,0
8,Afghanistan,2009,297000,0,28000,0
9,Afghanistan,2010,352000,102000,71000,0
10,Afghanistan,2011,450000,186000,3000,0
11,Afghanistan,2012,492000,100000,30000,0
12,Afghanistan,2013,631000,124000,15000,0
13,Afghanistan,2014,805000,156000,13000,0
14,Afghanistan,2015,1174000,335000,71000,0
15,Afghanistan,2016,1553000,653000,7400,0
16,Afghanistan,2017,1286000,474000,27000,0


# Merge Datasets after Preprocessing

In [17]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. This stacks rows and is not a key-based join:
# columns missing from one frame are filled with NaN, and the original index
# labels of both frames are kept.
result = pd.concat([demographics_df, idmc_df_af], sort=False)

In [22]:
# Display the combined frame; pandas abbreviates long frames to their first and
# last rows, which shows the UNHCR rows followed by the IDMC rows.
result

Unnamed: 0,Year,Name,Country of Asylum Name,Population Type,location,urbanRural,accommodationType,Female 0-4,Female 5-11,Female 12-17,Female 18-59,Female 60 or more,Female Unknown,Female Total,Male 0-4,Male 5-11,Male 12-17,Male 18-59,Male 60 or more,Male Unknown,Male Total,Total,Conflict Stock Displacement,Conflict New Displacements,Disaster New Displacements,Disaster Stock Displacement
1,2001,Afghanistan,Afghanistan,IDP,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1200000.0,,,,
2,2001,Afghanistan,Afghanistan,RET,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26092.0,,,,
3,2001,Afghanistan,Egypt,ASY,,,,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,16.0,16.0,21.0,,,,
4,2001,Afghanistan,Egypt,REF,,,,5.0,0.0,0.0,9.0,0.0,0.0,14.0,5.0,0.0,0.0,42.0,0.0,5.0,52.0,66.0,,,,
5,2001,Afghanistan,Argentina,ASY,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,2015,Afghanistan,,,,,,,,,,,,,,,,,,,,,1174000.0,335000.0,71000.0,0.0
15,2016,Afghanistan,,,,,,,,,,,,,,,,,,,,,1553000.0,653000.0,7400.0,0.0
16,2017,Afghanistan,,,,,,,,,,,,,,,,,,,,,1286000.0,474000.0,27000.0,0.0
17,2018,Afghanistan,,,,,,,,,,,,,,,,,,,,,2598000.0,372000.0,435000.0,0.0
