# Classifying Refugee Generating Countries
### Objective: Build model to predict whether a country will or will not gerenate refugees.

In [1]:
import pandas as pd

dataset = pd.read_csv('/Users/joel/Documents/Brainstation/Tablaeu/CLEANDATA/clean_merged_all.csv').drop('Unnamed: 0', axis=1)
refugees = pd.read_csv('/Users/joel/Documents/Brainstation/Tablaeu/immigrantbyorigin.csv')

In [15]:
### Getting titles for each heading:

fields = []

for i,j in enumerate(dataset.columns):
    if i>=1 and i<=40:
        fields.append(j)
        
titles = {}

for field in fields:
            path = '/Users/joel/Documents/Brainstation/Tablaeu/UNINFO/'+field+'.csv'
            file = pd.read_csv(path, encoding='ISO-8859-1')
            
            heading = field
            title = file.columns[-1]

            titles[heading] = title
titles

{'Demography4': 'Population, ages 15\x9664 (millions)',
 'Demography5': 'Population, ages 65 and older (millions)',
 'Demography6': 'Population, total (millions)',
 'Demography7': 'Population, under age 5 (millions)',
 'Demography8': 'Population, urban (%)',
 'Education2': 'Expected years of schooling (years)',
 'Education3': 'Education Index',
 'Education9': 'Mean years of schooling (years)',
 'Environment3': 'Forest area (% of total land area)',
 'Equality11': 'Inequality-adjusted life expectancy index',
 'Equality7': 'Inequality in life expectancy (%)',
 'Gender1': 'Adolescent birth rate (births per 1,000 women ages 15-19)',
 'Gender11': 'Labour force participation rate, female (% ages 15 and older)',
 'Gender12': 'Labour force participation rate, male (% ages 15 and older)',
 'Gender14': 'Life expectancy at birth, female (years)',
 'Gender15': 'Life expectancy at birth, male (years)',
 'Gender16': 'Maternal mortality ratio (deaths per 100,000 live births)',
 'Gender17': 'Mean years

In [3]:
### Adding the sum of all arrivals to dataset

sums = []

for index in dataset.index:
    sums.append(dataset.loc[index,'Alabama':].sum(0))
    
dataset['sum_destination'] = sums

In [4]:
###adding classes to data to denote refugee generating countries (and non-refugee generating countries)

classes = []

for index in dataset.index:
    if dataset.loc[index,'sum_destination']>0:
        classes.append(1)
    else:
        classes.append(0)
        
dataset['classes'] = classes

In [5]:
import numpy as np

### We'll be validating on 2015 data. So first split the set by year = 2015

df_fifteen = dataset.loc[dataset['year'] == 2015]
df_else = dataset.loc[dataset['year'] != 2015]

### Now split these sets into dependent and independent variables

X_test = df_fifteen.loc[:,'Demography4':'Work8']
X_train = df_else.loc[:,'Demography4':'Work8']
y_test = df_fifteen.loc[:,'classes']
y_train = df_else.loc[:,'classes']

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

In [7]:
### Build pipeline LOGISTIC
estimators = [
    ('model', RandomForestClassifier(n_estimators=200))
]

pipe = Pipeline(estimators)

In [8]:
### fit pipe

pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [14]:
pipe.score(X_test, y_test)

0.8950276243093923

### Our model is able to correctly predict whether a country will generate refugees with .9 accuracy using a RandomForest classifier. Though this binary outcome has some relevant application in the real world resettlement process, it's not nearly as useful as knowing exactly how many people from an origin country will resttle to a destination city or state.

In our case here, Randomforest outperforms boosting methods