# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [33]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# create models
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# load the data
df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')
df_imp = df.copy()

# encode data that is not numerical
encoder = LabelEncoder()
encoder.fit(df_imp['Region'])
df_imp['Region'] = encoder.transform(df_imp['Region'])
encoder.fit(df_imp['District'])
df_imp['District'] = encoder.transform(df_imp['District'])
encoder.fit(df_imp['Month'])
df_imp['Month'] = encoder.transform(df_imp['Month'])

# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)
    
# impute data
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# array_imp = imp.fit_transform(df_imp)
# df_imp = pd.DataFrame(array_imp, columns=df.columns)

imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)   

df_imp.head()

  df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')


Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0.0,0.0,13.0,1.05,4.0,2018.0,0.168,4.342,4000.0,0.0,...,1090175.0,297750.0,57.0,4000.0,5200.0,3200.0,57400.0,11.0,6.0,199.0
1,1.0,0.0,5.0,0.62,4.0,2018.0,0.132,6.065,8816.146932,0.0,...,3221842.0,1034149.0,96.505868,12674.026322,20302.198169,15778.178913,103823.463756,9.562667,415.870257,69.0
2,2.0,0.0,53.0,0.63,4.0,2018.0,0.047,7.915,15724.612735,0.0,...,1674934.0,300000.0,50.0,4729.703777,7000.0,6000.0,90000.0,15.0,253.0,24.0
3,3.0,0.0,73.0,1.1,4.0,2018.0,0.052,8.121,6000.0,0.0,...,1160475.0,410000.0,59.0,6000.0,7000.0,5500.0,100000.0,14.0,-32.184029,68.0
4,4.0,17.0,41.0,0.78,4.0,2018.0,0.0,2.247,11000.0,0.0,...,1366665.0,460000.0,92.0,6000.0,5000.0,5000.0,70000.0,14.0,400.0,663.0


In [34]:
df_imp.shape

(328856, 25)

In [35]:
# split data into training and testing data
columns = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures']
X_train, X_test, y_train, y_test = train_test_split(df_imp[columns], df_imp['Arrivals'], test_size=0.15)
X_train.head()
X_train.shape


(279527, 21)

With the data processed, we can now build some models.

In [36]:
def has_negative(values):
    for value in values:
        if value < 0:
            return True
    return False


X_train.head()
list(y_train)

[292.27444928884506,
 623.0,
 580.0,
 969.0,
 417.87752908468246,
 66.0,
 257.0,
 39314.0,
 722.1557554900646,
 452.0,
 1148.0,
 123.09566090255976,
 450.0,
 -503.3073698505759,
 21.0,
 1122.1815052777529,
 259.0,
 988.8415446951985,
 603.0,
 136.0,
 307.4288030192256,
 2590.0,
 95.0,
 447.0,
 97.0,
 50.647932939231396,
 451.9927923157811,
 1373.3028630837798,
 259.0,
 131.0,
 1274.8334339037538,
 769.2380238324404,
 830.0,
 -183.96964418143034,
 841.6423884630203,
 1232.0,
 17.0,
 6.0,
 87.0,
 10.0,
 1285.2347754836082,
 2781.0,
 298.6791302636266,
 687.0191293358803,
 1140.9950025305152,
 174.9833825454116,
 858.6078008264303,
 94.0,
 75.75029297173023,
 696.4150807708502,
 33.0,
 221.0,
 8.0,
 677.0,
 1288.0,
 539.0437263101339,
 -245.28921725600958,
 136.0,
 256.0,
 820.0217649117112,
 463.7859693095088,
 178.0,
 603.0,
 3091.2344495505095,
 19.0,
 559.2955532222986,
 249.0,
 120.0,
 830.0,
 998.0,
 -220.6894744336605,
 254.0,
 142.0,
 345.0,
 -481.2401549965143,
 480.1994605511427

In [39]:
# fitting models
LR.fit(X_train, list(y_train))

# predicting
LR_pred = LR.predict(X_test)
# evaluate
print("on training data: ", mean_squared_error(list(y_test), LR_pred, squared=False))

# caluclate R^2
r2_score(list(y_test), LR_pred)

on training data:  2161.2532638899884


0.7316947200444546

Studies that try to explain human behavior generally have R^2 values lower than 50%