# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [26]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# create models
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# load the data
df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')
df_imp = df.copy()

# encode data that is not numerical
encoder = LabelEncoder()
encoder.fit(df_imp['Region'])
df_imp['Region'] = encoder.transform(df_imp['Region'])
encoder.fit(df_imp['District'])
df_imp['District'] = encoder.transform(df_imp['District'])
encoder.fit(df_imp['Month'])
df_imp['Month'] = encoder.transform(df_imp['Month'])

# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)
    
# impute data
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# array_imp = imp.fit_transform(df_imp)
# df_imp = pd.DataFrame(array_imp, columns=df.columns)

imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)   

df_imp.head()

  df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')


Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0.0,0.0,13.0,1.05,4.0,2018.0,0.168,4.342,4000.0,0.0,...,1090175.0,297750.0,57.0,4000.0,5200.0,3200.0,57400.0,11.0,6.0,199.0
1,1.0,0.0,5.0,0.62,4.0,2018.0,0.132,6.065,8816.146932,0.0,...,3221842.0,1034149.0,96.505868,12674.026322,20302.198169,15778.178913,103823.463756,9.562667,415.870257,69.0
2,2.0,0.0,53.0,0.63,4.0,2018.0,0.047,7.915,15724.612735,0.0,...,1674934.0,300000.0,50.0,4729.703777,7000.0,6000.0,90000.0,15.0,253.0,24.0
3,3.0,0.0,73.0,1.1,4.0,2018.0,0.052,8.121,6000.0,0.0,...,1160475.0,410000.0,59.0,6000.0,7000.0,5500.0,100000.0,14.0,-32.184029,68.0
4,4.0,17.0,41.0,0.78,4.0,2018.0,0.0,2.247,11000.0,0.0,...,1366665.0,460000.0,92.0,6000.0,5000.0,5000.0,70000.0,14.0,400.0,663.0


In [27]:
df_imp.shape

(328856, 25)

In [28]:
# split data into training and testing data
columns = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures']
X_train, X_test, y_train, y_test = train_test_split(df_imp[columns], df_imp['Arrivals'], test_size=0.15)
X_train.head()
X_train.shape


(279527, 21)

With the data processed, we can now build some models.

In [29]:
def has_negative(values):
    for value in values:
        if value < 0:
            return True
    return False


X_train.head()
list(y_train)

[445.0,
 261.0,
 981.0,
 179.0,
 1618.0,
 362.76844434440136,
 1904.0,
 259.0,
 -28.937619134783745,
 970.7860896289349,
 475.0,
 816.0,
 261.0,
 113.69490417093039,
 1876.7046327143908,
 551.4815684556961,
 188.0,
 1399.0,
 1107.5497242063284,
 21.0,
 45.0,
 19.0,
 103.0,
 1288.0,
 600.1866604909301,
 -1229.489833213389,
 -8705.387980788946,
 560.0,
 11.0,
 873.6945407539606,
 -376.55490873008966,
 22.0,
 503.0,
 -646.4085885584354,
 610.1106046959758,
 7507.0,
 28.0,
 178.17591500282288,
 64.0,
 13.0,
 1028.9121128246188,
 -520.4672205671668,
 322.0,
 2302.8895513266325,
 1456.10358043015,
 281.7081555277109,
 2360.0,
 -744.6501461341977,
 145.11791390925646,
 -219.58592873066664,
 153.0,
 29.0,
 -473.3956663310528,
 204.0,
 116.59632451832294,
 376.2793243974447,
 27.0,
 574.0899833962321,
 46.0,
 -250.28704019635916,
 29.0,
 -250.93429297953844,
 587.0,
 16.0,
 420.34892243891954,
 136.0,
 2490.0,
 694.5805999711156,
 -64.83229380100965,
 637.0,
 36.0,
 888.0,
 100.0,
 176.0,
 559.

In [30]:
# fitting models
LR.fit(X_train, list(y_train))

# predicting
LR_pred = LR.predict(X_train)
# evaluate
print("on training data: ", mean_squared_error(list(y_train), LR_pred))


on training data:  4622056.836219689
