# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [80]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Instantiate one estimator of each kind up front.
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# Read the combined dataset from disk.
# NOTE(review): this is an absolute, machine-specific path -- the notebook will
# not run elsewhere until it is pointed at a shared/relative data location.
df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')

# Fill every missing cell with the most frequent value of its column, then wrap
# the resulting array back into a DataFrame carrying the original column names.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
array_imp = imp.fit_transform(df)
df_imp = pd.DataFrame(array_imp, columns=df.columns)

# Replace the non-numerical columns with label codes. The same encoder object is
# re-fitted for each column in turn, so afterwards it only remembers the
# mapping of the last column ('Month').
encoder = LabelEncoder()
for column in ('Region', 'District', 'Month'):
    df_imp[column] = encoder.fit_transform(df_imp[column])

df_imp.head()

  df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')


Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0,0,13,1.05,4,2018,0.168,4.342,4000.0,0.0,...,1090175.0,297750,57.0,4000,5200,3200,57400,11.0,6.0,199.0
1,1,0,5,0.62,4,2018,0.132,6.065,30000.0,0.0,...,0.0,850000,63.0,10000,16000,8000,100000,7.0,24.0,69.0
2,2,0,53,0.63,4,2018,0.047,7.915,30000.0,0.0,...,0.0,300000,50.0,10000,7000,6000,90000,15.0,253.0,24.0
3,3,0,73,1.1,4,2018,0.052,8.121,6000.0,0.0,...,1160475.0,410000,59.0,6000,7000,5500,100000,14.0,24.0,68.0
4,4,17,41,0.78,4,2018,0.0,2.247,11000.0,0.0,...,1366665.0,460000,92.0,6000,5000,5000,70000,14.0,400.0,663.0


In [81]:
# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    # Remove thousands separators ("1,234" -> "1234") from string cells only;
    # cells that are already numbers pass through untouched.
    without_commas = df_imp[feature].map(lambda x: x.replace(',', '') if isinstance(x, str) else x)
    # Stripping commas alone still leaves str values behind, so do the actual
    # numeric conversion here. pd.to_numeric raises on any cell that is not a
    # valid number, which surfaces dirty data in this cell instead of later
    # during model fitting.
    df_imp[feature] = pd.to_numeric(without_commas)

df_imp.head()

Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0,0,13,1.05,4,2018,0.168,4.342,4000.0,0.0,...,1090175.0,297750,57.0,4000,5200,3200,57400,11.0,6.0,199.0
1,1,0,5,0.62,4,2018,0.132,6.065,30000.0,0.0,...,0.0,850000,63.0,10000,16000,8000,100000,7.0,24.0,69.0
2,2,0,53,0.63,4,2018,0.047,7.915,30000.0,0.0,...,0.0,300000,50.0,10000,7000,6000,90000,15.0,253.0,24.0
3,3,0,73,1.1,4,2018,0.052,8.121,6000.0,0.0,...,1160475.0,410000,59.0,6000,7000,5500,100000,14.0,24.0,68.0
4,4,17,41,0.78,4,2018,0.0,2.247,11000.0,0.0,...,1366665.0,460000,92.0,6000,5000,5000,70000,14.0,400.0,663.0


In [95]:
# split data into training and testing data
# The target column ('Arrivals') is removed from the predictors *before* the
# split, so the feature frames never contain the label and no in-place
# mutation of the split results is needed afterwards.
# random_state pins the shuffle so the split (and every score computed from
# it) is reproducible across kernel restarts.
# NOTE(review): the printed column list below still contains 'Unnamed: 0',
# which looks like a saved row index -- confirm it is meant to be a predictor.
X_train, X_test, y_train, y_test = train_test_split(
    df_imp.drop(columns=['Arrivals']),
    df_imp['Arrivals'],
    test_size=0.25,
    random_state=42,
)

print(X_train.columns)
print(X_test.columns)

Index(['Unnamed: 0', 'Region', 'District', 'CDI', 'Month', 'Year', 'NDVI',
       'Rainfall', 'Water Price', 'Conflict Fatalities', 'Conflict Incidents',
       'Cholera Deaths', 'Cholera Cases', 'Malaria', 'Measles',
       'Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price',
       'Rice Price', 'Sorghum Price', 'Wage Price', 'Wage to Cereal',
       'Departures'],
      dtype='object')
Index(['Unnamed: 0', 'Region', 'District', 'CDI', 'Month', 'Year', 'NDVI',
       'Rainfall', 'Water Price', 'Conflict Fatalities', 'Conflict Incidents',
       'Cholera Deaths', 'Cholera Cases', 'Malaria', 'Measles',
       'Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price',
       'Rice Price', 'Sorghum Price', 'Wage Price', 'Wage to Cereal',
       'Departures'],
      dtype='object')


With the data processed, we can now build some models.

In [96]:

# Train the linear regression on the training split and predict the held-out
# rows in one statement (fit() returns the fitted estimator itself).
LR_pred = LR.fit(X_train, y_train).predict(X_test)

# Keep the true targets as a plain list so they can be indexed by position
# (0..n-1) alongside the prediction array.
correctLabels = [label for label in y_test]

# Show the raw predictions.
print(LR_pred)

def evaluate(predictions, labels, tolerance=0.2):
    """Return the fraction of predictions within a relative band around their label.

    A prediction counts as correct when it lies between
    ``(1 - tolerance) * label`` and ``(1 + tolerance) * label`` (inclusive).
    With the default ``tolerance=0.2`` this is the 0.8x-1.2x band.

    Parameters
    ----------
    predictions : sequence of numbers
        Predicted values, indexable by position. Entries beyond
        ``len(labels)`` are ignored.
    labels : sequence of numbers
        True values, indexable by position.
    tolerance : float, optional
        Half-width of the acceptance band as a fraction of the label.

    Returns
    -------
    float
        Share of labels whose prediction falls inside the band, in [0, 1].
        Returns 0.0 when ``labels`` is empty instead of dividing by zero.

    Raises
    ------
    IndexError
        If ``predictions`` has fewer entries than ``labels``.
    """
    total = len(labels)
    if total == 0:
        # Nothing to score; avoid ZeroDivisionError.
        return 0.0

    def within_band(prediction, label):
        # For a negative label, (1 - tolerance) * label is the *larger* bound,
        # so order the two bounds explicitly instead of assuming label >= 0.
        bound_a = (1 - tolerance) * label
        bound_b = (1 + tolerance) * label
        return min(bound_a, bound_b) <= prediction <= max(bound_a, bound_b)

    correct = sum(1 for i in range(total) if within_band(predictions[i], labels[i]))
    return correct / total

# Score the linear regression: the displayed value is the fraction of test
# predictions that fall between 0.8x and 1.2x of the true label.
evaluate(LR_pred, correctLabels)



[-235.11949116 6759.58495572  386.41030251 ... -187.63087662   43.67898199
 1798.74352713]


0.06787165203006787