# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [44]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# create models
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# load the data
df = pd.read_csv('data/combined_data.csv')
df_imp = df.copy()

# encode data that is not numerical
encoder = LabelEncoder()
encoder.fit(df_imp['Region'])
df_imp['Region'] = encoder.transform(df_imp['Region'])
encoder.fit(df_imp['District'])
df_imp['District'] = encoder.transform(df_imp['District'])
encoder.fit(df_imp['Month'])
df_imp['Month'] = encoder.transform(df_imp['Month'])

# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)


df_new = df_imp.copy()


  df = pd.read_csv('data/combined_data.csv')


In [46]:
df_new[features].isna().sum()


CDI                      3992
Month                       0
Year                        0
NDVI                     2951
Rainfall                   75
Water Price            138636
Conflict Fatalities    155030
Conflict Incidents     155030
Cholera Deaths         326472
Cholera Cases          238536
Malaria                187613
Measles                197325
Cost Min Basket        146660
Goat Price              82937
Goat to Cereal         107508
Maize Price            149816
Rice Price              81109
Sorghum Price          144088
Wage Price              82700
Wage to Cereal         107266
Departures              88573
Arrivals               137440
dtype: int64

Based on the amount of data marked NaN, we need to impute our data

In [None]:
# impute data
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# array_imp = imp.fit_transform(df_imp)
# df_imp = pd.DataFrame(array_imp, columns=df.columns)

imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)   

df_imp.head()

In [116]:
important_features_arrivals = ["Conflict Fatalities", "Conflict Incidents", "Goat Price", "Water Price", "Rainfall", "Arrivals"]
important_features = ["Conflict Fatalities", "Conflict Incidents", "Goat Price", "Water Price", "Rainfall"]

df_new_dropped = df_new[important_features_arrivals].dropna()
df_new_dropped

Unnamed: 0,Goat Price,Water Price,Rainfall,Arrivals
0,297750,4000.0,4.342,199.0
3,410000,6000.0,8.121,68.0
4,460000,11000.0,2.247,663.0
6,378600,13000.0,2.106,110.0
7,450000,10000.0,2.120,816.0
...,...,...,...,...
328777,914000,10000.0,61.321,82.0
328778,914000,10000.0,61.321,82.0
328779,914000,10000.0,61.321,82.0
328780,914000,10000.0,61.321,82.0


In [117]:
# split data into training and testing data
columns = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal']


X_train, X_test, y_train, y_test = train_test_split(df_new_dropped[important_features], df_new_dropped['Arrivals'], test_size=0.15)
X_train.head()


Unnamed: 0,Goat Price,Water Price,Rainfall
272914,402500,10000.0,12.982
126632,932500,16000.0,5.634
29948,1475000,15000.0,6.264
53612,825000,50000.0,1.095
46032,2475000,36500.0,2.141


With the data processed, we can now build some models.

In [88]:
def has_negative(values):
    for value in values:
        if value < 0:
            return True
    return False


X_train.head()
X_train.shape


(232, 20)

In [119]:
# fitting models
LR.fit(X_train, list(y_train))

# predicting
LR_pred = LR.predict(X_test)
# evaluate
print("on training data: ", mean_squared_error(list(y_test), LR_pred, squared=False))

# caluclate R^2
r2_score(list(y_test), LR_pred)

LR.score(X_test, y_test)


on training data:  4798.782980616257


0.014225411377253194

Studies that try to explain human behavior generally have R^2 values lower than 50%