# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [129]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# create models
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# load the data
df = pd.read_csv('data/combined_data.csv')
df_imp = df.copy()

# encode data that is not numerical
encoder = LabelEncoder()
encoder.fit(df_imp['Region'])
df_imp['Region'] = encoder.transform(df_imp['Region'])
encoder.fit(df_imp['District'])
df_imp['District'] = encoder.transform(df_imp['District'])
encoder.fit(df_imp['Month'])
df_imp['Month'] = encoder.transform(df_imp['Month'])

# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

#ALTERNATE DATAFRAME
df_new = df_imp.copy()


  df = pd.read_csv('data/combined_data.csv')


In [149]:
df_imp[features].isna().sum()


CDI                    0
Month                  0
Year                   0
NDVI                   0
Rainfall               0
Water Price            0
Conflict Fatalities    0
Conflict Incidents     0
Cholera Deaths         0
Cholera Cases          0
Malaria                0
Measles                0
Cost Min Basket        0
Goat Price             0
Goat to Cereal         0
Maize Price            0
Rice Price             0
Sorghum Price          0
Wage Price             0
Wage to Cereal         0
Departures             0
Arrivals               0
dtype: int64

Based on the amount of data marked NaN, we need to impute our data

In [148]:
# impute data
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# array_imp = imp.fit_transform(df_imp)
# df_imp = pd.DataFrame(array_imp, columns=df.columns)

imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)   



In [151]:
df_imp.head()

Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0.0,0.0,13.0,1.05,4.0,2018.0,0.168,4.342,4000.0,0.0,...,1090175.0,297750.0,57.0,4000.0,5200.0,3200.0,57400.0,11.0,6.0,199.0
1,1.0,0.0,5.0,0.62,4.0,2018.0,0.132,6.065,8816.146932,0.0,...,3221842.0,1034149.0,96.505868,12674.026322,20302.198169,15778.178913,103823.463756,9.562667,415.870257,69.0
2,2.0,0.0,53.0,0.63,4.0,2018.0,0.047,7.915,15724.612735,0.0,...,1674934.0,300000.0,50.0,4729.703777,7000.0,6000.0,90000.0,15.0,253.0,24.0
3,3.0,0.0,73.0,1.1,4.0,2018.0,0.052,8.121,6000.0,0.0,...,1160475.0,410000.0,59.0,6000.0,7000.0,5500.0,100000.0,14.0,-32.184029,68.0
4,4.0,17.0,41.0,0.78,4.0,2018.0,0.0,2.247,11000.0,0.0,...,1366665.0,460000.0,92.0,6000.0,5000.0,5000.0,70000.0,14.0,400.0,663.0


In [155]:
important_features = ["Region", "District", "Month", "Year", "Conflict Fatalities", "Conflict Incidents", "Goat Price", "Water Price", "Rainfall", "Arrivals"]
important_features_no_arrivals = ["Region", "District", "Month", "Year", "Conflict Fatalities", "Conflict Incidents", "Goat Price", "Water Price", "Rainfall"]  


In [157]:
# split data into training and testing data
columns = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal']


X_train, X_test, y_train, y_test = train_test_split(df_imp[important_features_no_arrivals], df_imp['Arrivals'], test_size=0.15)

With the data processed, we can now build some models.

In [153]:
def has_negative(values):
    for value in values:
        if value < 0:
            return True
    return False


X_train.head()
X_train.shape


(279527, 10)

In [158]:
# fitting models
LR.fit(X_train, list(y_train))

# predicting
LR_pred = LR.predict(X_test)
# evaluate
print("on training data: ", mean_squared_error(list(y_test), LR_pred, squared=False))

# caluclate R^2
r2_score(list(y_test), LR_pred)

LR.score(X_test, y_test)


on training data:  4211.42870495444


0.08496562756341985

Studies that try to explain human behavior generally have R^2 values lower than 50%