# CSCI451 Project: UNHCR ML Challenge
Jamie Hackney, Mihir Singh, Jake Gilbert

In [247]:
# imports
import numpy as np
import pandas as pd
# imports for models
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# create models
LR = LinearRegression()
Svc = SVC()
Tree = tree.DecisionTreeClassifier()
NN = MLPClassifier()

# load the data
df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')
df_imp = df.copy()

# encode data that is not numerical
encoder = LabelEncoder()
encoder.fit(df_imp['Region'])
df_imp['Region'] = encoder.transform(df_imp['Region'])
encoder.fit(df_imp['District'])
df_imp['District'] = encoder.transform(df_imp['District'])
encoder.fit(df_imp['Month'])
df_imp['Month'] = encoder.transform(df_imp['Month'])


# turn string numbers into floats
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)
    
# impute data
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# array_imp = imp.fit_transform(df_imp)
# df_imp = pd.DataFrame(array_imp, columns=df.columns)

#L1 regression - Lasso

imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)   
# df_imp = df_imp.dropna()


  df = pd.read_csv('/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv')


In [248]:

df_imp.head()

Unnamed: 0.1,Unnamed: 0,Region,District,CDI,Month,Year,NDVI,Rainfall,Water Price,Conflict Fatalities,...,Cost Min Basket,Goat Price,Goat to Cereal,Maize Price,Rice Price,Sorghum Price,Wage Price,Wage to Cereal,Departures,Arrivals
0,0.0,0.0,13.0,1.05,4.0,2018.0,0.168,4.342,4000.0,0.0,...,1090175.0,297750.0,57.0,4000.0,5200.0,3200.0,57400.0,11.0,6.0,199.0
1,1.0,0.0,5.0,0.62,4.0,2018.0,0.132,6.065,8816.146932,0.0,...,3221842.0,1034149.0,96.505868,12674.026322,20302.198169,15778.178913,103823.463756,9.562667,415.870257,69.0
2,2.0,0.0,53.0,0.63,4.0,2018.0,0.047,7.915,15724.612735,0.0,...,1674934.0,300000.0,50.0,4729.703777,7000.0,6000.0,90000.0,15.0,253.0,24.0
3,3.0,0.0,73.0,1.1,4.0,2018.0,0.052,8.121,6000.0,0.0,...,1160475.0,410000.0,59.0,6000.0,7000.0,5500.0,100000.0,14.0,-32.184029,68.0
4,4.0,17.0,41.0,0.78,4.0,2018.0,0.0,2.247,11000.0,0.0,...,1366665.0,460000.0,92.0,6000.0,5000.0,5000.0,70000.0,14.0,400.0,663.0


In [249]:
# split data into training and testing data
columns = ['Region', 'District','CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures']
X_train, X_test, y_train, y_test = train_test_split(df_imp[columns], df_imp['Arrivals'], test_size=0.15)
print(X_test.shape)
print(y_test.shape)

(49329, 23)
(49329,)


With the data processed, we can now build some models.

In [250]:
def has_negative(values):
    for value in values:
        if value < 0:
            return True
    return False

In [262]:
import math

# duplicate data
LR_xtrain = X_train.copy()
LR_ytrain = y_train.copy()
LR_xtest = X_test.copy()
LR_ytest = y_test.copy()

# fit and predict the model
LR.fit(LR_xtrain, list(LR_ytrain))
LR_pred = LR.predict(LR_xtest)

# calculate R^2
print("R^2: ", LR.score(LR_xtest, list(LR_ytest)))

# # bin LR_pred and bin y_test
LR_pred = pd.DataFrame(LR_pred, columns=['Arrivals'])
LR_pred['bins'] = pd.cut(LR_pred['Arrivals'], bins=[float('-inf'), 1000, 5000, float('inf')], labels=[1,2,3], right = False)
ybins = pd.cut(y_test, bins=[float('-inf'), 1000, 5000, float('inf')], labels=[1,2,3], right = False)

# evaluate
def classification_accuracy(y_true, y_pred):
    if len(y_true) != len(y_pred):
        raise ValueError("Arrays must be of equal length")
    
    correct_predictions = np.sum(y_true == y_pred)
    total_predictions = len(y_true)
    accuracy = correct_predictions / total_predictions
    return accuracy

print("Accuracy: ", classification_accuracy(ybins, list(LR_pred['bins'])))    

R^2:  0.7442155289368221
Accuracy:  0.8474528168014758


In [252]:
import math


# fitting models
LR.fit(X_train, list(y_train))

# predicting
LR_pred = LR.predict(X_test)


# evaluate
print("on test data: ", math.sqrt(mean_squared_error(list(y_test), LR_pred)))
print(f'rmse train: {math.sqrt(mean_squared_error(LR.predict(X_train), y_train))}')

# caluclate R^2
# r2_score(list(y_test), LR_pred)
LR.score(X_test, list(y_test))

on test data:  2093.128471086072
rmse train: 2151.041124226047


0.7442155289368221

Studies that try to explain human behavior generally have R^2 values lower than 50%

In [253]:
from sklearn.linear_model import Lasso

RF = Lasso()
RF.fit(X_train, y_train)
RF.score(X_test, y_test)
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

rmse: 2093.29863761231


In [254]:
from sklearn.linear_model import ElasticNet
RF = ElasticNet()
RF.fit(X_train, y_train)
score = RF.score(X_test, y_test)
print(score)
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

0.7300610387734758
rmse: 2150.2630099440703


In [255]:
from sklearn.tree import DecisionTreeRegressor

RF = DecisionTreeRegressor()
RF.fit(X_train, y_train)
score = RF.score(X_test, y_test)
print(score)
print(f'rmse train: {math.sqrt(mean_squared_error(RF.predict(X_train), y_train))}')
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

0.97353160920993
rmse train: 4.880219933414567e-07
rmse: 673.3216312364166


In [256]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
sel = SelectKBest(f_regression, k=6)
X_new = sel.fit_transform(X_train, y_train)
cols_idxs = sel.get_support(indices=True)
print(cols_idxs)
features_df_new = X_train.iloc[:,cols_idxs]

print(features_df_new.columns)

[ 7  9 11 13 15 22]
Index(['Water Price', 'Conflict Incidents', 'Cholera Cases', 'Measles',
       'Goat Price', 'Departures'],
      dtype='object')


In [257]:

RF = RandomForestRegressor(max_depth=11)
RF.fit(X_train, y_train)
print(RF.score(X_test, y_test))
print(f'rmse train: {math.sqrt(mean_squared_error(RF.predict(X_train), y_train))}')
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

KeyboardInterrupt: 