In [6]:
# imports
import numpy as np
import pandas as pd
import math
from sklearn.linear_model import LinearRegression
from sklearn import tree 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [7]:
def dropNA(df, top_9=False):
    """
    Clean the raw frame and drop every row that still contains a NaN.

    Steps, in order: strip commas from every string cell, coerce the known
    price/count columns to numeric, cast Region/District/Month to
    ``category``, optionally restrict to the ``top_9`` column subset, and
    finally drop incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Region', 'District', 'Month' and every
        column listed in ``numeric_cols`` below. The caller's frame is NOT
        modified.
    top_9 : bool, default False
        When True, keep only 'Arrivals' plus the nine predictors listed in
        ``keep_cols`` before dropping incomplete rows.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only complete rows; surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        Propagated from ``pd.to_numeric`` when a value in a numeric column
        still cannot be parsed after comma removal.

    TODO(review): the original author asked whether the 2014 rows should
    also be dropped; that question is still open and nothing here does so.
    """
    # Work on a copy: the column assignments below would otherwise rewrite
    # the caller's DataFrame in place (stripped commas, changed dtypes).
    df = df.copy()

    # remove commas in every column (non-string cells pass through untouched)
    for feature in df.columns:
        df[feature] = df[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

    # force numeric
    numeric_cols = ['Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price', 'Rice Price', 'Sorghum Price', 'Wage Price', 'Arrivals']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

    # make categorical columns
    df = df.astype({"Region": 'category', "District": 'category', "Month": 'category'})

    # drop columns that aren't useful
    if top_9:
        keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Year', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
        df = df[keep_cols]

    # a row is discarded if ANY of its remaining columns is missing
    df = df.dropna()

    return df

def impute(df, top_9=False):
    """
    Fill missing values in ``df`` with an ``IterativeImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Presumably every column must already be numeric
        (or numeric-convertible) for the imputer to accept it -- the
        notebook label-encodes its categorical columns first; verify
        against the caller.
    top_9 : bool, default False
        When True, return only 'Arrivals' plus the nine predictors listed
        in ``keep_cols``. The imputer is still fitted on ALL columns; the
        subset is taken afterwards.

    Returns
    -------
    pandas.DataFrame
        Imputed values under the column names of ``df``. The frame is
        built without an explicit index, so it gets a fresh default index
        rather than the index of ``df``.

    Notes
    -----
    The imputer is created with ``max_iter=10`` and ``random_state=0``.
    """
    imp = IterativeImputer(max_iter=10, random_state=0)
    # Fit on the frame that was passed in. This used to read `df_imp`, which
    # is a local name assigned only on the next line, so every call raised
    # UnboundLocalError.
    array_imp = imp.fit_transform(df)
    # NOTE(review): assumes the imputer returns one output column per input
    # column; confirm for columns that are entirely NaN.
    df_imp = pd.DataFrame(array_imp, columns=df.columns)

    # drop columns that aren't useful
    if top_9:
        keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Year', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
        df_imp = df_imp[keep_cols]

    return df_imp
    

In [8]:
# create models
# The tree-based models get a fixed seed so their fits are repeatable on a
# fresh "Restart & Run All", matching the seed already used for the imputer.
LR = LinearRegression()
DT = DecisionTreeRegressor(random_state=0)
RF = RandomForestRegressor(random_state=0)

# load data
# NOTE(review): absolute, machine-specific path -- repoint DATA_PATH at your
# own checkout of the data directory before running.
DATA_PATH = '/Users/mihirsingh/Documents/Middlebury/CSCI451/UNHCR-ml-challenge/data/combined_data.csv'
df = pd.read_csv(DATA_PATH)
# Deep copy: every transformation below touches df_imp only, so `df` remains
# the untouched raw frame and does not need to be re-read from disk afterwards.
df_imp = df.copy()

# encode data that is not numerical
# The single encoder is re-fitted for each column, so after the loop it only
# remembers the mapping of the last column ('Month').
encoder = LabelEncoder()
for categorical in ['Region', 'District', 'Month']:
    df_imp[categorical] = encoder.fit_transform(df_imp[categorical])

# strip thousands separators from numbers stored as strings
# (values remain strings here; presumably the imputer casts them to float --
# TODO confirm)
features = ['CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal', 'Departures','Arrivals']

for feature in features:
    df_imp[feature] = df_imp[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

# impute data
# The whole frame is passed in, so the target column 'Arrivals' takes part in
# (and is itself filled by) the imputation.
# NOTE(review): this happens before the train/test split -- confirm that is
# intended.
imp = IterativeImputer(max_iter=10, random_state=0)
array_imp = imp.fit_transform(df_imp)
df_imp = pd.DataFrame(array_imp, columns=df.columns)


In [9]:
# split data into training and testing data
# Predictor columns only: the target 'Arrivals' and the 'Departures' column are
# deliberately not in this list.
columns = ['Region', 'District','CDI','Month','Year','NDVI','Rainfall','Water Price',
            'Conflict Fatalities','Conflict Incidents','Cholera Deaths',
            'Cholera Cases','Malaria','Measles','Cost Min Basket',
            'Goat Price','Goat to Cereal','Maize Price','Rice Price',
            'Sorghum Price','Wage Price','Wage to Cereal']
# 80/20 split with a fixed seed (same value the imputer uses) so that the
# metrics printed by the model cells can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    df_imp[columns], df_imp['Arrivals'], test_size=0.2, random_state=0
)

In [10]:
# LR

# private copies, so nothing done in this cell can alter the shared split
LR_xtrain, LR_ytrain = X_train.copy(), y_train.copy()
LR_xtest, LR_ytest = X_test.copy(), y_test.copy()

# train on the training rows, then predict arrivals for the held-out rows
LR_pred = LR.fit(LR_xtrain, LR_ytrain.tolist()).predict(LR_xtest)

# goodness of fit (R^2) and root-mean-squared error on the held-out rows
print("R^2: ", LR.score(LR_xtest, LR_ytest.tolist()))
print(f'rmse: {math.sqrt(mean_squared_error(LR_ytest, LR_pred))}')

# bucket predicted and true arrivals into three left-closed classes:
#   1 -> below 1000,  2 -> 1000 up to (not including) 5000,  3 -> 5000 and above
LR_pred = pd.DataFrame(LR_pred, columns=['Arrivals'])
LR_pred['bins'] = pd.cut(
    LR_pred['Arrivals'],
    bins=[-np.inf, 1000, 5000, np.inf],
    labels=[1, 2, 3],
    right=False,
)
ybins = pd.cut(
    y_test,
    bins=[-np.inf, 1000, 5000, np.inf],
    labels=[1, 2, 3],
    right=False,
)
# evaluate
def classification_accuracy(y_true, y_pred):
    """
    Return the fraction of positions at which ``y_true`` and ``y_pred`` agree.

    Both arguments may be any equal-length 1-D sequence (list, numpy array,
    pandas Series, categorical Series). They are compared by POSITION; any
    pandas index labels are ignored.

    Parameters
    ----------
    y_true : sequence
        Reference labels.
    y_pred : sequence
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Share of matching positions, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If the lengths differ, or if both inputs are empty (the accuracy
        would be 0/0).
    """
    if len(y_true) != len(y_pred):
        raise ValueError("Arrays must be of equal length")

    total_predictions = len(y_true)
    if total_predictions == 0:
        raise ValueError("Cannot compute accuracy of empty arrays")

    # Convert to plain arrays before comparing. With two Python lists `==`
    # is whole-list equality (a single bool, not one result per element),
    # and with two Series it aligns on index labels and raises when they
    # differ -- neither is the per-position comparison wanted here.
    matches = np.asarray(y_true) == np.asarray(y_pred)
    correct_predictions = np.sum(matches)
    return float(correct_predictions) / total_predictions

# share of held-out rows whose predicted bin equals the true bin
# (predicted bins are handed over as a plain list)
print("Accuracy: ", classification_accuracy(ybins, LR_pred['bins'].tolist()))

R^2:  0.33613121524109
rmse: 3569.7298052192264
Accuracy:  0.6838776379006264


In [11]:
# DT

# private copies, so nothing done in this cell can alter the shared split
DT_xtrain, DT_ytrain = X_train.copy(), y_train.copy()
DT_xtest, DT_ytest = X_test.copy(), y_test.copy()

# train on the training rows, then predict arrivals for the held-out rows
DT_pred = DT.fit(DT_xtrain, DT_ytrain.tolist()).predict(DT_xtest)

# goodness of fit (R^2) and root-mean-squared error on the held-out rows
print("R^2: ", DT.score(DT_xtest, DT_ytest.tolist()))
print(f'rmse: {math.sqrt(mean_squared_error(DT_ytest, DT_pred))}')

# bucket DT_pred and y_test into three left-closed classes:
#   1 -> below 1000,  2 -> 1000 up to (not including) 5000,  3 -> 5000 and above
DT_pred = pd.DataFrame(DT_pred, columns=['Arrivals'])
DT_pred['bins'] = pd.cut(
    DT_pred['Arrivals'],
    bins=[-np.inf, 1000, 5000, np.inf],
    labels=[1, 2, 3],
    right=False,
)
ybins = pd.cut(
    y_test,
    bins=[-np.inf, 1000, 5000, np.inf],
    labels=[1, 2, 3],
    right=False,
)

# share of held-out rows whose predicted bin equals the true bin
print("Accuracy: ", classification_accuracy(ybins, DT_pred['bins'].tolist()))

R^2:  0.9202457492981244
rmse: 1237.2889680980425
Accuracy:  0.9939944049139452
