# Model

We build and evaluate our best model.

In [None]:
import math
import torch
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder


dtypes = {
  'Region':                  object,
  'District':                object,
  'CDI':                     float,
  'Month':                   object,
  'Year':                    int,
  'NDVI':                    float,
  'Rainfall':                float,
  'Water Price':             float,
  'Conflict Fatalities':     float,
  'Conflict Incidents':      float,
  'Cholera Deaths':          float,
  'Cholera Cases':           float,
  'Malaria':                 float,
  'Measles':                 float,
  'Cost Min Basket':         float,
  'Goat Price':              float,
  'Goat to Cereal':          float,
  'Maize Price':             float,
  'Rice Price':              float,
  'Sorghum Price':           float,
  'Wage Price':              float,
  'Wage to Cereal':          float,
  'Arrivals':                int,
  'Departures':              int,
}
df = pd.read_csv('data/combined_data.csv')


def prepare_data(df):
  """
  Prepare the data for training, drop NaNs.
  """

  # remove commas in numeric columns
  for feature in df.columns:
    df[feature] = df[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

  # force numeric 
  numeric_cols = ['Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price', 'Rice Price', 'Sorghum Price', 'Wage Price', 'Arrivals']
  df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

  # make categorical columns
  df = df.astype({"Region": 'category', "District": 'category', "Month": 'category'})

  # drop columns that arent useful
  keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
  df = df[keep_cols]
  df = df.dropna()

  return df


df = prepare_data(df)

In [2]:
# encode categorical columns
encoder = LabelEncoder()
encoder.fit(df['Region'])
df['Region'] = encoder.transform(df['Region'])
encoder.fit(df['District'])
df['District'] = encoder.transform(df['District'])
encoder.fit(df['Month'])
df['Month'] = encoder.transform(df['Month'])

# create train and test split
train, test = train_test_split(df, test_size=0.2)

X_train = train.drop(['Arrivals'], axis=1)
y_train = train[['Arrivals']]

X_test = test.drop(['Arrivals'], axis=1)
y_test = test[['Arrivals']]

In [10]:
def classification_accuracy(y_true, y_pred):
    """
    Return the classification accuracy of the predicted labels.
    """
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred are not the same size")

    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [11]:
# fit the model
DT = DecisionTreeRegressor(max_depth=11)
DT.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(DT.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {DT.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(DT.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

score: 0.8310562218654249
rmse: 2712.2346698307515
classification accuracy: 0.9607040839959855


In [5]:
# fit the model
RF = RandomForestRegressor()
RF.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(RF.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {RF.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

  RF.fit(X_train, y_train)


score: 0.8513110232376245
rmse: 2544.4595340049505
classification accuracy: 0.989346097429167


In [6]:
preds = RF.predict((X_test))
y_true = y_test.values.flatten()
diffs = preds - y_true

print(f'percent of perfect predictions: {(np.count_nonzero(diffs == 0) / len(diffs))}')

percent of perfect predictions: 0.9792326102061298


## Neural Network

For expirementation purposes, we build, train, and evlauate a neural network. While preforming relatively well, this model does not do as well as a Decision Tree.

In [7]:
class NeuralNetwork(torch.nn.Module):

    def __init__(self, input_dim):
        super().__init__()

        self.loss_fn = torch.nn.L1Loss()

        self.pipeline = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 512),
                torch.nn.Sigmoid(),
                torch.nn.Linear(512, 128),
                torch.nn.Sigmoid(),
                torch.nn.Linear(128, 32),
                torch.nn.Sigmoid(),
                torch.nn.Linear(32, 1)
        )


    def forward(self, x):
        return self.pipeline(x)
    

    def train(self, X, y, learning_rate, num_epochs):

        for epoch in range(num_epochs):
            optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

            pred = self.forward(torch.tensor(X.values, dtype=torch.float32))

            # evaluate loss on prediction
            loss = self.loss_fn(pred, torch.tensor(y.values, dtype=torch.float32))

            # compute gradient
            loss.backward()

            # take an optimization step
            optimizer.step()


    def evaluate(self, X, y):
        true_bins = pd.cut(y['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
        preds = self.forward(torch.Tensor(X.values)).detach()
        preds_bins = np.digitize(preds, bins=[0, 1000, 5000, float('inf')], right=False).flatten()

        print(f'rmse: {math.sqrt(mean_squared_error(preds, y))}')
        print(f'classification accuracy: {classification_accuracy(true_bins, preds_bins)}')


NN = NeuralNetwork(len(list(X_train.columns)))

In [8]:
learning_rate = 0.01
num_epochs = 10
NN.train(X_train, y_train, learning_rate, num_epochs)

In [9]:
NN.evaluate(X_test, y_test)

rmse: 6719.332248689283
classification accuracy: 0.8125530765073729
