In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import math
from sklearn.preprocessing import LabelEncoder
import torch


dtypes = {
  'Region':                  object,
  'District':                object,
  'CDI':                     float,
  'Month':                   object,
  'Year':                    int,
  'NDVI':                    float,
  'Rainfall':                float,
  'Water Price':             float,
  'Conflict Fatalities':     float,
  'Conflict Incidents':      float,
  'Cholera Deaths':          float,
  'Cholera Cases':           float,
  'Malaria':                 float,
  'Measles':                 float,
  'Cost Min Basket':         float,
  'Goat Price':              float,
  'Goat to Cereal':          float,
  'Maize Price':             float,
  'Rice Price':              float,
  'Sorghum Price':           float,
  'Wage Price':              float,
  'Wage to Cereal':          float,
  'Arrivals':                int,
  'Departures':              int,
}
df = pd.read_csv('data/combined_data.csv')


def prepare_data(df):
  """
  Prepare the data for training, deal with NaNs. I think we also need to drop the 2014 years?
  """

  # remove commas in numeric columns
  for feature in df.columns:
    df[feature] = df[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

  # force numeric 
  numeric_cols = ['Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price', 'Rice Price', 'Sorghum Price', 'Wage Price', 'Arrivals']
  df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

  # make categorical columns
  df = df.astype({"Region": 'category', "District": 'category', "Month": 'category'})

  # drop columns that arent useful
  keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
  df = df[keep_cols]
  df = df.dropna()

  # print(f'arrivals described: \n {df["Arrivals"].describe()} \n ')
  # print(f'10 largest arrivals: \n {df.nlargest(10, "Arrivals")["Arrivals"]}')

  return df


df = prepare_data(df)

  df = pd.read_csv('data/combined_data.csv')


In [4]:
# encode dataframe
# encoded_df = pd.get_dummies(df, columns=['Region', 'District', 'Month'], drop_first=True)

# encode categorical columns in dataframe
encoder = LabelEncoder()
encoder.fit(df['Region'])
df['Region'] = encoder.transform(df['Region'])
encoder.fit(df['District'])
df['District'] = encoder.transform(df['District'])
encoder.fit(df['Month'])
df['Month'] = encoder.transform(df['Month'])
train, test = train_test_split(df, test_size=0.2)

train, test = train_test_split(df, test_size=0.2)

# create train and test split
X_train = train.drop(['Arrivals'], axis=1)
y_train = train[['Arrivals']]

X_test = test.drop(['Arrivals'], axis=1)
y_test = test[['Arrivals']]

In [5]:
def classification_accuracy(y_true, y_pred):
    """
    Return the classification accuracy of the predicted labels.
    """
    if len(y_true) != len(y_pred):
        raise ValueError("Arrays must be of equal length")

    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [6]:
# fit the model
LR = LinearRegression()
LR.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(LR.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {LR.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(LR.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

score: 0.018070448990905486
rmse: 6000.03080981807
classification accuracy: 0.4173550528835019


In [6]:
# fit the model
SVR = SVR()
SVR.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(SVR.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {SVR.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(SVR.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

  y = column_or_1d(y, warn=True)


In [7]:
# fit the model
DT = DecisionTreeRegressor(max_depth=11)
DT.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(DT.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {DT.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(DT.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

score: 0.9615640234900281
rmse: 1187.0862510982797
classification accuracy: 0.9727476260325794


In [8]:
# fit the model
RF = RandomForestRegressor()
RF.fit(X_train, y_train)

# evaluate model based on bins
true_bins = pd.cut(y_test['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
preds_bin = np.digitize(RF.predict(X_test), bins=[0, 1000, 5000, float('inf')], right=False).flatten()

print(f'score: {RF.score(X_test, y_test)}')
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')
print(f'classification accuracy: {classification_accuracy(true_bins, preds_bin)}')

  return fit_method(estimator, *args, **kwargs)


score: 0.9588915914216269
rmse: 1227.6615314542425
classification accuracy: 0.9918937697830619


In [9]:
class NeuralNetwork(torch.nn.Module):

    def __init__(self, input_dim):
        super().__init__()

        self.loss_fn = torch.nn.L1Loss()

        self.pipeline = torch.nn.Sequential(
                torch.nn.Linear(input_dim, 512),
                torch.nn.Sigmoid(),
                torch.nn.Linear(512, 128),
                torch.nn.Sigmoid(),
                torch.nn.Linear(128, 32),
                torch.nn.Sigmoid(),
                torch.nn.Linear(32, 1)
        )


    def forward(self, x):
        return self.pipeline(x)
    

    def train(self, X, y, learning_rate, num_epochs):

        for epoch in range(num_epochs):
            optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

            pred = self.forward(torch.tensor(X.values, dtype=torch.float32))

            # evaluate loss on prediction
            loss = self.loss_fn(pred, torch.tensor(y.values, dtype=torch.float32))

            # compute gradient
            loss.backward()

            # take an optimization step
            optimizer.step()


    def evaluate(self, X, y):
        true_bins = pd.cut(y['Arrivals'], bins=[0, 1000, 5000, float('inf')], labels=[1, 2, 3], right=False)
        preds = self.forward(torch.Tensor(X.values)).detach()
        preds_bins = np.digitize(preds, bins=[0, 1000, 5000, float('inf')], right=False).flatten()

        print(f'rmse: {math.sqrt(mean_squared_error(preds, y))}')
        print(f'classification accuracy: {classification_accuracy(true_bins, preds_bins)}')


NN = NeuralNetwork(len(list(X_train.columns)))

In [10]:
learning_rate = 0.01
num_epochs = 10
NN.train(X_train, y_train, learning_rate, num_epochs)

In [11]:
NN.evaluate(X_test, y_test)

rmse: 6180.975084770626
classification accuracy: 0.8090017756504285
