In [18]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
import math
import numpy as np


dtypes = {
  'Region':                  object,
  'District':                object,
  'CDI':                     float,
  'Month':                   object,
  'Year':                    int,
  'NDVI':                    float,
  'Rainfall':                float,
  'Water Price':             float,
  'Conflict Fatalities':    float,
  'Conflict Incidents':      float,
  'Cholera Deaths':          float,
  'Cholera Cases':           float,
  'Malaria':                 float,
  'Measles':                 float,
  'Cost Min Basket':         float,
  'Goat Price':              float,
  'Goat to Cereal':          float,
  'Maize Price':             float,
  'Rice Price':              float,
  'Sorghum Price':           float,
  'Wage Price':              float,
  'Wage to Cereal':          float,
  'Arrivals':                int,
  'Departures':              int,
}
df = pd.read_csv('data/combined_data.csv')

def prepare_data(df):
  """
  Prepare the data for training, deal with NaNs. I think we also need to drop the 2014 years?
  """

  # remove commas in numeric columns
  for feature in df.columns:
    df[feature] = df[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

  # force numeric 
  numeric_cols = ['Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price', 'Rice Price', 'Sorghum Price', 'Wage Price', 'Arrivals']
  df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

  # make categorical columns
  df = df.astype({"Region": 'category', "District": 'category', "Month": 'category'})

  # drop columns that arent useful
  keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Year', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
  df = df[keep_cols]
  df = df.dropna()

  return df

df = prepare_data(df)
train, test = train_test_split(df, test_size=0.2)

X_train = train.drop('Arrivals', axis=1)
y_train = train[['Arrivals']]

X_test = test.drop('Arrivals', axis=1)
y_test = test[['Arrivals']]

  df = pd.read_csv('data/combined_data.csv')


In [19]:
X_train.head(2)

Unnamed: 0,Region,District,Month,Year,Rainfall,Conflict Fatalities,Conflict Incidents,Water Price,Goat Price
82499,Bari,Bandarbeyla,Jun,2021,1.026,0.0,0.0,35000.0,2025000.0
291729,Woqooyi Galbeed,Hargeysa,Jun,2016,15.101,0.0,2.0,9000.0,300000.0


In [20]:
y_train.head(2)

Unnamed: 0,Arrivals
82499,31.0
291729,224.0


In [21]:
# encode dataframe
encoded_df = pd.get_dummies(df, columns=['Region', 'District', 'Month'], drop_first=True)
train, test = train_test_split(encoded_df, test_size=0.2)

# create train and test split
X_train = train.drop('Arrivals', axis=1)
y_train = train[['Arrivals']]

X_test = test.drop('Arrivals', axis=1)
y_test = test[['Arrivals']]

In [22]:
LR = LinearRegression()
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

0.1532049839386861

In [23]:
# SVM = SVR()
# SVM.fit(X_train, y_train)
# SVM.score(X_test, y_test)

In [24]:
DT = DecisionTreeRegressor(max_depth=11)
DT.fit(X_train, y_train)
DT.score(X_test, y_test)
print(np.unique(DT.predict(X_test)))
# print(f'rmse: {math.sqrt(mean_squared_error(DT.predict(X_train), y_train))}')

[1.60000000e+01 3.41703163e+01 3.60044469e+01 8.80000000e+01
 9.60000000e+01 1.00495098e+02 1.04000000e+02 1.37427621e+02
 1.56037736e+02 1.75000000e+02 1.97081731e+02 2.13000000e+02
 2.15333333e+02 2.25111111e+02 2.28076190e+02 2.33000000e+02
 2.40000000e+02 2.54000000e+02 2.56113324e+02 2.60000000e+02
 3.28842105e+02 3.70500000e+02 3.91000000e+02 4.02000000e+02
 4.30554016e+02 4.37000000e+02 4.89403580e+02 5.46000000e+02
 5.58000000e+02 6.12000000e+02 6.12676906e+02 6.30000000e+02
 6.77000000e+02 6.98271704e+02 7.11953052e+02 7.80000000e+02
 7.91172414e+02 1.05200000e+03 1.19510550e+03 1.24100000e+03
 1.25688424e+03 1.56000000e+03 1.59012114e+03 1.62804717e+03
 2.11200000e+03 2.34745424e+03 2.37200000e+03 2.81600000e+03
 2.91691928e+03 3.00300000e+03 3.32330928e+03 3.66000000e+03
 3.67148558e+03 3.93500000e+03 5.09900000e+03 5.48700000e+03
 5.53032353e+03 5.91800000e+03 5.97332000e+03 6.36586364e+03
 6.56850000e+03 7.34700000e+03 1.06500000e+04 1.16870000e+04
 1.58080000e+04 2.252600

In [26]:
from sklearn.tree import DecisionTreeRegressor

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF.score(X_test, y_test)
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

  return fit_method(estimator, *args, **kwargs)


rmse: 1448.3473571944742


In [27]:
from sklearn.linear_model import Lasso

RF = Lasso()
RF.fit(X_train, y_train)
RF.score(X_test, y_test)
print(f'rmse: {math.sqrt(mean_squared_error(RF.predict(X_test), y_test))}')

rmse: 5486.473180688849
