In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


dtypes = {
  'Region':                  object,
  'District':                object,
  'CDI':                     float,
  'Month':                   object,
  'Year':                    int,
  'NDVI':                    float,
  'Rainfall':                float,
  'Water Price':             float,
  'Conflict Fatalities':    float,
  'Conflict Incidents':      float,
  'Cholera Deaths':          float,
  'Cholera Cases':           float,
  'Malaria':                 float,
  'Measles':                 float,
  'Cost Min Basket':         float,
  'Goat Price':              float,
  'Goat to Cereal':          float,
  'Maize Price':             float,
  'Rice Price':              float,
  'Sorghum Price':           float,
  'Wage Price':              float,
  'Wage to Cereal':          float,
  'Arrivals':                int,
  'Departures':              int,
}
df = pd.read_csv('data/combined_data.csv')

def prepare_data(df):
  """
  Prepare the data for training, deal with NaNs. I think we also need to drop the 2014 years?
  """

  # remove commas in numeric columns
  for feature in df.columns:
    df[feature] = df[feature].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)

  # force numeric 
  numeric_cols = ['Cost Min Basket', 'Goat Price', 'Goat to Cereal', 'Maize Price', 'Rice Price', 'Sorghum Price', 'Wage Price', 'Arrivals']
  df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

  # make categorical columns
  df = df.astype({"Region": 'category', "District": 'category', "Month": 'category'})

  # drop columns that arent useful
  keep_cols = ['Arrivals', 'Region', 'District', 'Month', 'Year', 'Rainfall', 'Conflict Fatalities', 'Conflict Incidents', 'Water Price', 'Goat Price']
  df = df[keep_cols]
  df = df.dropna()

  return df

df = prepare_data(df)
train, test = train_test_split(df, test_size=0.2)

X_train = train.drop('Arrivals', axis=1)
y_train = train[['Arrivals']]

X_test = test.drop('Arrivals', axis=1)
y_test = test[['Arrivals']]

  df = pd.read_csv('data/combined_data.csv')


In [2]:
X_train.head(2)

Unnamed: 0,Region,District,Month,Year,Rainfall,Conflict Fatalities,Conflict Incidents,Water Price,Goat Price
307451,Gedo,Luuq,Jun,2016,3.485,0.0,2.0,15000.0,797500.0
47435,Galgaduud,Dhuusamarreeb,Jun,2022,1.114,0.0,3.0,15000.0,1247500.0


In [3]:
y_train.head(2)

Unnamed: 0,Arrivals
307451,38.0
47435,1366.0


In [4]:
# encode dataframe
encoded_df = pd.get_dummies(df, columns=['Region', 'District', 'Month'], drop_first=True)
train, test = train_test_split(encoded_df, test_size=0.2)

# create train and test split
X_train = train.drop('Arrivals', axis=1)
y_train = train[['Arrivals']]

X_test = test.drop('Arrivals', axis=1)
y_test = test[['Arrivals']]

In [5]:
LR = LinearRegression()
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

0.15328036560983038

In [6]:
SVM = SVR()
SVM.fit(X_train, y_train)
SVM.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


In [None]:
DT = DecisionTreeRegressor(max_depth=2)
DT.fit(X_train, y_train)
DT.score(X_test, y_test)

0.7919669238234

In [None]:
RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF.score(X_test, y_test)

  RF.fit(X_train, y_train)


0.9446294049969509