# ML algorithm spot check 

Table of Contents
1. [Read in cleaned data](#cleaned)
1. [Spot check XGBoost](#xgboost)
1. [Spot check Multilayer Perceptron](#mlp)

## Read in the cleaned data <a name="cleaned"></a>

In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Directory that holds the prepared (pickled) data files.
prepared_data_dir = "data/processed"

In [None]:
# Load the prepared data set from its pickle file.
# NOTE(review): unpickling executes code embedded in the file; only load
# pickles that come from a trusted source.
processed = pd.read_pickle(prepared_data_dir + "/cooked_v3.pkl")

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
processed.info()

In [None]:
# Show five randomly chosen rows.
processed.sample(n=5)

In [None]:
# Show the first five rows.
processed.head(n=5)

In [None]:
# Keep only the rows whose Mnemonic is one of the three of interest. The
# explicit copy makes `selected` an independent frame, so the column
# assignment below does not write into a slice of `processed`.
selected = processed[processed['Mnemonic'].isin(['EOAN', 'BMW', 'SAP'])].copy()

# Drop the category levels that no longer occur after filtering.
# remove_unused_categories() returns a new Series (its `inplace` argument was
# removed in pandas 2.0), so the result is assigned back to the column.
selected['Mnemonic'] = selected['Mnemonic'].cat.remove_unused_categories()

In [None]:
# Show the first five rows of the filtered frame.
selected.head(n=5)

## Forecasting with XGBoost <a name="xgboost"></a>

In [None]:
# Install the xgboost package through a shell escape; no version is pinned.
!pip install xgboost

In [None]:
import xgboost

In [None]:
def create_xgb_target(df):
    """Build the regression target: the ``MaxPrice`` of the following row.

    Args:
        df: DataFrame with a ``MaxPrice`` column. It is not modified.

    Returns:
        A Series aligned with ``df`` whose value at row *i* is the
        ``MaxPrice`` of row *i + 1*. The last row has no successor, so it is
        forward-filled and repeats the target of the row before it. A
        single-row frame yields NaN because there is nothing to fill from.
    """
    # NOTE(review): shift() is purely positional. If `df` interleaves rows of
    # several instruments, "the next row" may belong to a different
    # instrument -- confirm the row ordering with the caller.
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    return df['MaxPrice'].shift(-1).ffill()

In [None]:
def create_xgb_features(df, horizon):
    """Return a copy of ``df`` extended with lagged price/volume columns.

    For every offset ``1..horizon`` six columns named ``h<offset>_<name>``
    are appended, each holding the source column shifted down by ``offset``
    rows. The first ``offset`` rows have no history, so they are back-filled
    with the earliest available value.

    Args:
        df: DataFrame with the columns ``MinPrice``, ``MaxPrice``,
            ``StartPrice``, ``EndPrice``, ``TradedVolume`` and
            ``NumberOfTrades``. It is not modified.
        horizon: Number of lags to generate. Values below 1 add no columns.

    Returns:
        A new DataFrame with the original columns followed by the lag
        columns, grouped by offset. If ``horizon`` is at least ``len(df)``
        the deepest lag columns contain only NaN, because nothing is left to
        back-fill from.
    """
    # (source column, name used in the generated column). The lag of
    # 'TradedVolume' is deliberately still published as 'TradeVolume' so that
    # existing consumers of these column names keep working.
    lagged_columns = (
        ('MinPrice', 'MinPrice'),
        ('MaxPrice', 'MaxPrice'),
        ('StartPrice', 'StartPrice'),
        ('EndPrice', 'EndPrice'),
        ('TradedVolume', 'TradeVolume'),
        ('NumberOfTrades', 'NumberOfTrades'),
    )

    n_df = df.copy()

    # NOTE(review): shift() is positional; if `df` mixes several instruments,
    # a lag can pick up a row from another instrument -- confirm the ordering.
    for offset in range(1, horizon + 1):
        for source, published in lagged_columns:
            # Series.bfill() replaces the deprecated fillna(method='bfill').
            n_df["h{}_{}".format(offset, published)] = n_df[source].shift(offset).bfill()

    return n_df

In [None]:
# Add five lags of every price/volume column, then attach the target column.
xgb_data = create_xgb_features(selected, horizon=5)
xgb_data['NextMaxPrice'] = create_xgb_target(xgb_data)

# Raise the display limit so that all generated columns are shown.
pd.set_option('display.max_columns', 150)
xgb_data.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the rows (order preserved) as the test set.
train_data, test_data = train_test_split(xgb_data, train_size=0.8, shuffle=False)

# Split the remaining rows 80/20 into training and validation sets.
# NOTE(review): this second split shuffles and sets no random_state, so it is
# not reproducible; with lagged features, shuffled neighbouring rows may also
# leak information between training and validation -- confirm this is intended.
train_data, validate_data = train_test_split(
    train_data,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
)

# Target column of each split.
train_labels = train_data['NextMaxPrice']
validate_labels = validate_data['NextMaxPrice']
test_labels = test_data['NextMaxPrice']

In [None]:
# Remove the target column so that it cannot leak into the feature matrices.
# The frames are rebound to the result of drop() instead of being mutated in
# place: they were derived from another frame, and in-place edits on such
# frames can trigger pandas' SettingWithCopyWarning.
train_data = train_data.drop(columns='NextMaxPrice')
validate_data = validate_data.drop(columns='NextMaxPrice')
test_data = test_data.drop(columns='NextMaxPrice')

In [None]:
# One-hot encode each split.
train_features = pd.get_dummies(train_data)

# Each split is encoded independently, so a level that is missing from one
# split would produce a different set of columns. Align validation and test
# to the training columns: missing indicator columns are added as 0, and
# columns unknown to the training set are dropped.
validate_features = pd.get_dummies(validate_data).reindex(
    columns=train_features.columns, fill_value=0)
test_features = pd.get_dummies(test_data).reindex(
    columns=train_features.columns, fill_value=0)

train_features.head(5)

In [None]:
# Up to 500 boosting rounds; training stops early once the validation metric
# has not improved for 50 consecutive rounds.
# `early_stopping_rounds` is passed to the constructor because current
# XGBoost releases no longer accept it as an argument of fit().
model = xgboost.XGBRegressor(n_estimators=500, early_stopping_rounds=50)

# Both sets are evaluated every round; XGBoost uses the last entry of
# eval_set (the validation set) to decide when to stop.
model.fit(train_features, train_labels,
          eval_set=[(train_features, train_labels), (validate_features, validate_labels)],
          verbose=False)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from xgboost import plot_importance
fig, ax = plt.subplots(1,1,figsize=(25,25))
plot_importance(model, height=0.8, ax=ax)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the fitted model on the held-out test set.
pred = model.predict(test_features)

# mean_squared_error() returns the MSE; take its square root so that the
# printed value really is the RMSE that the label announces.
print("RMSE: {}".format(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=pred))))
print("MAE: {}".format(mean_absolute_error(y_true=test_labels, y_pred=pred)))

In [None]:
# Look up the position of this timestamp label in the test index.
# NOTE(review): get_loc returns a single integer only when the label is unique
# in the index -- confirm that is the case for this data.
test_labels.index.get_loc('2017-07-14 08:00:00')

In [None]:
from matplotlib import pyplot as plt

# Positions in the test set that delimit the window to plot.
# NOTE(review): assumes both timestamps are unique index labels, so that
# get_loc returns plain integers -- confirm.
start_loc = test_labels.index.get_loc('2017-07-14 08:00:00')
end_loc = test_labels.index.get_loc('2017-07-14 20:00:00')

# Actual and predicted values side by side, indexed like the true labels.
data = pd.DataFrame({
    'True_Y': test_labels[start_loc:end_loc],
    'Pred_Y': pred[start_loc:end_loc],
})

data.plot(figsize=(25, 10))

## Forecasting with an MLP <a name="mlp"></a>

In [None]:
import mxnet as mx
from mxnet import nd, autograd, gluon

In [None]:
# Network layout: two hidden layers and a single regression output.
num_hidden_1 = 64
num_hidden_2 = 24
num_output = 1

# Optimisation settings.
learning_rate = 0.001
batch_size = 100

# Number of training rows, used to turn the summed epoch loss into a per-row
# average. Derived from the training set so it cannot drift out of sync with
# the data (it used to be a hard-coded count).
num_examples = len(train_features)

# Devices for the model parameters and for the data.
model_ctx = mx.cpu()
data_ctx = mx.cpu()

In [None]:
# Wrap each split as (features, label) pairs and serve it in batches of
# `batch_size`, in the stored order.
train_dataset = gluon.data.ArrayDataset(nd.array(train_features), nd.array(train_labels))
train_dataloader = gluon.data.DataLoader(train_dataset, batch_size=batch_size)

validate_dataset = gluon.data.ArrayDataset(nd.array(validate_features), nd.array(validate_labels))
validate_dataloader = gluon.data.DataLoader(validate_dataset, batch_size=batch_size)

test_dataset = gluon.data.ArrayDataset(nd.array(test_features), nd.array(test_labels))
test_dataloader = gluon.data.DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Multilayer perceptron: two ReLU hidden layers followed by a linear output.
model = gluon.nn.Sequential()
with model.name_scope():
    model.add(gluon.nn.Dense(num_hidden_1, activation='relu'))
    model.add(gluon.nn.Dense(num_hidden_2, activation='relu'))
    model.add(gluon.nn.Dense(num_output))

# Draw the initial weights from a normal distribution (sigma = 0.1) on the
# model's device.
model.collect_params().initialize(mx.init.Normal(sigma=0.1), ctx=model_ctx)

# Squared-error loss, optimised with Adam.
l2loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(
    model.collect_params(),
    'adam',
    {'learning_rate': learning_rate},
)



In [None]:
def evaluate_accuracy(data_iterator, model):
    """Return the RMSE of ``model`` over the batches of ``data_iterator``.

    Despite its name, this function reports a regression error (lower is
    better), not a classification accuracy.

    Args:
        data_iterator: Iterable yielding ``(data, label)`` batches.
        model: Callable network that maps a batch of feature rows to
            predictions.

    Returns:
        The value accumulated by ``mx.metric.RMSE`` over all batches.
    """
    rmse = mx.metric.RMSE()
    for data, label in data_iterator:
        # Flatten every sample to one feature vector. The width is taken from
        # the batch itself rather than hard-coded, so the function keeps
        # working when the number of feature columns changes.
        data = data.as_in_context(model_ctx).reshape((data.shape[0], -1))
        label = label.as_in_context(model_ctx)
        # The raw network output is the prediction; no argmax is involved.
        rmse.update(preds=model(data), labels=label)
    return rmse.get()[1]

In [None]:
epochs = 50
smoothing_constant = .01  # not referenced by the loop below

for e in range(epochs):
    cumulative_loss = 0
    for data, label in train_dataloader:
        # Flatten every sample to one feature vector; the width comes from the
        # batch instead of a hard-coded column count.
        data = data.as_in_context(model_ctx).reshape((data.shape[0], -1))
        label = label.as_in_context(model_ctx)
        # Record the forward pass so gradients can be computed from the loss.
        with autograd.record():
            output = model(data)
            loss = l2loss(output, label)
        loss.backward()
        # Normalise the gradient by the actual size of this batch.
        trainer.step(data.shape[0])
        cumulative_loss += nd.sum(loss).asscalar()

    # Both values are RMSE figures (lower is better) despite the "acc" naming.
    validate_accuracy = evaluate_accuracy(validate_dataloader, model)
    train_accuracy = evaluate_accuracy(train_dataloader, model)
    print("Epoch %s. Loss: %s, Train_acc %s, Validate_acc %s" %
          (e, cumulative_loss/num_examples, train_accuracy, validate_accuracy))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict the whole test set in one forward pass and flatten to one value per row.
pred = model(nd.array(test_features)).asnumpy().reshape(-1)

# mean_squared_error() returns the MSE; take its square root so that the
# printed value really is the RMSE that the label announces.
print("RMSE: {}".format(np.sqrt(mean_squared_error(y_true=test_labels, y_pred=pred))))
print("MAE: {}".format(mean_absolute_error(y_true=test_labels, y_pred=pred)))

In [None]:
from matplotlib import pyplot as plt

# Positions in the test set that delimit the window to plot.
# NOTE(review): assumes both timestamps are unique index labels, so that
# get_loc returns plain integers -- confirm.
start_loc = test_labels.index.get_loc('2017-07-14 08:00:00')
end_loc = test_labels.index.get_loc('2017-07-14 20:00:00')

# Actual and predicted values side by side, indexed like the true labels.
data = pd.DataFrame({
    'True_Y': test_labels[start_loc:end_loc],
    'Pred_Y': pred[start_loc:end_loc],
})

data.plot(figsize=(25, 10))

---
Extra credit

## Forecasting with CNN

## Forecasting with LSTM

## Forecasting with DeepAR