In [12]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.18.0-cp38-cp38-macosx_10_14_x86_64.whl (12.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.18.0 typeguard-2.13.3


In [1]:
import tensorflow as tf
print(tf.__version__)

2.8.0


In [None]:
# Weights and Biases
#!pip install -q wandb
# Tensorflow
#!pip install -q tensorflow

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pathlib import Path
import math

In [3]:
#Variables
first_year = 2019
last_year = 2021
split_date ="2021-01-01"

epochs = 100

features = ["Underlying_last", "Strike", "Ttl", "Volatility", "R"]
num_features = len(features)
num_outputs = 1
seq_length = 5



In [4]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

def lag_features(df, features, seq_length):
    """Transforms a raw 2D dataframe of option data into 2D dataframe ofsequence data.
    Last 2 indexes per sequence are bid and ask price. The len(features)*seq_length
    features before are sequences of features"""
    df = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending = [True, True, False])

    for step in range(seq_length)[::-1]:
        for feature in features:
            df[feature + "-" + str(step)] = df[feature].shift(step)
    
    df["Check_strike"] = df["Strike"] == df["Strike"].shift(seq_length-1)
    df["Check_expire"] = df["Expire_date"] == df["Expire_date"].shift(seq_length-1)
    df = df[(df["Check_strike"] == True) & (df["Check_expire"] == True)]
    df = df.drop(["Check_strike", "Check_expire"], axis=1)
    #df[["Bid_strike_last", "Ask_strike_last"]] = df[["Bid_strike", "Ask_strike"]]
    #df[["Bid_last", "Ask_last"]] = df[["Bid", "Ask"]]
    df["Price_last"] = df["Price"]
    
    return df

def create_train_test(df, split_date):
    """Splits data in training and test set, and transforms data to right 2D format"""
    return df[df["Quote_date"] < split_date], df[df["Quote_date"] >= split_date]

def df_to_xy(df, num_features, num_outputs):
    """Transforms a dataframe into two arrays of explanatory variables x and explained variables y"""
    dx = df[["Underlying_last", "Strike", "Ttl", "Volatility", "R"]]
    dy = df["Price"]
    array_x, array_y = dx.to_numpy().astype(np.float32), dy.to_numpy().astype(np.float32)
    return array_x, array_y

def min_max_scale(train, test):
    """Scales a training and test set using MinMaxScaler. The scaler is calibrated on the training set"""
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test

In [9]:
# Load dataset
df_read = read_file("data/processed_data/2019-2021_underlying-strike_only-price.csv")
print(df_read)
df_read.info()
print(df_read)
print(df_read["Ttl"].max())


         Unnamed: 0  Quote_date Expire_date     Price  Underlying_last  \
0           1354913  2019-01-02  2019-01-04  1707.050          2509.98   
1           1354914  2019-01-02  2019-01-04  1607.495          2509.98   
2           1354915  2019-01-02  2019-01-04  1507.500          2509.98   
3           1354916  2019-01-02  2019-01-04  1458.295          2509.98   
4           1354917  2019-01-02  2019-01-04  1408.300          2509.98   
...             ...         ...         ...       ...              ...   
5123793     6521988  2021-12-31  2024-12-20   150.000          4766.39   
5123794     6521989  2021-12-31  2024-12-20   150.000          4766.39   
5123795     6521990  2021-12-31  2024-12-20   150.900          4766.39   
5123796     6521991  2021-12-31  2024-12-20   150.000          4766.39   
5123797     6521992  2021-12-31  2024-12-20   150.000          4766.39   

         Strike   Ttl  Volatility     R  
0         800.0     2    0.202726  2.40  
1         900.0     2    0.

In [10]:
# Splitting dataset
df_read = lag_features(df_read, features, seq_length)
df_read = df_read[["Quote_date", "Price", "Underlying_last", "Strike", "Ttl", "Volatility", "R"]]
df_train_orginal, df_test_orginal = create_train_test(df_read, split_date)
print(df_train_orginal.head(), df_test_orginal.head())

train_x_org, train_y_org, = df_to_xy(df_train_orginal, num_features, num_outputs)
print("-------\n", train_x_org, train_y_org)
test_x_org, test_y_org = df_to_xy(df_test_orginal, num_features, num_outputs)

train_x_scaled, test_x_scaled = min_max_scale(train_x_org, test_x_org)
#train_y_scaled, test_y_scaled = min_max_scale(train_y_org, test_y_org)
train_y_scaled, test_y_scaled = train_y_org, test_y_org

"""shuffle = np.random.permutation(len(train_x_scaled))
train_x_scaled, train_y_scaled = train_x_scaled[shuffle], train_y_scaled[shuffle]"""

train_x_scaled = np.reshape(train_x_scaled, (len(train_x_scaled), num_features))
test_x_scaled = np.reshape(test_x_scaled, (len(test_x_scaled), num_features))

print(f"Train_x shape: {train_x_scaled.shape}, train_y shape: {train_y_scaled.shape}")
print(f"Test_x shape: {test_x_scaled.shape}, test_y shape: {test_y_scaled.shape}")

       Quote_date     Price  Underlying_last  Strike  Ttl  Volatility    R
24524  2019-01-08  1072.110          2574.26  1500.0    1    0.215077  2.4
24525  2019-01-08  1022.110          2574.26  1550.0    1    0.215077  2.4
24526  2019-01-08   972.100          2574.26  1600.0    1    0.215077  2.4
24527  2019-01-08   922.895          2574.26  1650.0    1    0.215077  2.4
24528  2019-01-08   872.100          2574.26  1700.0    1    0.215077  2.4          Quote_date     Price  Underlying_last  Strike  Ttl  Volatility     R
3097811  2021-01-04  2701.855          3701.38  1000.0    2    0.185353  0.09
3104882  2021-01-05  2726.195          3727.05  1000.0    1    0.184968  0.08
3097812  2021-01-04  2598.795          3701.38  1100.0    2    0.185353  0.09
3104883  2021-01-05  2626.705          3727.05  1100.0    1    0.184968  0.08
3097813  2021-01-04  2500.195          3701.38  1200.0    2    0.185353  0.09
-------
 [[2.5742600e+03 1.5000000e+03 1.0000000e+00 2.1507716e-01 2.4000001e+00]


In [13]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras import backend as K
from tensorflow_addons.optimizers import AdamW
import keras as KER
from sklearn.model_selection import train_test_split
from keras.activations import linear, relu
from datetime import datetime
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError

In [18]:
def create_model(config):
  """Builds a model of minimum 2 layers sequentially from a given config dictionary"""
  model = Sequential()

  model.add(Dense(
    units = config["units"],
    activation = relu,
    input_shape = (config["num_features"],)
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))


  for i in range(config["layers"]-2):
    model.add(Dense(
      units = config["units"],
      activation = relu
    ))
    model.add(BatchNormalization(
      momentum = config["bn_momentum"]
    ))

  model.add(Dense(
    units = config["units"],
    activation = relu
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))

  model.add(Dense(
    units = num_outputs,
    activation = relu
  ))  

  model.compile(
    optimizer = AdamW(
      learning_rate = config["learning_rate"],
      weight_decay = config["weight_decay"]
    ),
    loss = "mse",
  )

  return model

In [20]:
from keras.callbacks import EarlyStopping
config = {
    "units": 32,
    "learning_rate": 0.004588272887584361,
    "layers": 4,
    "seq_length": seq_length,
    "num_features": num_features,
    "bn_momentum" : 0.034653375084312724,
    "lr_decay" : 0.9475091291542892,
    "weight_decay" : 0.0001
}

def trainer(train_x, train_y, model):
    epochs = 100
    minibatch_size = 2048

    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta = 1,
        patience = 15,
    )

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        validation_split = 0.3,
        epochs = epochs,
        callbacks = [early_stopping]
    )

model = create_model(config)
model.summary()

trainer(train_x_scaled, train_y_org, model)

"""timestamp = datetime.now().strftime("%d/%m/%Y_%H:%M")
path = f"{colab_path}runs/model_w_validation/{first_year}-{last_year}-{timestamp}"
model.save(path)"""

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 32)                192       
                                                                 
 batch_normalization_12 (Bat  (None, 32)               128       
 chNormalization)                                                
                                                                 
 dense_16 (Dense)            (None, 32)                1056      
                                                                 
 batch_normalization_13 (Bat  (None, 32)               128       
 chNormalization)                                                
                                                                 
 dense_17 (Dense)            (None, 32)                1056      
                                                                 
 batch_normalization_14 (Bat  (None, 32)              

'timestamp = datetime.now().strftime("%d/%m/%Y_%H:%M")\npath = f"{colab_path}runs/model_w_validation/{first_year}-{last_year}-{timestamp}"\nmodel.save(path)'

In [None]:
predictions = np.array(model(test_x_scaled))

In [None]:
def prediction(df_test, predictions, model, train_y_org, test_y_org):
    df_test["Prediction"] = predictions.flatten()

    m = MeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("MSE from model:", m.result().numpy())
    m = RootMeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("RMSE from model:", m.result().numpy())

    return df_test

df_test = prediction(df_test_orginal, predictions, model, train_y_org, test_y_org)

#print(train_y_org[:, :1].min(), train_y_org[:, :1].max())
#print(train_y_org[:, 1:].min(), train_y_org[:, 1:].max())

"""print("MSE_bid:", df_test["SE_bid"].mean(), "RMSE_bid:", math.sqrt(df_test["SE_bid"].mean()))
print("MSE_ask:", df_test["SE_ask"].mean(), "RMSE_ask:", math.sqrt(df_test["SE_ask"].mean()))
"""

"""timestamp = datetime.now().strftime("%d/%m/%Y_%H:%M")
filename = f"{colab_path}runs/data_w_validation/{first_year}-{last_year}-{timestamp}.csv"
filepath = Path(filename)  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_test.to_csv(filename)
"""
#df_test.info()
#print(df_test.head())

MSE from model: 3797.985
RMSE from model: 61.627796


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction"] = predictions.flatten()


'timestamp = datetime.now().strftime("%d/%m/%Y_%H:%M")\nfilename = f"{colab_path}runs/data_w_validation/{first_year}-{last_year}-{timestamp}.csv"\nfilepath = Path(filename)  \nfilepath.parent.mkdir(parents=True, exist_ok=True)  \ndf_test.to_csv(filename)\n'

In [None]:
def trainer2(train_x, train_y, model):
    epochs = 24
    minibatch_size = 2048

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        epochs = epochs
    )

model2 = create_model(config)
model2.summary()

trainer2(train_x_scaled, train_y_org, model2)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 32)                192       
                                                                 
 batch_normalization_3 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dense_5 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_4 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dense_6 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_5 (Batc  (None, 32)              

In [None]:
predictions2 = np.array(model2(test_x_scaled))

In [None]:
def prediction(df_test, predictions, model, train_y_org, test_y_org):
    df_test["Prediction"] = predictions.flatten()

    m = MeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("MSE from model:", m.result().numpy())
    m = RootMeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("RMSE from model:", m.result().numpy())

    return df_test

df_test2 = prediction(df_test_orginal, predictions2, model2, train_y_org, test_y_org)

MSE from model: 1634.7479
RMSE from model: 40.432014


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction"] = predictions.flatten()


In [None]:
from pathlib import Path
from datetime import datetime

In [None]:
time = datetime.now()
time = time.strftime("%m-%d_%H-%M")

filename = f"{colab_path}Predictions/{last_year}_predictions_{time}.csv"
filepath = Path(filename)
filepath.parent.mkdir(parents=True, exist_ok = True)
df_test2.to_csv(filename)