In [1]:
from preprocessing import get_model_dataset, create_train_test, min_max_scale, df_to_xy, read_file, lag_features
from lstm import create_model
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import date
from pathlib import Path

In [2]:
first_year = 2019
last_year = 2021
file = f"./data/processed_data/{first_year}-{last_year}_underlying-strike_only-price.csv"

df_read = read_file(file)
print(df_read)
df_read.info()
print(df_read)
print(df_read["Ttl"].max())

         Unnamed: 0  Quote_date Expire_date     Price  Underlying_last  \
0           1354913  2019-01-02  2019-01-04  1707.050          2509.98   
1           1354914  2019-01-02  2019-01-04  1607.495          2509.98   
2           1354915  2019-01-02  2019-01-04  1507.500          2509.98   
3           1354916  2019-01-02  2019-01-04  1458.295          2509.98   
4           1354917  2019-01-02  2019-01-04  1408.300          2509.98   
...             ...         ...         ...       ...              ...   
5123793     6521988  2021-12-31  2024-12-20   150.000          4766.39   
5123794     6521989  2021-12-31  2024-12-20   150.000          4766.39   
5123795     6521990  2021-12-31  2024-12-20   150.900          4766.39   
5123796     6521991  2021-12-31  2024-12-20   150.000          4766.39   
5123797     6521992  2021-12-31  2024-12-20   150.000          4766.39   

         Strike   Ttl  Volatility     R  
0         800.0     2    0.202726  2.40  
1         900.0     2    0.

In [3]:
features = ["Underlying_last", "Strike", "Ttl", "Volatility", "R"]
seq_length = 5
num_features = len(features)
num_outputs = 1

df_read_lags = lag_features(df_read, features, seq_length)

df_train_orginal, df_test_orginal = create_train_test(df_read_lags, "2021-01-01")

train_x_org, train_y_org, = df_to_xy(df_train_orginal, num_features, seq_length, num_outputs)
test_x_org, test_y_org = df_to_xy(df_test_orginal, num_features, seq_length, num_outputs)

train_x_scaled, test_x_scaled = min_max_scale(train_x_org, test_x_org)
#train_y_scaled, test_y_scaled = min_max_scale(train_y_org, test_y_org)

"""shuffle = np.random.permutation(len(train_x_scaled))
train_x_scaled, train_y_scaled = train_x_scaled[shuffle], train_y_scaled[shuffle]"""

train_x_scaled = np.reshape(train_x_scaled, (len(train_x_scaled), seq_length, num_features))
test_x_scaled = np.reshape(test_x_scaled, (len(test_x_scaled), seq_length, num_features))

print(f"Train_x shape: {train_x_scaled.shape}, train_y shape: {train_y_org.shape}")
print(f"Test_x shape: {test_x_scaled.shape}, test_y shape: {test_y_org.shape}")

Train_x shape: (2785226, 5, 5), train_y shape: (2785226, 1)
Test_x shape: (1845482, 5, 5), test_y shape: (1845482, 1)


In [8]:
df_to = df_test_orginal

df_a = df_to[(df_to["Expire_date"] == "2021-05-19") & (df_to["Strike"] == 3500)]

print(df_a)

         Unnamed: 0  Quote_date Expire_date    Price  Underlying_last  Strike  \
3681828     5061800  2021-04-26  2021-05-19  683.345          4188.00  3500.0   
3689985     5069957  2021-04-27  2021-05-19  686.200          4186.62  3500.0   
3698188     5078160  2021-04-28  2021-05-19  681.205          4184.51  3500.0   
3706422     5086394  2021-04-29  2021-05-19  708.350          4211.60  3500.0   
3714363     5094335  2021-04-30  2021-05-19  678.150          4181.81  3500.0   
3722203     5102175  2021-05-03  2021-05-19  690.745          4192.80  3500.0   
3730294     5110266  2021-05-04  2021-05-19  662.940          4165.88  3500.0   
3738279     5118251  2021-05-05  2021-05-19  663.800          4168.14  3500.0   
3746264     5126236  2021-05-06  2021-05-19  699.150          4201.46  3500.0   
3754097     5134069  2021-05-07  2021-05-19  730.300          4232.05  3500.0   
3761887     5141859  2021-05-10  2021-05-19  687.605          4188.83  3500.0   
3770004     5149976  2021-05

In [11]:
df_ax, df_ay = df_to_xy(df_a, num_features, seq_length, num_outputs)

print(df_ax[-1])
print(df_ay[-1])

[4.0632900e+03 3.5000000e+03 7.0000000e+00 1.4681862e-01 9.9999998e-03
 4.1119702e+03 3.5000000e+03 6.0000000e+00 1.4768696e-01 0.0000000e+00
 4.1734800e+03 3.5000000e+03 5.0000000e+00 1.4705545e-01 9.9999998e-03
 4.1636001e+03 3.5000000e+03 2.0000000e+00 1.4687872e-01 0.0000000e+00
 4.1278101e+03 3.5000000e+03 1.0000000e+00 1.4760363e-01 0.0000000e+00]
[629.3]


In [4]:
from keras.callbacks import EarlyStopping
config = {
    "units": 32,
    "learning_rate": 0.0015,
    "layers": 4,
    "seq_length": seq_length,
    "num_features": num_features,
    "bn_momentum" : 0.4,
    "clip_norm": 0.7
}

def trainer(train_x, train_y, model):
    epochs = 100
    minibatch_size = 1024

    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta = 10,
        patience = 5,
    )

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        validation_split = 0.3,
        epochs = epochs,
        callbacks = [early_stopping]
    )

model = create_model(config)
model.summary()

trainer(train_x_scaled, train_y_org, model)

"""path = f"./runs/model_w_validation/{first_year}-{last_year}-{date.today()}"
model.save(path)"""

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 5, 32)             4864      
                                                                 
 batch_normalization (BatchN  (None, 5, 32)            128       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 5, 32)             8320      
                                                                 
 batch_normalization_1 (Batc  (None, 5, 32)            128       
 hNormalization)                                                 
                                                                 
 lstm_2 (LSTM)               (None, 5, 32)             8320      
                                                                 
 batch_normalization_2 (Batc  (None, 5, 32)            1

'path = f"./runs/model_w_validation/{first_year}-{last_year}-{date.today()}"\nmodel.save(path)'

In [9]:
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
predictions = np.array(model(test_x_scaled))


In [10]:
def prediction(df_test, predictions, model, train_y_org, test_y_org):
    bid, ask = predictions[:, :1].flatten(), predictions[:, 1:].flatten()
    df_test["Prediction_bid_strike"] = bid
    df_test["Prediction_ask_strike"] = ask
    df_test["Prediction_bid"] = df_test["Prediction_bid_strike"] * df_test["Strike"]
    df_test["Prediction_ask"] = df_test["Prediction_ask_strike"] * df_test["Strike"]


    m = MeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("MSE from model:", m.result().numpy())
    m = RootMeanSquaredError()
    m.update_state(test_y_org, predictions)
    print("RMSE from model:", m.result().numpy())

    return df_test

df_test = prediction(df_test_orginal, predictions, model, train_y_org, test_y_org)

"""filename = f"./runs/data_w_validation//{first_year}-{last_year}-{date.today()}.csv"
filepath = Path(filename)  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_test.to_csv(filename)"""

df_test.info()
print(df_test.head())

MSE from model: 5553.525
RMSE from model: 74.52197
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1845482 entries, 3097811 to 5123797
Data columns (total 43 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   Quote_date             object 
 2   Expire_date            object 
 3   Ask                    float64
 4   Bid                    float64
 5   Underlying_last        float64
 6   Strike                 float64
 7   Ttl                    int64  
 8   Volatility             float64
 9   R                      float64
 10  Underlying_last-4      float64
 11  Strike-4               float64
 12  Ttl-4                  float64
 13  Volatility-4           float64
 14  R-4                    float64
 15  Underlying_last-3      float64
 16  Strike-3               float64
 17  Ttl-3                  float64
 18  Volatility-3           float64
 19  R-3                    float64
 20  Underlying_last-2      float64
 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction_bid_strike"] = bid
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction_ask_strike"] = ask
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction_bid"] = df_test["Prediction_bid_strike"] * df_test["Strike"]
A value is trying to be set on a copy of a sli

In [6]:
print(df_test.head(20)[["Bid", "Ask", "Prediction_bid", "Prediction_ask", "SE_bid", "SE_ask", "Prediction_bid_strike", "Prediction_ask_strike"]])

             Bid      Ask  Prediction_bid  Prediction_ask        SE_bid  \
3097811  2699.91  2703.80    9.735468e+09    9.770265e+09  9.477928e+19   
3104882  2724.19  2728.20    9.776310e+09    9.812406e+09  9.557618e+19   
3097812  2595.90  2601.69    1.030207e+10    1.034309e+10  1.061325e+20   
3104883  2624.70  2628.71    1.035156e+10    1.039414e+10  1.071548e+20   
3097813  2498.29  2502.10    1.079898e+10    1.084613e+10  1.166179e+20   
3104884  2523.60  2529.00    1.086084e+10    1.090991e+10  1.179579e+20   
3097814  2398.29  2402.29    1.122484e+10    1.127768e+10  1.259970e+20   
3104885  2424.70  2428.70    1.129715e+10    1.135218e+10  1.276255e+20   
3097815  2298.30  2302.30    1.157228e+10    1.162959e+10  1.339177e+20   
3104886  2324.69  2328.70    1.165620e+10    1.171603e+10  1.358670e+20   
3097816  2199.80  2203.80    1.184022e+10    1.190166e+10  1.401909e+20   
3104887  2223.39  2228.69    1.193669e+10    1.200072e+10  1.424844e+20   
3097817  2100.30  2104.11

In [7]:
print(f"MAE: {df_test['MAE_bid'].mean()}")
plt.scatter(df_test["Bid"], df_test["MAE_bid"])
plt.show()

KeyError: 'MAE_bid'

In [None]:
print(f"MAE: {df_test['Raw_mae_bid'].mean()}")
plt.scatter(train_y_scaled[:,:1], df_test["Raw_mae_bid"])
plt.show()

In [None]:
print(f"MAE: {df_test['MAE_ask'].mean()}")
plt.scatter(df_test["Ask"], df_test["MAE_ask"])
plt.show()

In [None]:
plt.scatter(df_train_orginal["Strike"], df_train_orginal["Moneyness"])
plt.show()