In [2]:
# initialization code
from google.colab import drive
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Mount Google Drive so the input CSV is reachable from the Colab runtime.
drive.mount('/content/drive')
datadir = "/content/drive/MyDrive/BBGdatasets/sampleDailyInputData_12-27-21.csv"

# Read the CSV into a DataFrame: the first column becomes the index (parsed
# as dates where possible) and the first two rows form a two-level header.
df = pd.read_csv(
    datadir,
    header=[0, 1],
    index_col=0,
    parse_dates=True,
)

# Collapse each two-level column label into a single space-separated string,
# so a label such as ('GT5 Govt', 'PX_LAST') becomes 'GT5 Govt PX_LAST'.
df.columns = [' '.join(levels).strip() for levels in df.columns.to_flat_index()]

# transform existing DataFrame by calculating yield spreads and assigning to new DataFrame
def yield_spreads_calc(df):
    """Return a copy of ``df`` extended with pairwise yield-spread columns.

    For every ordered pair of distinct instruments ``(a, b)`` a new column
    named ``yield_spread_<a>_<b>`` (spaces replaced by underscores) is
    appended, holding ``df['<a> PX_LAST'] - df['<b> PX_LAST']``.

    Args:
        df: DataFrame containing the columns 'MTGEFNCL Index PX_LAST',
            'LRC30APR Index PX_LAST', 'GT5 Govt PX_LAST' and
            'USGG5YR Index PX_LAST'.

    Returns:
        A new DataFrame with the original columns followed by the twelve
        spread columns; ``df`` itself is not modified.

    Raises:
        KeyError: if one of the required ``PX_LAST`` columns is missing.
    """
    instruments = ('MTGEFNCL Index', 'LRC30APR Index', 'GT5 Govt', 'USGG5YR Index')

    # Build the new columns in a fixed order: for each instrument, its spread
    # against every other instrument, in the order listed above.
    spread_columns = {}
    for minuend in instruments:
        for subtrahend in instruments:
            if minuend == subtrahend:
                continue
            label = 'yield_spread_{}_{}'.format(
                minuend.replace(' ', '_'),
                subtrahend.replace(' ', '_'),
            )
            spread_columns[label] = df[f'{minuend} PX_LAST'] - df[f'{subtrahend} PX_LAST']

    return df.assign(**spread_columns)
yield_spreads_df = yield_spreads_calc(df)

# Number of yield-spread columns. Derived from how many columns
# yield_spreads_calc appended, so it cannot drift out of sync with that
# function (12 for the four instruments currently used there).
n = yield_spreads_df.shape[1] - df.shape[1]
if n <= 0:
    # With n == 0 the slices below would silently select every column.
    raise ValueError("yield_spreads_calc did not append any yield-spread columns")

# Separate the appended yield spreads (last n columns) from the other market
# information (everything before them).
yield_spreads = yield_spreads_df.iloc[:, -n:]
other_columns = yield_spreads_df.iloc[:, :-n]

# Re-assemble with the yield spreads first, so the targets occupy the first
# n columns of the new DataFrame.
df_new = pd.concat([yield_spreads, other_columns], axis=1)

# MICE imputation of missing values. The result goes into a separate
# DataFrame so df_new is not overwritten.
# NOTE(review): the imputer is fitted on every row, including rows that are
# later placed in the validation and test splits - confirm this is acceptable.
imputer = IterativeImputer(random_state=100, max_iter=10)
imputer.fit(df_new)
df_imp = imputer.transform(df_new)

# Keep the original row index so imputed rows stay traceable to their dates.
# NOTE(review): column labels are deliberately not re-attached here; the
# imputer may return fewer columns than df_new if a column is entirely
# missing - confirm against the scikit-learn version in use before adding them.
data = pd.DataFrame(df_imp, index=df_new.index)

# Split the imputed data back into targets (first n columns) and the other
# market information (remaining columns), as NumPy arrays.
yield_spreads = data.iloc[:, :n].values
market_info = data.iloc[:, n:].values

# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build float32 tensors from the NumPy arrays directly on the chosen device.
X_yield_spreads = torch.tensor(yield_spreads, dtype=torch.float32, device=device)
X_market_info = torch.tensor(market_info, dtype=torch.float32, device=device)

# Model input: yield spreads followed by the other market information,
# joined column-wise. Both parts already live on `device`.
# NOTE(review): X contains the same yield-spread columns that make up y, so
# the network can read its targets straight from its inputs - confirm this
# is intended.
X = torch.cat([X_yield_spreads, X_market_info], dim=1)
y = torch.tensor(yield_spreads, dtype=torch.float32, device=device)

# Chronological split (no shuffling): the last 20% of rows form the test set,
# then the last 20% of the remainder form the validation set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=False
)

# No scaler is applied; the original author considered the features similar
# enough in scale not to need one.

# define the neural network model named Spread
class Spread(nn.Module):
    """Feed-forward network: two ReLU hidden layers and a linear output layer.

    Args:
        input_size: number of features in each input row.
        hidden_size_1: width of the first hidden layer.
        hidden_size_2: width of the second hidden layer.
        output: number of values produced for each input row.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2, output):
        super().__init__()
        # The attribute names fc1/fc2/fc3 are also the state_dict key
        # prefixes, so saved checkpoints depend on them.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size_1)
        self.fc2 = nn.Linear(in_features=hidden_size_1, out_features=hidden_size_2)
        self.fc3 = nn.Linear(in_features=hidden_size_2, out_features=output)

    def forward(self, x):
        """Map a batch of input rows to a batch of ``output``-sized predictions."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        # No activation on the last layer: outputs are unbounded real values.
        return self.fc3(hidden)

# Hyperparameters controlling the model size and the learning process.
input_size = data.shape[1]  # one input feature per column of the imputed data
hidden_size_1, hidden_size_2 = 264, 132
output = n  # one prediction per yield-spread column
batch_size = 64
num_epochs = 100
learning_rate = 0.001

# Instantiate the network and move its parameters to the selected device.
model = Spread(
    input_size=input_size,
    hidden_size_1=hidden_size_1,
    hidden_size_2=hidden_size_2,
    output=output,
).to(device)

# Mean squared error loss, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Wrap each split in a TensorDataset of (inputs, targets) pairs ...
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

# ... and serve it in mini-batches. No shuffling is requested, so batches
# follow the row order of each split.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# Train for num_epochs epochs; each epoch is one pass over the training set
# followed by one gradient-free pass over the validation set.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0

    for inputs, targets in train_loader:
        # Make sure the batch lives on the same device as the model.
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear old gradients, run the forward pass, and score it with MSE.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Back-propagate and update the weights.
        loss.backward()
        optimizer.step()

        # The criterion yields a batch mean; weight it by the batch size so
        # a smaller final batch does not skew the epoch average.
        train_loss += loss.item() * inputs.size(0)

    # Average training loss per sample for this epoch.
    train_loss = train_loss / len(train_loader.dataset)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            val_outputs = model(inputs)
            batch_loss = criterion(val_outputs, targets)

            val_loss += batch_loss.item() * inputs.size(0)

    # Average validation loss per sample for this epoch.
    val_loss = val_loss / len(val_loader.dataset)

    # Report both losses for the epoch (1-based epoch number).
    print(f"Epoch {epoch+1}: Train Loss = {train_loss}, Val Loss = {val_loss}")

# Persist only the learned parameters (the state_dict), not the whole model
# object, to the models folder on the mounted drive.
save_model_name = 'spread.pt'
PATH = f"/content/drive/MyDrive/BBGdatasets/MLmodels/{save_model_name}"

torch.save(model.state_dict(), PATH)

# Rebuild the architecture and restore the saved parameters into it.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only runtime.
model = Spread(input_size, hidden_size_1, hidden_size_2, output).to(device)
model.load_state_dict(torch.load(PATH, map_location=device))

# Evaluate the model on the test set, in the same way as the validation set:
# evaluation mode, no gradient tracking, size-weighted average of batch losses.
model.eval()
test_loss = 0
test_batches = []  # predictions of every test batch, moved to the CPU

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        test_outputs = model(inputs)
        batch_loss = criterion(test_outputs, targets)

        test_loss += batch_loss.item() * inputs.size(0)

        # Keep this batch's predictions. `test_outputs` is overwritten on
        # every iteration, so after the loop it only holds the final batch.
        test_batches.append(test_outputs.cpu())

test_loss /= len(test_loader.dataset)

print(f"Test Loss: {test_loss}")

# Stack the predictions of all batches into one (rows x spreads) NumPy array,
# so the statistics below cover the entire test set, not just the last batch.
predictions = torch.cat(test_batches, dim=0).numpy()

# Average predicted yield spread per spread column over all test rows.
avg_predicted_spreads = np.mean(predictions, axis=0)

# Names of the yield-spread columns: the first n columns of df_new.
instrument_names = df_new.columns[:n]

# Pair each spread name with its average predicted value.
instrument_spreads = [*zip(instrument_names, avg_predicted_spreads)]

# Order the pairs from the largest average predicted spread to the smallest.
ranked_instruments = list(instrument_spreads)
ranked_instruments.sort(key=lambda pair: pair[1], reverse=True)

# Print the ranking, one line per spread column, numbered from 1.
print("Instrument Rankings:")
for rank, (instrument_name, spread) in enumerate(ranked_instruments):
    print(f"Rank {rank+1}: Instrument {instrument_name} ({spread})")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1: Train Loss = 47.91761459842805, Val Loss = 20.579891204833984
Epoch 2: Train Loss = 13.434885517243416, Val Loss = 9.319538116455078
Epoch 3: Train Loss = 8.449710676746983, Val Loss = 3.788029670715332
Epoch 4: Train Loss = 2.9695329896865355, Val Loss = 2.2581076622009277
Epoch 5: Train Loss = 1.7725122321036555, Val Loss = 0.9515868425369263
Epoch 6: Train Loss = 1.270678339465972, Val Loss = 1.2812327146530151
Epoch 7: Train Loss = 0.7990342984276433, Val Loss = 0.18398714065551758
Epoch 8: Train Loss = 0.3533350344627134, Val Loss = 0.5708064436912537
Epoch 9: Train Loss = 0.4682333978914445, Val Loss = 0.21269479393959045
Epoch 10: Train Loss = 0.1789192158368326, Val Loss = 0.1712891012430191
Epoch 11: Train Loss = 0.2094864797207617, Val Loss = 0.07913187146186829
Epoch 12: Train Loss = 0.13178763995247503, Val Loss = 0.10813816636800766
Epoc