# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim

# Overview of new dataset V4.2: Rain 

In [None]:
path = "/kaggle/input/numerai-data-v42-rain/v4.2/"

# Show the contents of the round marker file that ships with the dataset.
with open('/kaggle/input/numerai-data-v42-rain/current_round.txt', 'r') as round_file:
    current_round = round_file.read()
print(f"{current_round}")

# features.json holds the named feature sets and the list of target columns.
with open(path+"features.json", "r") as metadata_file:
    feature_metadata = json.load(metadata_file)

feature_sets = feature_metadata["feature_sets"]
nr_features = len(feature_sets)
print(f"Additional feature sets have been added, there are now {nr_features} feature sets:")
# One (set name, number of columns) tuple per feature set.
for set_name, set_columns in feature_sets.items():
    print((set_name, len(set_columns)))

# "target" is the same as target_cyrus_v4_20, so it is not counted twice.
nr_targets = len(feature_metadata["targets"]) - 1
print(f"12 additional targets have been added, there are now {nr_targets} targets named:")
for target_name in feature_metadata["targets"]:
    print(target_name)

# Select features and targets for training

In [None]:
# Name of the feature set to train on; "small" is chosen for fast training.
FEATURE_SET = "small"
# Column names of the chosen feature set, looked up in the features.json metadata.
features = feature_metadata["feature_sets"][FEATURE_SET]
# Target column(s) to fit; a single 20-day target here.
targets = ["target_cyrus_v4_20"]
# Era stride used when slicing the data: 20-day targets overlap every 4th era,
# so only eras divisible by 4 are kept to avoid leakage.
slice_ct = 4
 
# If working with multiple targets uncomment below:

# # Choose targets to work with, this will affect data slicing
# # 20-day targets -> data overlaps every 4th era, slice data with 4
# # 60-day targets -> data overlaps every 12th era, slice data with 12
# TARGET = 20 # Select target 20 or 60 here
# assert TARGET in (20, 60)

# targets_v20 = [t for t in feature_metadata["targets"] if t.endswith('20') == True]
# targets_v60 = [t for t in feature_metadata["targets"] if t.endswith('60') == True]

# if(TARGET == 20):
#     targets = targets_v20
#     slice_ct = 4
# elif(TARGET == 60):
#     targets = targets_v60
#     slice_ct = 12
# else:
#     print("error, TARGET has to be 20 or 60")

# Slice train and validation sets to avoid overlaps giving data leakage

In [None]:
def _load_era_sliced(file_name, columns, era_step, base_path=path):
    """Read one parquet split and keep only eras divisible by ``era_step``.

    Args:
        file_name (str): Parquet file name, appended to ``base_path``.
        columns (list[str]): Columns to read from the file.
        era_step (int): Era stride; rows with ``era % era_step != 0`` are dropped.
        base_path (str): Directory prefix of the dataset (defaults to ``path``).

    Returns:
        pandas.DataFrame: The filtered frame with ``era`` cast to int.
    """
    frame = pd.read_parquet(base_path + file_name, columns=columns).reset_index()
    # Assign the column directly so it is *replaced* by the int version.
    # `frame.loc[:, "era"] = ...` writes into the existing column instead and,
    # under pandas >= 2.0, keeps that column's original dtype.
    frame["era"] = frame["era"].astype(int)
    return frame.loc[frame["era"] % era_step == 0]


read_columns = ['id', 'era'] + targets + features

# Each split is filtered right after loading, so the two unfiltered frames are
# never held in memory at the same time.
train_df = _load_era_sliced('train_int8.parquet', read_columns, slice_ct)
validation_df = _load_era_sliced('validation_int8.parquet', read_columns, slice_ct)

print("Data before removing nans:")
print(train_df.shape)
print(validation_df.shape)

# Remove rows that have nan
train_df = train_df.dropna()
validation_df = validation_df.dropna()

print("Data after removing nans:")
print(train_df.shape)
print(validation_df.shape)

In [None]:
# Preview the first rows of the era-sliced, NaN-free training frame.
train_df.head()

# Convert targets and features data to PyTorch tensors

In [None]:
def _float_tensor(frame, columns):
    """Copy the selected DataFrame columns into a new float tensor."""
    return torch.tensor(frame[columns].values).float()


# Features are divided by 4 (in place) to make sure values fall within 0-1;
# targets are used as loaded.
train_X = _float_tensor(train_df, features).div_(4)
train_y = _float_tensor(train_df, targets)
validate_X = _float_tensor(validation_df, features).div_(4)
validate_y = _float_tensor(validation_df, targets)

# The DataFrames are no longer needed once the tensors exist.
del train_df, validation_df

In [None]:
# Print the feature/target tensor shapes of each split (train first, then validation).
for split_X, split_y in ((train_X, train_y), (validate_X, validate_y)):
    print(split_X.shape, split_y.shape)

# Create simple neural net

In [None]:
# Helper function to plot loss
def plot_loss(train_loss, val_loss):
    """
    Plot training and validation loss at every epoch.

    Args:
        train_loss (dict or sequence): Per-epoch training loss, either a mapping of
            epoch -> loss (as built by ``training_loop``) or a plain sequence of
            loss values in epoch order.
        val_loss (dict or sequence): Per-epoch validation loss, same forms as
            ``train_loss``.

    Raises:
        ValueError: If the two loss histories have different lengths.
    """
    def _as_values(history):
        # Dicts contribute their values in insertion order; anything else is
        # treated as an already-ordered sequence of losses.
        return list(history.values()) if isinstance(history, dict) else list(history)

    train_values = _as_values(train_loss)
    val_values = _as_values(val_loss)
    if len(train_values) != len(val_values):
        raise ValueError(
            "train_loss and val_loss must have the same length, got {} and {}".format(
                len(train_values), len(val_values)))

    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(train_values) + 1)
    plt.plot(epochs, train_values, color='tab:blue', marker='o', label='Training Loss')
    plt.plot(epochs, val_values, color='tab:orange', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Define training loop
def training_loop(n_epochs, optimizer, model, loss_fn, train_X, validate_X, train_y, validate_y):
    """
    Train ``model`` for ``n_epochs`` full-batch steps and plot the loss curves.

    Every epoch runs one forward/backward pass over all of ``train_X`` and one
    optimizer step. The validation loss is measured on the same pre-step weights
    as the training loss.

    Args:
        n_epochs (int): Number of epochs (optimizer steps) to run.
        optimizer (torch.optim.Optimizer): Optimizer bound to ``model``'s parameters.
        model (torch.nn.Module): Network to train; updated in place.
        loss_fn (callable): ``loss_fn(prediction, target)`` returning a scalar tensor.
        train_X (torch.Tensor): Training features.
        validate_X (torch.Tensor): Validation features.
        train_y (torch.Tensor): Training targets.
        validate_y (torch.Tensor): Validation targets.

    Returns:
        torch.nn.Module: The same ``model`` object, after training.
    """
    train_loss_dict = {}
    val_loss_dict = {}
    for epoch in range(1, n_epochs + 1):
        y_pred_train = model(train_X)
        train_loss = loss_fn(y_pred_train, train_y)

        # Validation is monitoring only: run it in eval mode and without building
        # an autograd graph, then restore whatever mode the model was in.
        was_training = model.training
        model.eval()
        with torch.no_grad():
            y_pred_val = model(validate_X)
            val_loss = loss_fn(y_pred_val, validate_y)
        model.train(was_training)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        train_loss_dict[epoch] = train_loss.item()
        val_loss_dict[epoch] = val_loss.item()
        # Log the first epoch and then every 10th epoch.
        if epoch <= 1 or epoch % 10 == 0:
            print('Epoch {}, Training loss {}, Validation loss {}'.format(
                epoch, train_loss_dict[epoch], val_loss_dict[epoch]))
    plot_loss(train_loss_dict, val_loss_dict)
    return model

In [None]:
# Build a simple sequential model: one hidden layer as wide as the input,
# a tanh activation, and a linear output with one unit per target column.
from collections import OrderedDict

input_dim = train_X.shape[1]
output_dim = train_y.shape[1]
namedseq_model = nn.Sequential(OrderedDict(
    hidden_linear=nn.Linear(input_dim, input_dim),
    hidden_activation=nn.Tanh(),
    output_linear=nn.Linear(input_dim, output_dim),
))
namedseq_model

In [None]:
# Train the named sequential model for 100 epochs with Adam on an MSE objective.
model = namedseq_model
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
final_model = training_loop(
    100,
    optimizer,
    model,
    nn.MSELoss(),
    train_X=train_X,
    validate_X=validate_X,
    train_y=train_y,
    validate_y=validate_y,
)

In [None]:
# List every (name, parameter) pair of the trained model for inspection.
list(final_model.named_parameters())

In [None]:
# Display the trained model's outputs on the training and validation features as a tuple.
(final_model(train_X), final_model(validate_X))

The loss decreased and stabilized, but the model has essentially learned to predict 0.5 for every input. That is not very useful, because Numerai scores predictions with a rank-based (Spearman) correlation metric, and near-constant predictions carry almost no ranking information.

In [None]:
# Final loss check: mean squared error of the trained model on the validation split.
validation_residual = validate_y - final_model(validate_X)
mse = validation_residual.pow(2).mean()
print(mse)

# Improving the neural net
Now that we have an initial net running, let's try to make some meaningful adjustments.