In [1]:
!pip install --quiet -r requirements.txt

[0m

In [2]:
!nvidia-smi

Sun Dec  8 02:34:28 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        Off |   00000000:01:00.0  On |                  N/A |
|  0%   41C    P5             21W /  170W |     967MiB /  12288MiB |     44%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
import pandas as pd
import polars as pl
import random

import lightgbm as lgb
import xgboost as xgb
#import catboost as cbt # needs numpy <2.0
import numpy as np
#from hyperopt import hp, fmin, tpe, Trials
#from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
#from tqdm.notebook import tqdm
#from joblib import dump, load
#import datatable as dtable
#from mlxtend.evaluate import GroupTimeSeriesSplit
import kaggle_evaluation.jane_street_inference_server as js_server

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets
from torchvision.transforms import ToTensor

In [4]:
# check that torch is working and sees the GPU
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

2.4.1+cu121
True
NVIDIA GeForce RTX 3060


In [5]:
collected = gc.collect()
# Prints Garbage collector 
# as 0 object
print("Garbage collector: collected",
          "%d objects." % collected)

Garbage collector: collected 72 objects.


# Globals and prep

In [6]:
PATH = os.getcwd() + '/input/'

METAS = ['date_id', 'time_id', 'symbol_id', 'weight']
FEATURES = [f'feature_{i:02}' for i in range(79)]
RESPONDERS = [f'responder_{i}' for i in range(9)]
RESPONDERS_LAGS = [f'responder_{i}_lag_1' for i in range(9)]
TARGET = 'responder_6'
FEATURES_AND_LAGS=True
if FEATURES_AND_LAGS:
    FEATURES_WORKING = FEATURES + RESPONDERS + RESPONDERS_LAGS
    FEATURES_WORKING.remove(TARGET)
else:
    FEATURES_WORKING = FEATURES
print(FEATURES_WORKING)
SEQUENCE_LEN = 16 # not sure what this is for

SEED = 728

TEST=True

['feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feat

In [7]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Ensure deterministic behavior (may impact performance)
        #torch.backends.cudnn.deterministic = False
        #torch.backends.cudnn.benchmark = False

def lazy_load(par_path):
    return pl.scan_parquet(par_path).select(
        pl.int_range(pl.len(), dtype=pl.UInt64).alias("index"),
        pl.all()
    )
seed_everything(SEED) # just do it

# Use synthetic test?

In [8]:
# # this is just copied boilerplate, don't have synthetic testing atm
# USE_SYNTHETIC = False

# if USE_SYNTHETIC:
#     syn_dir = '/kaggle/input/js24-rmf-generate-synthetic-test-data'
#     test_parquet = f'{syn_dir}/synthetic_test.parquet'
#     lag_parquet = f'{syn_dir}/synthetic_lag.parquet'
#     total_time_steps = pl.scan_parquet(test_parquet).select(
#         (pl.col("date_id")*10000+pl.col('time_id')).n_unique()   
#         ).collect().item()
# else:
#     test_parquet_path = PATH + 'test.parquet/'
#     lag_parquet_path =  PATH + 'lags.parquet/'
#     total_time_steps = 1
    
# print("Test parquet:", test_parquet_path)
# print("Lag parquet:", lag_parquet_path)
# print("Total time steps:", total_time_steps)

# Load data

In [9]:
if TEST:
    train_df = pl.read_parquet(PATH+'training_TEST.parquet') # this parquet has lags
    valid_df = pl.read_parquet(PATH+'validation_TEST.parquet') # this parquet has lags
else:
    train_df = pl.scan_parquet(PATH+'training.parquet').collect() # I think this will scan over all the subdirs, also thise has lags
    valid_df = pl.scan_parquet(PATH+'validation.parquet').collect()

feature_tags = pl.read_csv(PATH+'features.csv') # no one seems to use this....
gc.collect()

0

# Preprocessing

In [10]:
print('Filling...')
print(train_df["weight"].gt(0).sum() == train_df.shape[0]) # check if weights>0 is true for entire dataframe
train_df = train_df.select([
    pl.col(c).fill_null(pl.col(c).mean()).alias(c) for c in train_df.columns
])
print(train_df.head())

valid_df = valid_df.select([
    pl.col(c).fill_null(pl.col(c).mean()).alias(c) for c in valid_df.columns
])
print(valid_df.head())

Filling...
True
shape: (5, 103)
┌────────────┬─────────┬─────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ id         ┆ date_id ┆ time_id ┆ symbol_id ┆ … ┆ responder_ ┆ responder_ ┆ responder ┆ responder │
│ ---        ┆ ---     ┆ ---     ┆ ---       ┆   ┆ 5_lag_1    ┆ 6_lag_1    ┆ _7_lag_1  ┆ _8_lag_1  │
│ f64        ┆ f64     ┆ f64     ┆ f64       ┆   ┆ ---        ┆ ---        ┆ ---       ┆ ---       │
│            ┆         ┆         ┆           ┆   ┆ f32        ┆ f32        ┆ f32       ┆ f32       │
╞════════════╪═════════╪═════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 2.5023058e ┆ 1101.0  ┆ 0.0     ┆ 0.0       ┆ … ┆ 0.112157   ┆ 0.011801   ┆ 0.033876  ┆ 0.009129  │
│ 7          ┆         ┆         ┆           ┆   ┆            ┆            ┆           ┆           │
│ 2.5023059e ┆ 1101.0  ┆ 0.0     ┆ 1.0       ┆ … ┆ 0.112157   ┆ 0.011801   ┆ 0.033876  ┆ 0.009129  │
│ 7          ┆         ┆         ┆           ┆   ┆         

In [11]:
# for the purged group time series split, code is copied from somewhere
# TODO: make GitHub GIST
# TODO: add as dataset
# TODO: add logging with verbose

from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.|
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [12]:
# from matplotlib.colors import ListedColormap
    
# # this is code slightly modified from the sklearn docs here:
# # https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
# def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
#     """Create a sample plot for indices of a cross-validation object."""
    
#     cmap_cv = plt.cm.coolwarm

#     jet = plt.cm.get_cmap('jet', 256)
#     seq = np.linspace(0, 1, 256)
#     _ = np.random.shuffle(seq)   # inplace
#     cmap_data = ListedColormap(jet(seq))

#     # Generate the training/testing visualizations for each CV split
#     for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
#         # Fill in indices with the training/test groups
#         indices = np.array([np.nan] * len(X))
#         indices[tt] = 1
#         indices[tr] = 0

#         # Visualize the results
#         ax.scatter(range(len(indices)), [ii + .5] * len(indices),
#                    c=indices, marker='_', lw=lw, cmap=cmap_cv,
#                    vmin=-.2, vmax=1.2)

#     # Plot the data classes and groups at the end
#     ax.scatter(range(len(X)), [ii + 1.5] * len(X),
#                c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

#     ax.scatter(range(len(X)), [ii + 2.5] * len(X),
#                c=group, marker='_', lw=lw, cmap=cmap_data)

#     # Formatting
#     yticklabels = list(range(n_splits)) + ['target', 'day']
#     ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
#            xlabel='Sample index', ylabel="CV iteration",
#            ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
#     ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
#     return ax


# fig, ax = plt.subplots()
# #plot_cv_indices(cv, X_train, y_train, groups, ax, 5, lw=20)
# plot_cv_indices(
#     cv,
#     train[features].values,
#     train['responder_6'].values,
#     train['date_id'].values,
#     ax,
#     5,
#     lw=20
# )
# rubbish=gc.collect()

# code structure copied/inspired from https://www.kaggle.com/code/shiyili/js2024-rmf-mlp-inference-pytorch

In [13]:
### Define various helper functions including r2 score
writer = SummaryWriter()
# get cpu, gpu or mps device for training
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

def reset_weights(m):
    '''
    Try resetting model weights to avoid
    weight leakage.
    '''
    for layer in m.children():
        if hasattr(layer, 'reset_parameters'):
            print(f'Reset trainable parameters of layer = {layer}')
            layer.reset_parameters()

# for monitoring layer weights
def weight_histograms(writer, step, model):
    print("Visualizing model weights...")
    # Iterate over all model layers
    for layer_number in range(len(model.layers)):
        layer = model.layers[layer_number]
        try:
            weights = layer.weight
            flattened_weights = weights.flatten()
            tag = f"layer_{layer_number}"
            writer.add_histogram(tag, flattened_weights, global_step=step, bins='tensorflow')
        except AttributeError:
            return
            
# loss is as defined on competition homepage
# score will be = 1 - loss
def r2_loss(outputs, targets, weights):
    loss = torch.sum(weights*(targets - outputs)**2) / (torch.sum(weights*targets**2)+1e-38)
    return loss

# "standard" loss function
test_loss_function= nn.MSELoss()

Using cuda device


In [14]:
# Basic LSTM model
class LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.lstm = nn.LSTM(input_size=self.input_dim, hidden_size=self.hidden_dim, num_layers=1, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, self.output_dim)
        
    def forward(self, x):
        '''Forward pass'''
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x.squeeze()

# copied LSTM equivalent model
class LSTMEquavalentMLP(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(LSTMEquavalentMLP, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.input_gate = nn.Linear(input_size, hidden_size)
        self.candidate_gate = nn.Linear(input_size, hidden_size)
        self.output_gate = nn.Linear(input_size, hidden_size)
        
    def forward(self, x):
        
        i_t = torch.sigmoid(self.input_gate(x)) # input gate
        c_t = torch.tanh(self.candidate_gate(x)) # candidate gate
        o_t = torch.sigmoid(self.output_gate(x)) # output gate
        
        h_t = o_t * torch.tanh(c_t * i_t)
        
        return h_t
    
class LSTMFeedForward(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, dropout_rates=0):
        super(LSTMFeedForward, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = LSTMEquavalentMLP(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        h_t = self.lstm(x)
        y = self.fc(h_t).squeeze()
        return y

In [15]:
# training function
def train(model, trainloader, optimizer, num_epochs, writer, save=True):
    model.train()
    # Run the training loop for defined number of epochs
    for epoch in range(0, num_epochs):
        # Visualize weight histograms
        #weight_histograms(writer, epoch, model)
        # Print epoch
        print(f'Starting epoch {epoch+1}')
        
        # Set current loss value
        current_loss = 0.0
        # Iterate over Dataloader for training data
        for i, data in enumerate(trainloader):
            # Get inputs
            inputs, targets, weights = data
            # GPU must see the data and the model
            inputs = inputs.to(device) 
            targets = targets.to(device)
            weights = weights.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Perform forward pass
            outputs = model(inputs)
            
            # Compute loss
            loss = r2_loss(outputs, targets, weights)
            current_loss += loss.item()
            if i % 1000 == 0:
                print('Loss after mini-batch %5d: %.3f' %
                      (i + 1, current_loss / 1000))
                current_loss = 0.0
            # Perform backward pass
            loss.backward()
            # Perform optimization
            optimizer.step()

        # Finished iterating over Dataloader
        writer.add_scalar("Loss/train/epoch", loss, epoch)
        writer.flush()
      
    # Saving the model
    if save:
        save_path = f'./models/lstm_{fold}.pth'
        torch.save(model.state_dict(), save_path)
    return

# testing function
def test(model, testloader, writer):
    # Evaluation for this fold
    R2_score = 0.0
    total_loss = 0.0
    with torch.no_grad():
        model.eval()
        # Iterate over the test data and generate predictions
        for i, data in enumerate(testloader):
            # Get inputs
            inputs, targets, weights = data
            inputs = inputs.to(device) # GPU must see the data and the model
            targets = targets.to(device)
            weights = weights.to(device)
        
            # Generate outputs
            outputs = model(inputs)
            loss = r2_loss(outputs, targets, weights)
            # writer.add_scalar('Loss/test/minibatches', loss, batch_test_tally)
            # batch_test_tally += 1
            total_loss += loss
        return total_loss

In [16]:
LSTM_FF_CONFIG = {
    'input_size':  len(FEATURES_WORKING),
    'hidden_size': 64,
    'output_size': 1,
    'dropout_rates': 0
}

TRAINING_CONFIG = {
    'num_folds': 1,
    'num_epochs': 10,
    'learning_rate': 1e-4,
    'test_loss': False
}

fold_function = 'PurgedGroupTimeSeries'
# fold_function = 'KFold'
fold_function = 'None'

# For holding fold results
results = {}

In [17]:
# should test models with only features, features + responders, + lags, etc...
if fold_function == 'PurgedGroupTimeSeries':
    X = train_df[FEATURES_WORKING].to_numpy()
    y = train_df[TARGET].to_numpy()
    group = train_df['date_id'].to_numpy()
    w = train_df['weight'].to_numpy()
    # Define the PurgedGroupTimeSeriesCV
    cv = PurgedGroupTimeSeriesSplit(
    n_splits=folds,
    max_train_group_size=80,
    group_gap=10,
    max_test_group_size=20
    )
    folds = cv.split(X=X, y=y, groups=group)
elif fold_function == 'KFold': # TODO
    folds = _
elif fold_function == 'None' :
    folds = [([0,0],[0,0])]
    X_tr, y_tr, w_tr = train_df[FEATURES_WORKING].to_numpy(), train_df[TARGET].to_numpy(), train_df['weight'].to_numpy()
    X_te, y_te, w_te = valid_df[FEATURES_WORKING].to_numpy(), valid_df[TARGET].to_numpy(), valid_df['weight'].to_numpy()
else:
    folds = _
#neat hack to ensure the original dataframe is gone
del train_df, valid_df
gc.collect()
train_df = pl.DataFrame()
valid_df = pl.DataFrame()

In [18]:
### Train and cross validation loop
for fold, (train_ids, test_ids) in enumerate(folds):

    # Print
    print(f'FOLD {fold}')
    print(f'train_ids: [{train_ids[0]}, {train_ids[-1]}]')
    print(f'test_ids: [{test_ids[0]}, {test_ids[-1]}]')
    if fold_function != 'None':
        print("Using a fold function...")
        # define train/test sets
        X_tr, y_tr, w_tr = X[train_ids], y[train_ids], w[train_ids]
        X_te, y_te, w_te = X[test_ids], y[test_ids], w[test_ids]
    
    print('--------------------------------')

    # Initialize Dataset objects to make PyTorch play nice
    trainset = TensorDataset(torch.tensor(X_tr).to(torch.float32), torch.tensor(y_tr), torch.tensor(w_tr))
    testset = TensorDataset(torch.tensor(X_te).to(torch.float32), torch.tensor(y_te), torch.tensor(w_te))
    
    # Define data loaders
    BATCH_SIZE = 2048
    NUM_WORKERS = 4 # num of parallel subprocesses for data loading (CPU task)
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=BATCH_SIZE,
        num_workers = NUM_WORKERS
    )
    testloader = torch.utils.data.DataLoader(
        testset,
        batch_size=BATCH_SIZE,
        num_workers = NUM_WORKERS
    )

    # Init the model
    model = LSTMFeedForward(**LSTM_FF_CONFIG).to(device)
    model.apply(reset_weights) # not sure if I need this but w/e
    # Initialize optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=TRAINING_CONFIG['learning_rate'])
    train(model, trainloader, optimizer, TRAINING_CONFIG["num_epochs"], writer, save=True)
    # Process is complete.
    print('Training process has finished')
    
    # Print about testing
    print('Starting testing')
    
    total_loss = test(model, testloader, writer)
    num_batches = len(testloader)
    results[fold] = total_loss / num_batches
    writer.add_scalar('Loss/test/fold', results[fold], fold)
    writer.flush()
    
    # Print fold results
    print(f'CROSS VALIDATION RESULTS FOR {folds} FOLDS')
    print('--------------------------------')
    sum = 0.0
    for key, value in results.items():
        print(f'Fold {key}: {value}')
        sum += value
    print(f'Average: {sum/len(results.items())}')
writer.close()
rubbish=gc.collect()

FOLD 0
train_ids: [0, 0]
test_ids: [0, 0]
--------------------------------
Reset trainable parameters of layer = Linear(in_features=96, out_features=64, bias=True)
Reset trainable parameters of layer = Linear(in_features=96, out_features=64, bias=True)
Reset trainable parameters of layer = Linear(in_features=96, out_features=64, bias=True)
Reset trainable parameters of layer = Linear(in_features=64, out_features=1, bias=True)
Starting epoch 1
Loss after mini-batch     1: 0.001
Loss after mini-batch  1001: 0.802
Loss after mini-batch  2001: 0.372
Loss after mini-batch  3001: 0.160
Loss after mini-batch  4001: 0.101
Loss after mini-batch  5001: 0.097
Loss after mini-batch  6001: 0.505
Loss after mini-batch  7001: 0.362
Loss after mini-batch  8001: 0.693
Loss after mini-batch  9001: 0.459
Starting epoch 2
Loss after mini-batch     1: 0.000
Loss after mini-batch  1001: 0.135
Loss after mini-batch  2001: 0.105
Loss after mini-batch  3001: 0.127
Loss after mini-batch  4001: 0.098
Loss after 

In [19]:
# lmao
# out = outputs.cpu().detach().numpy()
# tar = targets.cpu().detach().numpy()
# wei = weights.cpu().detach().numpy()
# print(out[10,0])
# nbins = 20
# plt.hist(tar, color='orange', label='targets', bins=nbins)
# plt.hist(wei, color='green', label='weights', bins=nbins)
# plt.hist(out[:,0], color='blue', bins=nbins, label='predicted')
# plt.legend()