- Notebook modified from https://www.kaggle.com/code/markwijkhuizen/planttraits2024-eda-training-pub.
- Training only, EDA part not included.
- Image model only, tabular data not used.

Modified from HDJOJO's original notebook with SWIN Transformer

In [2]:
import glob
import os
import time
from pathlib import Path

import albumentations as A
import imageio.v3 as imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import timm
import torch
import torchmetrics
from albumentations.pytorch import ToTensorV2
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

tqdm.pandas()

In [23]:
class Config():
    """Hyper-parameters and run settings shared by the cells of this notebook.

    All values are class attributes; the notebook reads them through the
    module-level ``CONFIG`` instance created right after the class. Later cells
    attach further attributes (e.g. step counts) to that instance.
    """

    IMAGE_SIZE = 256 #384 # Sample: [224, 224]
#     BACKBONE = 'swin_large_patch4_window12_384.ms_in22k_ft_in1k'
    BACKBONE = 'swinv2_tiny_window16_256'
    TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
    N_TARGETS = len(TARGET_COLUMNS)
    BATCH_SIZE = 10 # Sample: 96
    LR_MAX = 1e-4
    WEIGHT_DECAY = 0.01
    N_EPOCHS = 6 # Sample: 12
    TRAIN_MODEL = True
    # `.get` instead of `[...]`: the variable is absent outside Kaggle, and a
    # KeyError here would stop the whole notebook at the config cell.
    IS_INTERACTIVE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') == 'Interactive'

    # Device that model inputs are moved to before inference
    DEVICE = 'cpu'
    # Added variables
    NUM_FOLDS = 5
    VALID_FOLD = 0  # Fold of validation data

CONFIG = Config()

In [4]:
# Read in training data and keep every image's raw JPEG bytes in memory
train_df = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
train_df['file_path'] = train_df['id'].apply(lambda s: f'/kaggle/input/planttraits2024/train_images/{s}.jpeg')
# Path.read_bytes() closes the file before returning; a bare open(fp, 'rb').read()
# leaves closing each of the handles to the garbage collector.
train_df['jpeg_bytes'] = train_df['file_path'].progress_apply(lambda fp: Path(fp).read_bytes())
# train_df.to_pickle('train.pkl') # serialize object into string form

  0%|          | 0/55489 [00:00<?, ?it/s]

### Data Filtering

In [5]:
# Optional sub-sampling for faster training (currently disabled, so the full set is used)
n_rows_before_sampling = len(train_df)
print("Previous length:", n_rows_before_sampling)
# train_df = train_df.sample(frac=0.3, random_state=42)
# print("Sampled length:", len(train_df))

Previous length: 55489


In [6]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=CONFIG.NUM_FOLDS, shuffle=True, random_state=42)

# Create a separate quantile bin for each trait
for i, trait in enumerate(CONFIG.TARGET_COLUMNS):
    # Bin edges follow the trait's own distribution (NUM_FOLDS equal-mass bins).
    # NOTE: the edges include the 0th and 100th percentile, so np.digitize puts
    # values equal to the column maximum into an extra bin NUM_FOLDS + 1. This is
    # kept as-is so the resulting split does not change.
    bin_edges = np.percentile(train_df[trait], np.linspace(0, 100, CONFIG.NUM_FOLDS + 1))
    train_df[f"bin_{i}"] = np.digitize(train_df[trait], bin_edges)

# Concatenate the per-trait bins into one stratification label per row
train_df["final_bin"] = (
    train_df[[f"bin_{i}" for i in range(CONFIG.N_TARGETS)]]
    .astype(str)
    .agg("".join, axis=1)
)

# Perform the stratified split using the combined label.
# The index is reset so the positional indices from skf.split are valid .loc labels.
train_df = train_df.reset_index(drop=True)
# Pre-create the column as int: assigning into a missing column via .loc fills the
# gaps with NaN and turns fold ids into floats (2.0, 1.0, ...).
train_df["fold"] = -1
for fold, (_, valid_idx) in enumerate(skf.split(train_df, train_df["final_bin"])):
    train_df.loc[valid_idx, "fold"] = fold

assert (train_df["fold"] >= 0).all(), "every row must be assigned to a fold"

train_df.head()



Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,file_path,jpeg_bytes,bin_0,bin_1,bin_2,bin_3,bin_4,bin_5,final_bin,fold
0,192027691,12.235703,374.466675,62.524445,72.256844,773.592041,33.277779,125,149,136,...,/kaggle/input/planttraits2024/train_images/192...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,2,2,1,4,2,1,221421,2.0
1,195542235,17.270555,90.239998,10.351111,38.22094,859.193298,40.009777,124,144,138,...,/kaggle/input/planttraits2024/train_images/195...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,3,3,2,2,2,3,332223,1.0
2,196639184,14.254504,902.071411,49.642857,17.873655,387.977753,22.807142,107,133,119,...,/kaggle/input/planttraits2024/train_images/196...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,5,1,5,5,2,3,515523,0.0
3,195728812,18.680834,1473.93335,163.100006,45.009758,381.053986,20.436666,120,131,125,...,/kaggle/input/planttraits2024/train_images/195...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,3,2,3,2,1,3,323213,4.0
4,195251545,0.673204,530.088867,50.857777,38.230709,1323.526855,45.891998,91,146,120,...,/kaggle/input/planttraits2024/train_images/195...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,2,3,3,5,4,4,233544,2.0


In [7]:
# Rows of the held-out fold form the validation set; all other rows are used for training
is_valid_fold = train_df["fold"] == CONFIG.VALID_FOLD
train = train_df[~is_valid_fold]
valid = train_df[is_valid_fold]
train[[*CONFIG.TARGET_COLUMNS, "fold"]].describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean,fold
count,44391.0,44391.0,44391.0,44391.0,44391.0,44391.0,44391.0
mean,0.522456,127.1709,24600.4,12.810444,3096.704,493829.3,2.499966
std,0.176001,12379.79,2582362.0,1313.424294,221043.6,102327000.0,1.118037
min,-2.431157,6.78e-05,2.33e-08,9.7e-05,5.5e-07,7.69e-08,1.0
25%,0.410739,10.6356,0.3099867,1.174045,0.5595144,255.2807,1.5
50%,0.509275,15.12003,0.7171231,1.48013,2.529542,725.8266,2.0
75%,0.622427,19.68705,3.574691,1.924787,14.98396,2158.052,3.0
max,4.475172,1504254.0,272049400.0,159759.8977,31065550.0,21559110000.0,4.0


In [8]:
class PlantDataPreProcess:
    """Constants for target pre-processing: outlier quantiles and the log transform."""

    # Rows whose trait value falls outside these quantiles are dropped from training
    lower_quantile, upper_quantile = 0.005, 0.995
    # Transform applied to the skewed traits; staticmethod keeps it a plain callable
    # whether it is looked up on the class or on an instance
    log_transform = staticmethod(np.log10)

In [9]:
# Drop training rows whose trait values are extreme outliers.
# Traits are processed one after another, so each trait's quantiles are computed
# on the rows that survived the previous traits' filters.
print("Num samples before filtering:", len(train))

for trait in CONFIG.TARGET_COLUMNS:
    trait_values = train[trait]
    lower_bound = trait_values.quantile(PlantDataPreProcess.lower_quantile)
    upper_bound = trait_values.quantile(PlantDataPreProcess.upper_quantile)
    # between() is inclusive on both ends by default
    train = train[trait_values.between(lower_bound, upper_bound)]

print("Num samples After filtering:", len(train))
train[CONFIG.TARGET_COLUMNS].describe()

Num samples before filtering: 44391
Num samples After filtering: 41797


Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
count,41797.0,41797.0,41797.0,41797.0,41797.0,41797.0
mean,0.521676,15.823053,3.207305,1.616637,42.244362,1858.297208
std,0.144273,7.598915,5.347872,0.638061,166.646792,3116.155242
min,0.176725,2.830246,0.032735,0.494166,0.006679,9.725925
25%,0.410757,10.792999,0.318085,1.186312,0.58651,267.23733
50%,0.509045,15.129038,0.714284,1.481727,2.534134,729.941079
75%,0.621267,19.51144,3.402814,1.909787,14.288664,2106.94038
max,0.957788,58.287012,32.388908,4.608223,2369.101479,29876.60141


In [10]:
# Log10 transformation for all traits except X4
LOG_FEATURES = ['X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# Explicit copy: writing into an un-copied column selection of `train` raises
# SettingWithCopyWarning and makes it unclear which frame is being modified.
y_train = train[CONFIG.TARGET_COLUMNS].copy()

# One vectorised call over all skewed columns instead of a per-column apply loop
y_train[LOG_FEATURES] = PlantDataPreProcess.log_transform(y_train[LOG_FEATURES])

y_train.describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
count,41797.0,41797.0,41797.0,41797.0,41797.0,41797.0
mean,0.521676,1.147512,0.002364,0.177637,0.456476,2.853331
std,0.144273,0.221711,0.667655,0.163304,1.036252,0.645715
min,0.176725,0.451824,-1.484983,-0.306127,-2.175279,0.987931
25%,0.410757,1.033142,-0.497456,0.074199,-0.231724,2.426897
50%,0.509045,1.179811,-0.146129,0.170768,0.40383,2.863288
75%,0.621267,1.290289,0.531838,0.280985,1.154992,3.323652
max,0.957788,1.765572,1.510396,0.663534,3.374584,4.475331


In [11]:
# Standardise every target column to mean = 0, std dev = 1.
# NOTE: `y_train` is rebound from a DataFrame to the scaled numpy array, so running
# this cell twice would re-fit the scaler on already-scaled values.
from sklearn.preprocessing import StandardScaler

SCALER = StandardScaler().fit(y_train)
y_train = SCALER.transform(y_train)

# Save StandardScaler
# import pickle
# with open('scaler.pkl','wb') as f:
#     pickle.dump(SCALER, f)

# y_train_df = pd.DataFrame(y_train, columns=CONFIG.TARGET_COLUMNS)
# y_train_df.describe()

### SWIN Transformer Data Load

In [15]:
# Previous filtering by HDJOJO
# Keep only data that is in range 0.005 to 0.985
# for column in CONFIG.TARGET_COLUMNS:
#     lower_quantile = train[column].quantile(0.005)
#     upper_quantile = train[column].quantile(0.985)  
#     train = train[(train[column] >= lower_quantile) & (train[column] <= upper_quantile)]

# Step bookkeeping derived from the filtered training set; floor division because
# only full batches are counted per epoch.
CONFIG.N_TRAIN_SAMPLES = len(train)
CONFIG.N_STEPS_PER_EPOCH = (CONFIG.N_TRAIN_SAMPLES // CONFIG.BATCH_SIZE)
CONFIG.N_STEPS = CONFIG.N_STEPS_PER_EPOCH * CONFIG.N_EPOCHS + 1

# Read the test table and keep every image's raw JPEG bytes in memory
test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')
test['file_path'] = test['id'].apply(lambda s: f'/kaggle/input/planttraits2024/test_images/{s}.jpeg')
# Path.read_bytes() closes the file before returning; a bare open(fp, 'rb').read()
# leaves closing each of the handles to the garbage collector.
test['jpeg_bytes'] = test['file_path'].progress_apply(lambda fp: Path(fp).read_bytes())
# test.to_pickle('test.pkl')

print('N_TRAIN_SAMPLES:', len(train), 'N_TEST_SAMPLES:', len(test))

  0%|          | 0/6545 [00:00<?, ?it/s]

In [14]:
# Sanity check: images and targets are paired by position, so both lengths should agree
for length_label, collection in (("Train len:", train), ("y_train len", y_train)):
    print(length_label, len(collection))

Train len: 41797
y_train len 41797


In [16]:
# Per-channel mean / std used to normalize the images.
# These match the widely used ImageNet channel statistics; presumably they are the
# normalization the pretrained backbone expects — confirm against the timm model config.
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])


def _resize_to_model_input():
    """Return a fresh resize step producing IMAGE_SIZE x IMAGE_SIZE images."""
    return A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE)


def _to_model_tensor():
    """Return fresh float-conversion, normalization and tensor-conversion steps."""
    return [
        A.ToFloat(),
        # max_pixel_value=1 because ToFloat runs first (presumably rescaling to [0, 1])
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ]


def _eval_pipeline():
    """Return the augmentation-free pipeline used for validation and test images."""
    return A.Compose([_resize_to_model_input(), *_to_model_tensor()])


# Training pipeline: random augmentations around the resize, then the shared tensor steps
TRAIN_TRANSFORMS = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomSizedCrop(
            [448, 512],
            CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, w2h_ratio=1.0, p=0.75),
        _resize_to_model_input(),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25),
        A.ImageCompression(quality_lower=85, quality_upper=100, p=0.25),
        *_to_model_tensor(),
    ])

VALID_TRANSFORMS = _eval_pipeline()

TEST_TRANSFORMS = _eval_pipeline()

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that decodes in-memory JPEG bytes and pairs them with a label.

    The base class is referenced by its full path: the original
    ``class Dataset(Dataset)`` shadowed the imported torch class, so re-running
    the cell made the class inherit from its own previous definition.

    Args:
        X_jpeg_bytes: indexable collection of encoded image bytes.
        y: indexable collection aligned with ``X_jpeg_bytes`` (targets or sample ids).
        transforms: optional callable invoked as ``transforms(image=...)`` that
            returns a mapping with an ``'image'`` entry. When ``None`` the
            decoded image is returned unchanged.
    """

    def __init__(self, X_jpeg_bytes, y, transforms=None):
        self.X_jpeg_bytes = X_jpeg_bytes
        self.y = y
        self.transforms = transforms

    def __len__(self):
        """Number of samples (one per encoded image)."""
        return len(self.X_jpeg_bytes)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at ``index``."""
        X_sample = imageio.imread(self.X_jpeg_bytes[index])
        # The default transforms=None used to fail with "'NoneType' object is not callable"
        if self.transforms is not None:
            X_sample = self.transforms(image=X_sample)['image']
        y_sample = self.y[index]

        return X_sample, y_sample

# Training set: augmented images paired with the standardised target vectors
train_dataset = Dataset(
    X_jpeg_bytes=train['jpeg_bytes'].values,
    y=y_train,
    transforms=TRAIN_TRANSFORMS,
)

# drop_last=True means only full batches are yielded
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=psutil.cpu_count(),
)

# Validation and test sets carry the sample id in the label slot, not targets
valid_dataset = Dataset(
    X_jpeg_bytes=valid['jpeg_bytes'].values,
    y=valid['id'].values,
    transforms=VALID_TRANSFORMS,
)

test_dataset = Dataset(
    X_jpeg_bytes=test['jpeg_bytes'].values,
    y=test['id'].values,
    transforms=TEST_TRANSFORMS,
)

In [19]:
class Model(nn.Module):
    """Image regressor: a timm backbone whose head outputs one value per target.

    Args:
        backbone_name: timm model name; defaults to ``CONFIG.BACKBONE``.
        n_targets: size of the output head; defaults to ``CONFIG.N_TARGETS``.
        pretrained: forwarded to ``timm.create_model``. Defaults to ``True`` as
            before; pass ``False`` when a checkpoint is loaded right afterwards.
    """

    def __init__(self, backbone_name=None, n_targets=None, pretrained=True):
        super().__init__()
        # Defaults are resolved at call time so later edits to CONFIG are honoured
        self.backbone = timm.create_model(
                CONFIG.BACKBONE if backbone_name is None else backbone_name,
                num_classes=CONFIG.N_TARGETS if n_targets is None else n_targets,
                pretrained=pretrained)

    def forward(self, inputs):
        """Run ``inputs`` through the backbone and return its output unchanged."""
        return self.backbone(inputs)


# Build the model and place it on the device the inputs are moved to
# (the original `model = model` was a no-op).
model = Model().to(CONFIG.DEVICE)

PATH = '/kaggle/input/simple-swin-v1-1-best-model/simple_swin_v1.1_best_model.pth'

# Load the saved weights straight onto the configured device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the torch version supports it).
checkpoint = torch.load(PATH, map_location=torch.device(CONFIG.DEVICE))
model.load_state_dict(checkpoint)
# print(model)

<All keys matched successfully>

## Attention Maps
Inspired by Vision Transformer visualization on MNIST: https://github.com/mashaan14/VisionTransformer-MNIST/blob/main/VisionTransformer_MNIST.ipynb

In [28]:
# Compute the self-attention map for the first layer's window attention block
# Input: sample index
# Output: sample_id, original image, attention map image

def get_self_attention_map_layer0(index, dataset=test_dataset, attn_index=-2):
    """Build a query-key similarity map from the first attention block's qkv layer.

    The sample is passed through ``patch_embed``, the first stage's ``downsample``
    and the ``qkv`` linear layer of that stage's first block. One row of the
    resulting ``q @ k^T`` matrix is reshaped onto the patch grid.

    NOTE(review): only the ``qkv`` linear layer is applied. No window partitioning,
    head split, scaling, bias or softmax is performed here, so the map is a proxy
    for the block's real attention weights — confirm against the timm
    implementation if exact weights are needed.

    Args:
        index: position of the sample inside ``dataset``.
        dataset: object exposing ``X_jpeg_bytes`` and returning
            ``(model_input, sample_id)`` when indexed.
        attn_index: flattened patch position whose similarity row is returned
            (``-2`` is the second-to-last patch).

    Returns:
        ``(sample_id, original_image, self_attn_image)`` where ``original_image``
        is the decoded image and ``self_attn_image`` is a 2-D numpy array over
        the patch grid.
    """
    # Original decoded image, kept only for plotting
    test_sample_image = imageio.imread(dataset.X_jpeg_bytes[index])
    # Index the dataset once: every access decodes and transforms the image again
    test_sample_input, test_sample_id = dataset[index]
    # Add the batch dimension: (3, H, W) -> (1, 3, H, W)
    test_sample_input = test_sample_input.unsqueeze(0).to(CONFIG.DEVICE)

    first_stage = model.backbone.layers[0]
    # Inference only: no autograd graph is needed for a visualisation
    with torch.no_grad():
        # e.g. (1, 3, 256, 256) -> (1, 64, 64, 96) for the configured backbone
        patch_embed = model.backbone.patch_embed(test_sample_input)
        # Per the original notes the first stage's downsample is Identity()
        downsampled = first_stage.downsample(patch_embed)
        # The last dimension holds q, k and v side by side (3 * embed_dim features)
        test_sample_qkv = first_stage.blocks[0].attn.qkv(downsampled)

        # Derive sizes from the tensor instead of hard-coding 64 / 96
        embed_dim = test_sample_qkv.shape[-1] // 3
        qkv = test_sample_qkv.reshape(-1, 3, embed_dim)  # (n_patches, 3, embed_dim)
        n_patches = qkv.shape[0]
        if test_sample_qkv.dim() == 4:
            grid_h, grid_w = test_sample_qkv.shape[1], test_sample_qkv.shape[2]
        else:
            # Flattened token layout: assume a square patch grid, as the original did
            grid_h = grid_w = int(round(n_patches ** 0.5))

        q = qkv[:, 0]  # (n_patches, embed_dim)
        k = qkv[:, 1]

        # Similarity of every patch with every other patch
        attn_matrix = q @ k.transpose(0, 1)  # (n_patches, n_patches)

    self_attn_image = attn_matrix[attn_index, :].reshape(grid_h, grid_w).cpu().numpy()

    return test_sample_id, test_sample_image, self_attn_image

In [156]:
# Folder to save images
SELF_ATTN_FOLDER = 'test_self_attn_maps'
# exist_ok=True makes the cell safe to re-run without a separate listdir() check
os.makedirs(SELF_ATTN_FOLDER, exist_ok=True)
os.listdir()

['.virtual_documents', 'test_self_attn_maps']

### Save Test Attention Maps

In [158]:
model.eval()

# Number of test samples to visualise (clamped to the dataset size below)
N_TEST_ATTENTION_MAPS = 50
attn_index = -2  # self-attention for second-to-last pixel in image

for i in range(min(N_TEST_ATTENTION_MAPS, len(test_dataset))):
    # Decide before doing any work: skip samples whose figure is already on disk.
    # The dataset's label slot holds the sample id, which names the file.
    out_path = os.path.join(SELF_ATTN_FOLDER, f'swin_self_attn_map_{test_dataset.y[i]}.png')
    if os.path.exists(out_path):
        continue

    # Visualize attention weights
    test_sample_id, test_sample_image, self_attn_image = get_self_attention_map_layer0(i, test_dataset, attn_index)

    fig, ax = plt.subplots(1, 2, figsize=(12,6))
    ax[0].imshow(test_sample_image)
    ax[0].set_title('Plant Test Sample')

    ax[1].imshow(self_attn_image)
    ax[1].set_title('Swin Transformer First Shifted Window Attention Map')

    fig.savefig(out_path)
    # Close explicitly so figures do not accumulate in memory across iterations
    plt.close(fig)

### Visualize Validation Attention Map and R2 Scores

In [109]:
# Lower bound that keeps the R2 denominator away from zero
EPS = torch.tensor([1e-6]).to(CONFIG.DEVICE)
# Per-target mean of the scaled training targets (close to zero after standardisation)
Y_MEAN = torch.tensor(y_train).mean(dim=0).to(CONFIG.DEVICE)
print(Y_MEAN)

# Short trait names for plot titles: target column names without the "_mean" suffix
TRAIT_COLS = [column.removesuffix("_mean") for column in CONFIG.TARGET_COLUMNS]
print(TRAIT_COLS)

tensor([-2.7200e-16,  3.0642e-16,  6.7999e-18, -1.6320e-17, -2.6690e-17,
         6.3239e-17], dtype=torch.float64)
['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']


In [167]:
# Folder to save images
VALID_SELF_ATTN_FOLDER = 'valid_self_attn_maps'
# exist_ok=True makes the cell safe to re-run without a separate listdir() check
os.makedirs(VALID_SELF_ATTN_FOLDER, exist_ok=True)
os.listdir()

['.virtual_documents', 'valid_self_attn_maps', 'test_self_attn_maps']

In [168]:
model.eval()  # Ensure outputs are consistent

# Number of validation samples to visualise (clamped to the dataset size below)
N_VALID_ATTENTION_MAPS = 50
attn_index = -2  # self-attention for second-to-last pixel in image

for j in range(min(N_VALID_ATTENTION_MAPS, len(valid_dataset))):
    # Skip samples whose figure is already on disk before doing any work.
    # The dataset's label slot holds the sample id, which names the file.
    out_path = os.path.join(VALID_SELF_ATTN_FOLDER, f'valid_swin_self_attn_map_{valid_dataset.y[j]}.png')
    if os.path.exists(out_path):
        continue

    # Get Validation Attention
    valid_sample_id, valid_sample_image, self_attn_image = get_self_attention_map_layer0(j, valid_dataset, attn_index)

    # Run valid sample image through Swin
    with torch.no_grad():
        y_valid_pred = model(valid_dataset[j][0].unsqueeze(0).to(CONFIG.DEVICE))
    # Same device and dtype as Y_MEAN so the arithmetic below cannot mix devices
    y_valid_pred = y_valid_pred.detach().to(device=Y_MEAN.device, dtype=torch.float64)

    # The model is fitted on log10-transformed, standardised targets and Y_MEAN lives
    # in that space too, so the raw trait values must be mapped into it first.
    # Comparing raw values with scaled predictions made the scores meaningless.
    y_true_df = valid.loc[valid['id'] == valid_sample_id, CONFIG.TARGET_COLUMNS].copy()
    y_true_df[LOG_FEATURES] = PlantDataPreProcess.log_transform(y_true_df[LOG_FEATURES])
    y_valid_true = torch.tensor(SCALER.transform(y_true_df), dtype=torch.float64, device=Y_MEAN.device)

    # Compute per-trait R2 for this sample, relative to the training-set mean
    ss_total = torch.sum((y_valid_true - Y_MEAN)**2, dim=0)
    ss_total = torch.maximum(ss_total, EPS)  # guard against division by zero

    ss_res = torch.sum((y_valid_true - y_valid_pred)**2, dim=0)
    r2 = (1 - ss_res / ss_total).tolist()
    avg_r2 = sum(r2) / len(r2)

    # Title: average score on the first line, then the traits split over two lines
    half = CONFIG.N_TARGETS // 2
    title_list1 = [f"{TRAIT_COLS[i]}: {r2[i]:.4f}" for i in range(half)]
    title_list2 = [f"{TRAIT_COLS[i]}: {r2[i]:.4f}" for i in range(half, CONFIG.N_TARGETS)]
    title = f"Validation Sample R2 Scores. Average R2 = {avg_r2:.4f} \n" + ",   ".join(title_list1) + "\n" + ",   ".join(title_list2) + "\n"

    # Plot
    fig, ax = plt.subplots(1, 2, figsize=(12,6))
    fig.suptitle(title, fontsize=16)
    fig.subplots_adjust(top=0.8)  # leave room for the three-line title
    ax[0].imshow(valid_sample_image)
    ax[0].set_title('Plant Validation Sample')  # was mislabelled as a test sample

    ax[1].imshow(self_attn_image)
    ax[1].set_title('Swin Transformer First Shifted Window Attention Map')

    # Save image
    fig.savefig(out_path)
    # Close explicitly so figures do not accumulate in memory across iterations
    plt.close(fig)

## Training, Valid, Testing components


In [None]:
# def get_lr_scheduler(optimizer):
#     return torch.optim.lr_scheduler.OneCycleLR(
#         optimizer=optimizer,
#         max_lr=CONFIG.LR_MAX,
#         total_steps=CONFIG.N_STEPS,
#         pct_start=0.1,
#         anneal_strategy='cos',
#         div_factor=1e1,
#         final_div_factor=1e1,
#     )

# class AverageMeter(object):
#     def __init__(self):
#         self.reset()

#     def reset(self):
#         self.avg = 0
#         self.sum = 0
#         self.count = 0

#     def update(self, val):
#         self.sum += val.sum()
#         self.count += val.numel()
#         self.avg = self.sum / self.count

# MAE = torchmetrics.regression.MeanAbsoluteError().to('cuda')
# R2 = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to('cuda')
# LOSS = AverageMeter()

# Y_MEAN = torch.tensor(y_train).mean(dim=0).to('cuda')
# EPS = torch.tensor([1e-6]).to('cuda')

# def r2_loss(y_pred, y_true):
#     ss_res = torch.sum((y_true - y_pred)**2, dim=0)
#     ss_total = torch.sum((y_true - Y_MEAN)**2, dim=0)
#     ss_total = torch.maximum(ss_total, EPS)
#     r2 = torch.mean(ss_res / ss_total)
#     return r2

# # How is this R2 Loss?
# LOSS_FN = nn.SmoothL1Loss() # r2_loss

# optimizer = torch.optim.AdamW(
#     params=model.parameters(),
#     lr=CONFIG.LR_MAX,
#     weight_decay=CONFIG.WEIGHT_DECAY,
# )

# LR_SCHEDULER = get_lr_scheduler(optimizer)

# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# metrics = {
#     'epoch': [],
#     'loss': [],
#     'mae': [],
#     'r2': [],
#     'lr': [],
#     'training_time': [],
#     'num_params': count_parameters(model),
#     'valid_r2': [],
#     'valid_mae': [],
#     'valid_r2_loss': [],
#     'valid_sl1_loss': []
# }

In [None]:
# best_valid_r2 = -np.inf 

# print("Start Training:")
# for epoch in range(CONFIG.N_EPOCHS):
#     epoch_start_time = time.time()
#     MAE.reset()
#     R2.reset()
#     LOSS.reset()
#     model.train()
    
#     epoch_loss = 0
#     epoch_mae = 0
#     epoch_r2 = 0
        
#     for step, (X_batch, y_true) in enumerate(train_dataloader):
#         X_batch = X_batch.to('cuda')
#         y_true = y_true.to('cuda')
#         t_start = time.perf_counter_ns()
#         y_pred = model(X_batch)
#         loss = LOSS_FN(y_pred, y_true)
#         LOSS.update(loss)
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         LR_SCHEDULER.step()
#         MAE.update(y_pred, y_true)
#         R2.update(y_pred, y_true)
        
#         epoch_loss += loss.item()
#         epoch_mae += MAE.compute().item()
#         epoch_r2 += R2.compute().item()
            
#         if not CONFIG.IS_INTERACTIVE and (step+1) == CONFIG.N_STEPS_PER_EPOCH:
#             print(
#                 f'EPOCH {epoch+1:02d}, {step+1:04d}/{CONFIG.N_STEPS_PER_EPOCH} | ' + 
#                 f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
#                 f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
#             )
#         elif CONFIG.IS_INTERACTIVE:
#             print(
#                 f'\rEPOCH {epoch+1:02d}, {step+1:04d}/{CONFIG.N_STEPS_PER_EPOCH} | ' + 
#                 f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
#                 f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
#                 end='\n' if (step + 1) == CONFIG.N_STEPS_PER_EPOCH else '', flush=True,
#             )
            
#     epoch_training_time = time.time() - epoch_start_time
    
#     # Validate on validation set
#     VALID_ROWS = []
#     model.eval()

#     for X_sample_valid, valid_id in tqdm(valid_dataset):
#         with torch.no_grad():
#             y_pred = model(X_sample_valid.unsqueeze(0).to('cuda')).detach().cpu().numpy()

#         y_pred = SCALER.inverse_transform(y_pred).squeeze()
#         row = {'id': valid_id}

#         for k, v in zip(CONFIG.TARGET_COLUMNS, y_pred):
#             if k in LOG_FEATURES:
#                 row[k] = 10 ** v
#             else:
#                 row[k] = v

#         VALID_ROWS.append(row)

#     valid_predict_df = pd.DataFrame(VALID_ROWS)
#     valid_y_true = torch.tensor(valid[CONFIG.TARGET_COLUMNS].to_numpy()).to('cuda')
#     valid_y_pred = torch.tensor(valid_predict_df[CONFIG.TARGET_COLUMNS].to_numpy()).to('cuda')

#     MAE_valid = torchmetrics.regression.MeanAbsoluteError().to('cuda')
#     R2_valid = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to('cuda')
    
#     with torch.no_grad():
#         valid_r2 = R2_valid(valid_y_pred, valid_y_true).item()
#         valid_mae = MAE_valid(valid_y_pred, valid_y_true).item()
#         valid_r2_loss = r2_loss(valid_y_pred, valid_y_true).item()
#         valid_sl1_loss = LOSS_FN(valid_y_pred, valid_y_true).item()
        
        
#         # Log metrics for this epoch
#         metrics['epoch'].append(epoch + 1)
#         metrics['loss'].append(epoch_loss / len(train_dataloader))
#         metrics['mae'].append(epoch_mae / len(train_dataloader))
#         metrics['r2'].append(epoch_r2 / len(train_dataloader))
#         metrics['lr'].append(LR_SCHEDULER.get_last_lr()[0])
#         metrics['training_time'].append(epoch_training_time)
#         metrics['valid_r2'].append(valid_r2)
#         metrics['valid_mae'].append(valid_mae)
#         metrics['valid_r2_loss'].append(valid_r2_loss)
#         metrics['valid_sl1_loss'].append(valid_sl1_loss)
        
#         # Save the model if validation R2 improves
#         if valid_r2 > best_valid_r2:
#             best_valid_r2 = valid_r2
#             torch.save(model.state_dict(), 'best_model.pth')
#             print(f'Saved Best Model at Epoch {epoch + 1} with R2: {valid_r2:.4f}')

# # Save metrics to a file
# import json
# with open('metrics3.json', 'w') as f:
#     json.dump(metrics, f)

In [None]:
# import json
# import matplotlib.pyplot as plt

# # Load the metrics
# with open('metrics3.json', 'r') as f:
#     metrics = json.load(f)

# # Plotting training and validation metrics
# fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# # Loss plot
# axes[0, 0].plot(metrics['epoch'], metrics['loss'], label='Train Loss')
# axes[0, 0].plot(metrics['epoch'], metrics['valid_sl1_loss'], label='Valid SL1 Loss')
# axes[0, 0].set_title('Loss')
# axes[0, 0].set_xlabel('Epoch')
# axes[0, 0].set_ylabel('Loss')
# axes[0, 0].legend()

# # MAE plot
# axes[0, 1].plot(metrics['epoch'], metrics['mae'], label='Train MAE')
# axes[0, 1].plot(metrics['epoch'], metrics['valid_mae'], label='Valid MAE')
# axes[0, 1].set_title('Mean Absolute Error (MAE)')
# axes[0, 1].set_xlabel('Epoch')
# axes[0, 1].set_ylabel('MAE')
# axes[0, 1].legend()

# # R2 plot
# axes[1, 0].plot(metrics['epoch'], metrics['r2'], label='Train R2')
# axes[1, 0].plot(metrics['epoch'], metrics['valid_r2'], label='Valid R2')
# axes[1, 0].set_title('R2 Score')
# axes[1, 0].set_xlabel('Epoch')
# axes[1, 0].set_ylabel('R2 Score')
# axes[1, 0].legend()

# # Learning rate plot
# axes[1, 1].plot(metrics['epoch'], metrics['lr'], label='Learning Rate')
# axes[1, 1].set_title('Learning Rate')
# axes[1, 1].set_xlabel('Epoch')
# axes[1, 1].set_ylabel('Learning Rate')
# axes[1, 1].legend()

# plt.tight_layout()
# plt.show()

In [None]:
# # Validate on validation set
# VALID_ROWS = []
# model.eval()

# for X_sample_valid, valid_id in tqdm(valid_dataset):
#     with torch.no_grad():
#         y_pred = model(X_sample_valid.unsqueeze(0).to('cuda')).detach().cpu().numpy()
    
#     y_pred = SCALER.inverse_transform(y_pred).squeeze()
#     row = {'id': valid_id}
    
#     for k, v in zip(CONFIG.TARGET_COLUMNS, y_pred):
#         if k in LOG_FEATURES:
#             row[k] = 10 ** v
#         else:
#             row[k] = v

#     VALID_ROWS.append(row)
    
# valid_predict_df = pd.DataFrame(VALID_ROWS)
# print(valid_predict_df.head())

In [None]:
# # valid_y_true
# print(valid[['id'] + CONFIG.TARGET_COLUMNS].head())
# valid_y_true = torch.tensor(valid[CONFIG.TARGET_COLUMNS].to_numpy()).to('cuda')

In [None]:
# # Evaluate valid scores
# valid_y_pred = torch.tensor(valid_predict_df[CONFIG.TARGET_COLUMNS].to_numpy()).to('cuda')

# with torch.no_grad():
#     # Calculate R2 Loss
#     print("Validation R2 Loss (using r2_loss):", r2_loss(valid_y_pred, valid_y_true))

#     # Loss function (smooth L1 loss)
#     valid_loss = LOSS_FN(valid_y_pred, valid_y_true)
#     print("Validation loss (Smooth L1 loss): ", valid_loss)

In [None]:
# VALID_Y_MEAN = torch.tensor(y_train).mean(dim=0).to('cuda')

# def r2_loss_valid(y_pred, y_true):
#     ss_res = torch.sum((y_true - y_pred)**2, dim=0)
#     ss_total = torch.sum((y_true - VALID_Y_MEAN)**2, dim=0)
#     ss_total = torch.maximum(ss_total, torch.tensor([1e-6]))
#     r2 = torch.mean(ss_res / ss_total)
#     return r2

# print("R2 Score valid (using r2_loss_valid):", 1 - r2_loss_valid(valid_y_pred, valid_y_true))

In [None]:
# # Scratch code to test R2 loss: random produced around R2 score = -92
# # v_len = len(valid_y_true)
# # train_y_true = torch.tensor(train[0:v_len][CONFIG.TARGET_COLUMNS].to_numpy())
# # print("Train and valid R2 score:", 1 - r2_loss_valid(valid_y_true, train_y_true))

# MAE_valid = torchmetrics.regression.MeanAbsoluteError().to('cuda')
# R2_valid = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to('cuda')

# print("Torch R2 valid:", R2_valid(valid_y_pred, valid_y_true))
# print("Torch MAE valid:", MAE_valid(valid_y_pred, valid_y_true))

In [None]:
# # Predict on test set
# SUBMISSION_ROWS = []
# model.eval()

# for X_sample_test, test_id in tqdm(test_dataset):
#     with torch.no_grad():
#         y_pred = model(X_sample_test.unsqueeze(0).to('cuda')).detach().cpu().numpy()
    
#     y_pred = SCALER.inverse_transform(y_pred).squeeze()
#     row = {'id': test_id}
    
#     for k, v in zip(CONFIG.TARGET_COLUMNS, y_pred):
#         if k in LOG_FEATURES:
#             row[k.replace('_mean', '')] = 10 ** v
#         else:
#             row[k.replace('_mean', '')] = v

#     SUBMISSION_ROWS.append(row)
    
# submission_df = pd.DataFrame(SUBMISSION_ROWS)
# print(submission_df.head())
# submission_df.to_csv('submission.csv', index=False)
# print("Submit!")