## Training Model from Scratch
Lisa Fung, 05/14/2024

Following Andrej Karpathy's "A Recipe for Training Neural Networks": https://karpathy.github.io/2019/04/25/recipe/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.preprocessing import StandardScaler

from torchvision.io import read_image

In [2]:
# Root directory of the competition data
BASE_PATH = "/kaggle/input/planttraits2024"

# Read the training table (ancillary geodata) and attach, in the same step,
# the path of each row's JPEG, which is derived from the row's `id`.
df = pd.read_csv(f'{BASE_PATH}/train.csv').assign(
    image_path=lambda frame: f'{BASE_PATH}/train_images/' + frame['id'].astype(str) + '.jpeg'
)

In [3]:
# Borrowed from Plant Traits Sample Notebook
class CFG:
    """Notebook-wide configuration constants, accessed as `CFG.<name>`.

    `num_classes` and `aux_num_classes` are derived from the name lists so the
    counts cannot drift from the lists they describe.
    """
    verbose = 1  # Verbosity
    seed = 42  # Random seed
    preset = "efficientnetv2_b2_imagenet"  # Name of pretrained classifier
    image_size = [224, 224]  # Input image size
    epochs = 12 # Training epochs
    batch_size = 96  # Batch size
    lr_mode = "step" # LR scheduler mode from one of "cos", "step", "exp"
    drop_remainder = True  # Drop incomplete batches
    num_folds = 5 # Number of folds to split the dataset
    fold = 0 # Which fold to set as validation data
    # Target columns: the per-trait mean values
    class_names = ['X4_mean', 'X11_mean', 'X18_mean',
                   'X26_mean', 'X50_mean', 'X3112_mean',]
    # Auxiliary columns: same trait ids with the "mean" suffix swapped for "sd"
    aux_class_names = [name.replace("mean", "sd") for name in class_names]
    num_classes = len(class_names)  # Number of classes in the dataset
    aux_num_classes = len(aux_class_names)

## Train/Validation Split
Borrowed from Plant Traits Sample Notebook

In [4]:
from sklearn.model_selection import StratifiedKFold

# Shuffle with the notebook-wide seed from CFG rather than a second hard-coded value
skf = StratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=CFG.seed)

# Create a separate bin column for each trait
for i, trait in enumerate(CFG.class_names):

    # Determine the bin edges dynamically based on the distribution of the trait:
    # evenly spaced percentiles, CFG.num_folds + 1 edges in total
    bin_edges = np.percentile(df[trait], np.linspace(0, 100, CFG.num_folds + 1))
    df[f"bin_{i}"] = np.digitize(df[trait], bin_edges)

# Concatenate the per-trait bins into one string label per row
df["final_bin"] = (
    df[[f"bin_{i}" for i in range(len(CFG.class_names))]]
    .astype(str)
    .agg("".join, axis=1)
)

# Perform the stratified split using the final bin.
# The index is reset first so the positional indices yielded by skf.split
# are also valid labels for df.loc.
df = df.reset_index(drop=True)
for fold, (train_idx, valid_idx) in enumerate(skf.split(df, df["final_bin"])):
    # Each row is tagged with the fold in which it is part of the validation split
    df.loc[valid_idx, "fold"] = fold



In [5]:
# CFG.fold selects the validation fold; every other fold is training data.
# .copy() makes both subsets independent frames, so later edits to them do not
# raise SettingWithCopyWarning or depend on df.
train_df = df[df["fold"] != CFG.fold].copy()
valid_df = df[df["fold"] == CFG.fold].copy()
train_df[CFG.class_names + ["fold"]].describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean,fold
count,44391.0,44391.0,44391.0,44391.0,44391.0,44391.0,44391.0
mean,0.522456,127.1709,24600.4,3096.704,12.810444,493829.3,2.499966
std,0.176001,12379.79,2582362.0,221043.6,1313.424294,102327000.0,1.118037
min,-2.431157,6.78e-05,2.33e-08,5.5e-07,9.7e-05,7.69e-08,1.0
25%,0.410739,10.6356,0.3099867,0.5595144,1.174045,255.2807,1.5
50%,0.509275,15.12003,0.7171231,2.529542,1.48013,725.8266,2.0
75%,0.622427,19.68705,3.574691,14.98396,1.924787,2158.052,3.0
max,4.475172,1504254.0,272049400.0,31065550.0,159759.8977,21559110000.0,4.0


## Pre-Process Data
For each trait, we apply a combination of the following pre-processing techniques depending on the specific distribution of the trait's values. The pre-processing ONLY APPLIES to **training data** (NOT validation data), so we ignore fold 0.
1. Remove outliers by keeping only the samples whose value lies within the quantile range [0.005, 0.995], bounds included (for all traits)
2. Apply a log base 10 transformation to right-skewed data (for all traits except X4)
3. Normalize the data to have mean = 0 and standard deviation = 1 (for all traits)

In [6]:
class PlantDataPreProcess:
    """Shared settings for the label pre-processing steps."""

    # Transformation applied to right-skewed traits
    log_transform = np.log10

    # Quantile levels delimiting the range of values kept by outlier removal
    upper_quantile = 0.995
    lower_quantile = 0.005

In [7]:
# Outlier removal: keep only rows inside the configured quantile range of every trait
print("Num samples before filering:", len(train_df))

for trait in CFG.class_names:
    # Both bounds come from the rows still present, i.e. after the filters
    # of the traits handled in earlier iterations have been applied.
    lower_bound, upper_bound = train_df[trait].quantile(
        [PlantDataPreProcess.lower_quantile, PlantDataPreProcess.upper_quantile]
    )
    # between() is inclusive on both ends by default
    train_df = train_df[train_df[trait].between(lower_bound, upper_bound)]

print("Num samples After filtering:", len(train_df))
train_df[CFG.class_names].describe()

Num samples before filering: 44391
Num samples After filtering: 41799


Unnamed: 0,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean
count,41799.0,41799.0,41799.0,41799.0,41799.0,41799.0
mean,0.521677,15.822802,3.207161,42.242341,1.616606,1858.228362
std,0.144269,7.59882,5.347785,166.643061,0.638061,3116.096582
min,0.176725,2.830246,0.032735,0.006451,0.494166,9.725925
25%,0.410762,10.792842,0.318074,0.585903,1.186274,267.23733
50%,0.509045,15.127512,0.714281,2.534105,1.481558,729.941079
75%,0.621267,19.509902,3.402668,14.288584,1.909773,2106.9175
max,0.957788,58.287012,32.388908,2369.101479,4.608223,29876.60141


In [8]:
# Log10 transformation for all traits except X4
# Explicit copy: y_train is modified below, so it must be its own frame and not
# a selection tied to train_df (avoids SettingWithCopyWarning).
y_train = train_df[CFG.class_names].copy()

# X4 is the first entry of CFG.class_names; every later trait is transformed.
skewed_traits = CFG.class_names[1:]
# One vectorized call over all skewed columns instead of a per-column apply
y_train[skewed_traits] = PlantDataPreProcess.log_transform(y_train[skewed_traits])

y_train.describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean
count,41799.0,41799.0,41799.0,41799.0,41799.0,41799.0
mean,0.521677,1.147506,0.002329,0.456349,0.177628,2.85332
std,0.144269,0.221707,0.667658,1.036389,0.163305,0.645702
min,0.176725,0.451824,-1.484983,-2.190374,-0.306127,0.987931
25%,0.410762,1.033136,-0.497472,-0.232174,0.074185,2.426897
50%,0.509045,1.179768,-0.146131,0.403825,0.170719,2.863288
75%,0.621267,1.290255,0.53182,1.154989,0.280982,3.323648
max,0.957788,1.765572,1.510396,3.374584,0.663534,4.475331


In [9]:
# Normalize to mean = 0, std dev = 1
# StandardScaler is already imported in the notebook's import cell, so it is
# not imported again here.

# SCALER is a module-level name, so the fitted scaler stays available after this cell.
SCALER = StandardScaler()
# fit_transform's result replaces the DataFrame previously bound to y_train
y_train = SCALER.fit_transform(y_train)

# Wrap the scaled values in a DataFrame (default integer index) to inspect them
y_train_df = pd.DataFrame(y_train, columns=CFG.class_names)
y_train_df.describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean
count,41799.0,41799.0,41799.0,41799.0,41799.0,41799.0
mean,-5.4906890000000007e-17,-3.829883e-16,-3.3573100000000005e-17,1.6999039999999997e-19,1.031842e-16,-4.310956e-16
std,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012
min,-2.391054,-3.137878,-2.227683,-2.553825,-2.962323,-2.888967
25%,-0.7688166,-0.5158674,-0.7485979,-0.6643567,-0.6334448,-0.66041
50%,-0.08755688,0.1455159,-0.2223625,-0.05068134,-0.04231292,0.01543735
75%,0.690315,0.643871,0.7930659,0.6741179,0.6328929,0.7284061
max,3.022934,2.78779,2.25877,2.815806,2.975485,2.512043


## View Data

In [None]:
# Only the last expression of a cell is displayed automatically, so the
# column list is printed explicitly instead of being silently discarded.
print(df.columns.to_list())
df.describe()

In [None]:
# Get images
def decode_image(inp, image_row):
    """Read the JPEG referenced by one row of `inp`.

    Args:
        inp: DataFrame with an "image_path" column.
        image_row: Index label of the row to load (looked up with `.loc`).

    Returns:
        The tensor produced by `read_image`, with its dimensions reordered by
        `permute(1, 2, 0)` (presumably channels-first to channels-last, the
        layout `plt.imshow` accepts; confirm against torchvision).
    """
    return read_image(inp.loc[image_row, "image_path"]).permute(1, 2, 0)

#### Check Annual Mean Temperature below 0 degrees C

In [None]:
# Index labels of all rows whose annual mean temperature is below 0
below_0_temp_idx = df[df['WORLDCLIM_BIO1_annual_mean_temperature'].lt(0)].index
len(below_0_temp_idx)

In [None]:
# Check images: up to 20 samples on a 4 x 5 grid
fig = plt.figure()

# Cap at the number of matching rows so a short selection cannot raise IndexError
for i in range(min(20, len(below_0_temp_idx))):
    ax = fig.add_subplot(4, 5, i+1)
    ax.imshow(decode_image(df, below_0_temp_idx[i]))

### Check Variation in Labels (Plant Traits)

In [None]:
# Summary statistics of every label column, i.e. each column whose name
# starts with "X" and ends with "_mean".
# regex guide: https://www.w3schools.com/python/python_regex.asp
(
    df
    .filter(regex="^X.+_mean$")
    .describe()
)

#### Check X11 (Leaf area per leaf dry mass) outliers

In [None]:
# X11: Leaf area per leaf dry mass
# Collect the index labels of the rows at both extremes of X11 and report their counts.
x11_greater_100_idx = df[df['X11_mean'].gt(100)].index
x11_less_01_idx = df[df['X11_mean'].lt(0.1)].index

print("Total train samples:", len(df))
print("X11 > 100:", len(x11_greater_100_idx))
print("X11 < 0.1:", len(x11_less_01_idx))

In [None]:
# Check images: up to 20 samples on a 4 x 5 grid
fig = plt.figure()

# Cap at the number of matching rows so a short selection cannot raise IndexError
for i in range(min(20, len(x11_greater_100_idx))):
    ax = fig.add_subplot(4, 5, i+1)
    ax.imshow(decode_image(df, x11_greater_100_idx[i]))

# Figure-level title: plt.title would only label the last subplot drawn
fig.suptitle("X11 greater than 100")
# Big leaves, very little weight

In [None]:
# Check images: up to 20 samples on a 4 x 5 grid
fig = plt.figure()

# Cap at the number of matching rows so a short selection cannot raise IndexError
for i in range(min(20, len(x11_less_01_idx))):
    ax = fig.add_subplot(4, 5, i+1)
    ax.imshow(decode_image(df, x11_less_01_idx[i]))

# Figure-level title: plt.title would only label the last subplot drawn
fig.suptitle("X11 less than 0.1")
# Smaller, heavier leaves

### Visualize Distributions of Each Trait

In [None]:
# Inspect one trait: raw statistics, quantile bounds, then the log10 distribution
# of the values left after outlier removal. Change `i` to look at another trait.
i = 2
col = CFG.class_names[i]
# Same quantile levels as the pre-processing step, taken from one place
lower = PlantDataPreProcess.lower_quantile
upper = PlantDataPreProcess.upper_quantile
lower_q = df[col].quantile(lower)
upper_q = df[col].quantile(upper)
unique_values = df[col].unique()  # computed once, used for both reports below

print(f"Original data for trait {i}")
print(df[col].describe())
print()
print(f"Lower quantile ({lower}):", lower_q)
print(f"Upper quantile ({upper}):", upper_q)
print("Unique values:", len(unique_values))
print("Largest unique values:", sorted(unique_values, reverse=True)[:5])
print()
print(f"Outliers removed data for trait {i}")
# Keep the values inside the quantile range (bounds included), then log10-transform
filtered_df = df.loc[df[col].between(lower_q, upper_q), col]
transform_filtered_df = np.log10(filtered_df)
print(transform_filtered_df.describe())
# Draw the histogram directly; wrapping the call in print() only printed the Axes repr
transform_filtered_df.hist()
plt.show()