In [1]:
import os.path
import time
from dataclasses import dataclass
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

import torch
import torch.nn as nn
# import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader

# from IPython.display import display, clear_output

In [2]:
# ! pip install -q kaggle
# from google.colab import files
# files.upload()

In [None]:
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download -c cs-480-2024-spring
# ! mkdir data
# ! unzip -q cs-480-2024-spring.zip

Downloading cs-480-2024-spring.zip to /content
 99% 284M/287M [00:04<00:00, 78.0MB/s]
100% 287M/287M [00:04<00:00, 65.6MB/s]


In [3]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Report which of the two was selected.
if device == torch.device("cuda:0"):
    print('On GPU')
else:
    print('On CPU')

On CPU


In [4]:
@dataclass
class Hyperparameter:
    """Tunable settings shared by the whole notebook.

    Declared as a dataclass so that a variant configuration can be built with
    keyword overrides (e.g. ``Hyperparameter(batch_size=32)``) while the
    defaults below stay available both on instances and on the class itself.
    """
    num_predictions: int    = 6         # width of the model's final output layer

    # hyperparams
    batch_size: int         = 64        # batch size of every DataLoader
    num_epochs: int         = 8         # default number of training epochs

    vgg_output_size: int    = 4096      # output width of the image branch (VGG11)
    fc_output_size: int     = 256       # output width of the tabular branch (FC)

    # bookkeeping
    batch_report_gap: int   = 96        # print a progress line every this many batches
    sav_model_epoch: bool   = True      # NOTE(review): not read by any code visible in this notebook
    vals_per_epoch: int     = 3         # validation points (and scheduler steps) per epoch

    # optimizer
    learning_rate: float    = 0.0005    # initial learning rate handed to the optimizer
    lr_decay: float         = 0.9       # multiplicative decay applied per scheduler step

hp = Hyperparameter()

# Data processing

In [5]:
class CustomDataset(Dataset):
    """Dataset yielding ``(image, features, target)`` triples.

    Args:
        img_id: image ids, one per sample; sample ``i`` is read from
            ``<img_dir>/<id>.jpeg``. May be a pandas Series (looked up by
            position, whatever its index labels are) or any plain sequence.
        data: per-sample feature rows, indexed by position.
        target: per-sample target rows, indexed by position.
        img_dir: directory containing the image files.
        transform: optional callable applied to the loaded image array.
    """

    def __init__(self, img_id, data, target, img_dir, transform=None):
        self.img_id = img_id
        self.data_frame = data
        self.target = target
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from the feature rows."""
        return len(self.data_frame)

    def _image_path(self, idx):
        """Return the file path of the image belonging to sample ``idx``."""
        ids = self.img_id
        # A sliced pandas Series keeps its original labels, so ``ids[idx]`` would be a
        # label lookup; ``iloc`` keeps the id aligned with the positional feature rows.
        image_id = ids.iloc[idx] if hasattr(ids, 'iloc') else ids[idx]
        return os.path.join(self.img_dir, f"{image_id}.jpeg")

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, features, target)``."""
        raw = plt.imread(self._image_path(idx))
        image = np.asarray(raw, dtype=np.float32)
        if raw.dtype == np.uint8:
            # torchvision's ToTensor only rescales uint8 input; once the array is
            # float32 it is passed through unchanged. Scale here so the image is on
            # the [0, 1] range that the MEAN/STD statistics (given as x/255) expect.
            image = image / 255.0
        if self.transform:
            image = self.transform(image)

        training_data = torch.tensor(self.data_frame[idx])
        target_data = torch.tensor(self.target[idx])

        return image, training_data, target_data

In [6]:
# Per-channel image statistics: 0-255 channel values divided by 255, i.e. expressed
# on the [0, 1] intensity scale.
MEAN = [113.82422637939453/255, 114.86695861816406/255, 85.6895751953125/255]
STD = [46.77458190917969/255, 45.75661849975586/255, 45.359466552734375/255]

# Training pipeline: tensor conversion, random geometric and colour augmentation, and
# only then standardisation. Normalize must come last: ColorJitter works on
# float tensors in [0, 1] and clamps its result to that range, so running it on
# already-standardised values would clip them.
augment_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    torchvision.transforms.RandomVerticalFlip(p=0.5),
    torchvision.transforms.RandomResizedCrop(size=128, scale=(0.8, 1.0)),
    torchvision.transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    torchvision.transforms.Normalize(mean=MEAN, std=STD)
    ])

# Deterministic pipeline (no augmentation) for validation and prediction.
standard_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=MEAN, std=STD)
])

In [7]:
# Locations of the image folders
TRAIN_IMG_DIR = os.path.join('data', 'train_images')
EVAL_IMG_DIR = os.path.join('data', 'test_images')

# Tabular data for the labelled set and for the set to be predicted
train_data = pd.read_csv(os.path.join('data', 'train.csv'))
evaluation_data = pd.read_csv(os.path.join('data', 'test.csv'))

# Columns to standardise: columns 1..163 of the training table, and every
# column except the first of the evaluation table.
data_to_normalise = train_data[train_data.columns[1:164]]
eval_to_normalise = evaluation_data[evaluation_data.columns[1:]]

# Per-column mean and standard deviation, computed on the training table only
stats_df = pd.concat(
    [data_to_normalise.mean().rename('mean'),
     data_to_normalise.std().rename('std')],
    axis=1)

# Z-scores; both tables are standardised with the training statistics
data_normd = data_to_normalise.sub(stats_df['mean'], axis='columns').div(stats_df['std'], axis='columns')
eval_normd = eval_to_normalise.sub(stats_df['mean'], axis='columns').div(stats_df['std'], axis='columns')

In [9]:
# Build the train / validation DataLoaders.
# Features come from the standardised table, targets are the last 6 columns of the
# raw training table; both are cut down to their first 1000 rows.
X = np.array(data_normd, dtype=np.float32)[:1000]
Y = np.array(train_data, dtype=np.float32)[:, -6:][:1000]

# First 85% of the retained rows train the model, the remainder validates it.
TRAIN_TEST_SPLIT = int(np.round(0.85 * len(X)))

train_x, test_x = X[:TRAIN_TEST_SPLIT], X[TRAIN_TEST_SPLIT:]
train_y, test_y = Y[:TRAIN_TEST_SPLIT], Y[TRAIN_TEST_SPLIT:]

train_img_id = train_data['id'][:TRAIN_TEST_SPLIT]
# re-index from 0 so that ids line up with the positional rows of test_x / test_y
test_img_id = train_data['id'][TRAIN_TEST_SPLIT:].reset_index(drop=True)

# Training samples are augmented and shuffled.
train_dataset = CustomDataset(
    train_img_id, train_x, train_y,
    img_dir=TRAIN_IMG_DIR, transform=augment_transform)
train_dataloader = DataLoader(train_dataset, batch_size=hp.batch_size, shuffle=True)

# Validation samples are only standardised and keep their order.
test_dataset = CustomDataset(
    test_img_id, test_x, test_y,
    img_dir=TRAIN_IMG_DIR, transform=standard_transform)
test_dataloader = DataLoader(test_dataset, batch_size=hp.batch_size, shuffle=False)

(train_x.shape, test_x.shape, len(train_dataloader), len(test_dataloader))

((850, 163), (150, 163), 14, 3)

In [None]:
# Build the DataLoader for the rows that have to be predicted.
eval_img_id = evaluation_data['id']
eval_x = np.array(eval_normd, dtype=np.float32)
EVAL_X = eval_x  # legacy name, kept for any cell that still refers to it

# The dataset requires a target array, but these rows have no labels: use a zero
# placeholder sized from the data instead of a hard-coded row/column count.
eval_y = np.zeros((len(eval_x), hp.num_predictions), dtype=np.float32) # empty
eval_dataset = CustomDataset(eval_img_id, eval_x, eval_y, img_dir=EVAL_IMG_DIR, transform=standard_transform)
eval_dataloader = DataLoader(eval_dataset, batch_size=hp.batch_size, shuffle=False)

eval_x.shape, eval_y.shape, len(eval_dataloader)

((6391, 163), (6391, 6), 100)

In [None]:
# all_img_id = train_data['id']
# full_dataset = CustomDataset(all_img_id, X, Y, img_dir=TRAIN_IMG_DIR, transform = standard_transform)
# full_dataloader = DataLoader(full_dataset, batch_size=128, shuffle=False)

# mean = 0.0
# std = 0.0
# total_images = 0

# for images, _, _ in full_dataloader:
#     images = images.view(images.size(0), images.size(1), -1)  # Reshape to (batch_size, channels, height*width)
#     mean += images.mean(2).sum(0)
#     std += images.std(2).sum(0)
#     total_images += images.size(0)

# mean /= total_images
# std /= total_images

# mean.tolist()
# std.tolist()

# MODELS

In [None]:
class VGG11(nn.Module):
    """Image branch: eight Conv-BatchNorm-ReLU stages followed by one linear layer.

    Five of the stages end in a 2x2 max-pool, so the spatial size is divided by 32.
    The linear layer takes ``512*4*4`` inputs (a 4x4 map of 512 channels, which is
    what a 128x128 image produces) and outputs ``hp.vgg_output_size`` features.
    """

    # (input channels, output channels, stage ends with a max-pool?)
    _STAGES = (
        (3, 64, True),
        (64, 128, True),
        (128, 256, False),
        (256, 256, True),
        (256, 512, False),
        (512, 512, True),
        (512, 512, False),
        (512, 512, True),
    )

    def __init__(self):
        super().__init__()

        modules = []
        for channels_in, channels_out, pooled in self._STAGES:
            # 3x3 convolution, stride 1, padding 1: the spatial size is unchanged
            modules.append(nn.Conv2d(channels_in, channels_out, 3, 1, 1))
            modules.append(nn.BatchNorm2d(channels_out))
            modules.append(nn.ReLU(inplace=True))
            if pooled:
                modules.append(nn.MaxPool2d(2, 2))

        # flatten the final feature map and project it to the branch output size
        modules.append(nn.Flatten())
        modules.append(nn.Linear(512*4*4, hp.vgg_output_size))

        self.cnn = nn.Sequential(*modules)

    def forward(self, input):
        """Return the image features produced by the stack for ``input``."""
        features = self.cnn(input)
        return features

In [None]:
class FC(nn.Module):
    """Tabular branch: a multilayer perceptron over the 163 input features.

    Widths go 163 -> 256 -> 512 -> 512 -> 256 with a ReLU after every layer,
    then a dropout of 0.2 and a final projection to ``hp.fc_output_size``.
    """

    def __init__(self):
        super().__init__()

        widths = [163, 256, 512, 512, 256]

        stack = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(n_in, n_out), nn.ReLU(inplace=True)])

        # dropout is applied once, just before the output projection
        stack.append(nn.Dropout(0.2))
        stack.append(nn.Linear(widths[-1], hp.fc_output_size))

        self.linear = nn.Sequential(*stack)

    def forward(self, input):
        """Return the tabular features produced by the stack for ``input``."""
        features = self.linear(input)
        return features

In [None]:
class SmallBoy(nn.Module):
    """Two-branch regressor combining image features and tabular features.

    The outputs of ``VGG11`` and ``FC`` are concatenated and passed through a head
    made of two same-width Linear-ReLU-Dropout(0.4) stages and a final linear layer
    with ``hp.num_predictions`` outputs.
    """

    def __init__(self):
        super().__init__()

        self.vgg = VGG11()
        self.fc = FC()

        # width of the concatenated feature vector
        width = hp.vgg_output_size + hp.fc_output_size

        head = []
        for _ in range(2):
            head += [nn.Linear(width, width), nn.ReLU(inplace=True), nn.Dropout(0.4)]
        head.append(nn.Linear(width, hp.num_predictions))

        self.linear_head = nn.Sequential(*head)

    def forward(self, input):
        """Predict from ``input``, a pair whose items are (image batch, tabular batch)."""
        img, data = input[0], input[1]

        image_features = self.vgg(img)
        tabular_features = self.fc(data)

        # join the two feature vectors along the feature axis
        return self.linear_head(torch.cat((image_features, tabular_features), dim=1))

# Bookkeeping

In [None]:
# bookkeeping block

class bookkeeper():
    """Records training metrics, plots them and reports elapsed time / ETA.

    Args:
        key_lst: names of the metric series to keep (one history per name).
        total_batches: number of training batches planned for the whole run;
            only used to estimate the ETA.
    """

    def __init__(self, key_lst, total_batches):

        # graphs: for every key, the batch counter at recording time and the value
        self.lists = {}
        for key in key_lst:
            self.lists[key] = {'batch': [],
                               'value': []}

        # loss of every individual training batch, in order
        self.batch_loss_list = []

        # timer
        self.start_time = time.time()
        self.total_batches = total_batches
        self.local_batches_completed = 0    # batches since the last tick_epoch()
        self.batches_completed = 0          # batches since construction
        self.epochs_completed = 0           # number of tick_epoch() calls so far

    def append(self, key, val):
        """Record ``val`` for series ``key`` at the current batch counter."""
        self.lists[key]['batch'].append(self.batches_completed)
        self.lists[key]['value'].append(val)

    @staticmethod
    def init_plots():
        """Switch matplotlib to interactive mode.

        Declared static because it takes no ``self``; this makes it callable on an
        instance as well as on the class.
        """
        plt.ion()


    def plot(self):
        """Draw one figure per recorded series, then the smoothed per-batch loss."""
        for k in self.lists:
            batches = self.lists[k]['batch']
            if not batches:
                continue  # nothing recorded yet; max() of an empty array would raise

            plt.figure(figsize=(10,5))
            plt.title(k)
            # one x tick every len(train_dataloader) batches, i.e. one per epoch
            plt.xticks(np.arange(0, np.array(batches).max()+len(train_dataloader), len(train_dataloader)))
            plt.plot(batches[1:], self.lists[k]['value'][1:]) #ignore 1st entry, too big
            plt.grid(True)
            plt.show()

        # plot batch loss moving average of window size 10:
        data_series = pd.Series(self.batch_loss_list)
        moving_average = data_series.rolling(window=10, center=False).mean()
        plt.title('moving batch loss')
        plt.plot(moving_average)
        plt.grid(True)
        plt.show()


    def reset_timer(self):
        """Restart the clock used by elapsed_time() and eta(), and print the time."""
        self.start_time = time.time()
        print('cur time', datetime.fromtimestamp(self.start_time).strftime("%H:%M:%S"))

    def elapsed_time(self):
        """Return the time since the timer started as ``'ELAPSED: HHh MMm SSs'``."""
        elapsed = time.time() - self.start_time
        hours, rem = divmod(elapsed, 3600)
        minutes, seconds = divmod(rem, 60)
        return 'ELAPSED: ' + f"{int(hours):02}h {int(minutes):02}m {int(seconds):02}s"

    def eta(self):
        """Return the estimated wall-clock finish time as ``'ETA: HH:MM:SS'``.

        The estimate extrapolates the time spent so far linearly over
        ``total_batches``. Without any completed batch (or with no planned batches)
        there is nothing to extrapolate from and a placeholder is returned.
        """
        if self.batches_completed <= 0 or self.total_batches <= 0:
            return 'ETA: --:--:--'

        # batches_completed is incremented before eta() is called, so it already is
        # the number of finished batches; adding 1 would overstate the progress.
        # Capped at 1 so that training beyond the planned total reports "now".
        progress = min(self.batches_completed / self.total_batches, 1.0)
        elapsed = time.time() - self.start_time
        est_total_time = elapsed / progress
        eta = self.start_time + est_total_time
        formatted_eta = datetime.fromtimestamp(eta).strftime("%H:%M:%S")
        return 'ETA: ' + formatted_eta


    def tick_batch(self, batch_loss):
        """Register one finished training batch and its loss.

        Prints a progress line every ``hp.batch_report_gap`` batches.
        """
        self.local_batches_completed += 1
        self.batches_completed += 1

        self.batch_loss_list.append(batch_loss)

        if self.batches_completed % hp.batch_report_gap == 0:
            print('Epoch', f"{self.epochs_completed / hp.vals_per_epoch : .2f}",
                  'Batch', f"{self.local_batches_completed:03}",
                  batch_loss, '|',
                  self.elapsed_time(), '|',
                  self.eta())

    def tick_epoch(self, train_loss, test_loss, r2):
        """Register one validation point and print its metrics.

        The printed epoch number is ``epochs_completed / hp.vals_per_epoch``, so it
        is fractional when this is called several times per epoch.
        """
        self.epochs_completed += 1
        self.local_batches_completed = 0

        print('### Epoch', f"{self.epochs_completed / hp.vals_per_epoch : .2f}", '|',
              self.elapsed_time(), '|',
              self.eta())
        print('train_loss', train_loss, 'test_loss', test_loss, 'r2',r2)
        print()


In [None]:
def get_current_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Yields ``None`` when the optimizer has no parameter groups.
    """
    rates = (group['lr'] for group in optimizer.param_groups)
    return next(rates, None)

def set_learning_rate(optimizer, new_lr):
    """Overwrite the learning rate of every parameter group with ``new_lr``."""
    groups = optimizer.param_groups
    for position in range(len(groups)):
        groups[position]['lr'] = new_lr

In [None]:
def predict(model):
    """Predict every row of ``eval_dataloader`` and write a submission CSV.

    The model runs in eval mode (its previous train/eval mode is restored
    afterwards) and every batch is moved to the device the model's parameters live
    on. The result is saved as ``pred_<timestamp>.csv`` in the working directory.

    Args:
        model: network called as ``model((images, data))``.

    Returns:
        DataFrame with the ids of ``eval_img_id`` followed by the six prediction
        columns in the order X4, X11, X18, X50, X26, X3112.
    """
    predictions = []

    # Inputs must sit on the same device as the weights; take it from the model.
    first_param = next(model.parameters(), None)
    was_training = model.training
    model.eval()  # deterministic predictions: no dropout, BatchNorm uses running stats

    try:
        with torch.no_grad():
            for images, data, _ in eval_dataloader:
                if first_param is not None:
                    images = images.to(first_param.device)
                    data = data.to(first_param.device)
                prediction = model((images, data)).detach().cpu().numpy()
                predictions.append(prediction) # X4,X11,X18,X26,X50,X3112
    finally:
        model.train(was_training)

    all_predictions_np = np.concatenate(predictions, axis=0)
    df = pd.DataFrame(all_predictions_np, columns=['X4', 'X11', 'X18', 'X26', 'X50', 'X3112'])
    df = df[['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']] # format to id,X4,X11,X18,X50,X26,X3112

    final_df = pd.concat([eval_img_id, df], axis=1)

    filename = 'pred_' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + '.csv'
    final_df.to_csv(filename, index=False)

    print(final_df)

    # hand the table back so that callers can inspect or plot the predictions
    return final_df


# MODEL TRAINING

In [None]:
# Build the network and, optionally, initialise it from a saved state dict.
model = SmallBoy()
model_to_load = None
# model_to_load = os.path.join('data', 'small_boy_untrained.sav')
if model_to_load:
    if os.path.isfile(model_to_load):
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
        model.load_state_dict(torch.load(model_to_load, map_location=device))
        model.eval()
        print('loaded:', model_to_load)
    else:
        # make a wrong path visible instead of silently training from scratch
        print('checkpoint not found, starting from fresh weights:', model_to_load)
model.to(device)

# AdamW whose learning rate is multiplied by hp.lr_decay at every scheduler step
optimizer = optim.AdamW(model.parameters(), lr=hp.learning_rate)
lambda_lr = lambda step: hp.lr_decay ** step
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_lr)
criterion = nn.MSELoss()

# Progress tracker; the planned batch total feeds its ETA estimate
bk = bookkeeper(key_lst = ['train_loss', 'test_loss', 'R2'],
                total_batches = len(train_dataloader) * hp.num_epochs)
bk.reset_timer()

cur time 02:53:29


In [None]:
def validate():
    """Evaluate ``model`` on ``test_dataloader``.

    The model is switched to eval mode for the pass and put back into whatever mode
    it was in before, so that calling this in the middle of an epoch does not leave
    dropout / BatchNorm in inference behaviour for the rest of training.

    Returns:
        ``(test_loss, r2)``: the summed per-batch criterion value divided by the
        number of batches and by ``hp.batch_size``, and the R2 score over all
        validation rows.
    """
    was_training = model.training
    model.eval()

    total_loss = 0.0
    acc_predictions = [] # hold onto predictions and targets for R2
    acc_targets = []

    try:
        with torch.no_grad():
            for images, data, target in test_dataloader:

                images = images.to(device)
                data = data.to(device)
                target = target.to(device)

                prediction = model((images, data))
                # accumulate plain floats: no tensor has to be kept alive on the
                # device, and the sum stays a number even if the loader is empty
                total_loss += criterion(prediction, target).item()

                acc_predictions.append(prediction.cpu().numpy())
                acc_targets.append(target.cpu().numpy())
    finally:
        model.train(was_training)

    # NOTE(review): the criterion value is additionally divided by the batch size;
    # train_one_epoch scales its training loss the same way, so the two stay comparable.
    test_loss = total_loss / len(test_dataloader) / hp.batch_size

    r2 = r2_score(np.concatenate(acc_targets), np.concatenate(acc_predictions))

    return test_loss, r2

In [None]:
# 1-based batch numbers within an epoch after which validation runs:
# hp.vals_per_epoch points spread evenly, the last one being the final batch.
epoch_validation_points = [
    len(train_dataloader) * (i+1) // hp.vals_per_epoch
    for i in range(hp.vals_per_epoch)
]

def train_one_epoch(epoch):
    """Train ``model`` for one pass over ``train_dataloader``.

    At every batch number listed in ``epoch_validation_points`` the scheduler is
    stepped, the model is validated and the metrics are recorded in ``bk``. After
    the pass the metric plots are drawn and, if the last R2 score is within
    (-100, 100), the weights are saved and a prediction file is written.

    Args:
        epoch: epoch number used in the name of the saved weights file.
    """
    model.train()

    running_loss = []
    r2 = None  # stays None when no validation point is reached (e.g. empty loader)

    for batch_idx, (images, data, target) in enumerate(train_dataloader):
        optimizer.zero_grad()

        images = images.to(device)
        data = data.to(device)
        target = target.to(device)

        prediction = model((images, data))
        loss = criterion(prediction, target)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        bk.tick_batch(batch_loss)
        running_loss.append(batch_loss)

        # bookkeeping 3x PER EPOCH
        if (batch_idx+1) in epoch_validation_points:
            scheduler.step() # STEPPING SCHEDULER 3x PER EPOCH

            test_loss, r2 = validate()
            # Make sure the remaining batches of this epoch run in train mode
            # (dropout on, BatchNorm updating) whatever mode validation left behind.
            model.train()

            # mean loss of all batches seen so far in this epoch
            train_loss = np.array(running_loss).mean() / hp.batch_size
            bk.append('train_loss', train_loss)
            bk.append('test_loss', test_loss)
            bk.append('R2', r2)
            bk.tick_epoch(train_loss, test_loss, r2)

    bk.plot() # plot every epoch

    # keep the checkpoint only when the last validation score is not wildly off
    if r2 is not None and np.abs(r2) < 100:
        torch.save(model.state_dict(), os.path.join('data', f"small_boy_epoch{epoch}.sav"))
        print('saved', f"small_boy_epoch{epoch}.sav", 'r2 = ', r2)
        print('lr' , get_current_lr(optimizer))

        predict(model)

    # for i in range(3 - min(int(np.log10(np.abs(r2))), 3)):
    #     scheduler.step()
    #     print('additional step')

In [None]:
def train_model(num_epochs = hp.num_epochs, alr_trained_epochs=0):
    """Run ``num_epochs`` training epochs.

    Epochs are numbered from ``alr_trained_epochs + 1`` onwards, so a run that
    continues earlier training keeps counting where the previous one stopped.
    """
    for offset in range(1, num_epochs + 1):
        train_one_epoch(alr_trained_epochs + offset)

In [None]:
# set_learning_rate(optimizer, 0.001* (0.9**5))

# Start training with the default arguments of train_model().
train_model()

KeyboardInterrupt: 

In [None]:
# get_current_lr(optimizer)

# torch.save(model.state_dict(), os.path.join('data', f"small_boy_untrained.sav"))
# Run predict() on the current model and keep its result for the next cell.
newdf = predict(model)


In [None]:
# One histogram per column of newdf, leaving out the first column.
for k, col in newdf[newdf.columns[1:]].items():
    plt.title(k)
    plt.hist(col)
    plt.show()

TODO:

- Make the bookkeeper (`bk`) pausable.
- Update the plots in place instead of drawing new figures.
- Fix the ETA estimate.