In [1]:
import pathlib

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from dataset import SequenceDataset
from sequence_transformations import TransformationRefined
from torch import Tensor
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from torcheval.metrics.functional import (
    binary_accuracy,
    binary_f1_score,
    binary_precision,
    binary_recall,
)

torch.manual_seed(23)

<torch._C.Generator at 0x7f41680ff1b0>

In [2]:
DATASET_PATH = pathlib.Path("../data/classification/data.csv")
BATCH_SIZE = 32
SEQUENCE_LEN = 500

In [3]:
from torch import nn
from typing import Callable
import numpy as np
from tqdm import tqdm


def loss_batch(
    model: nn.Module,
    loss_func: Callable[[Tensor, Tensor], Tensor],
    xb: Tensor,
    yb: Tensor,
    opt: optim.Optimizer | None = None,
) -> tuple[float, int]:
    loss = loss_func(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)


def calculate_epoch_metric(metrics, batch_sizes) -> float:
    return np.sum(np.multiply(metrics, batch_sizes)) / np.sum(batch_sizes)


def log_metrics(model, valid_dl) -> None:
    preds = model()


def fit(
    epochs: int,
    model: nn.Module,
    loss_func: Callable[[Tensor, Tensor], Tensor],
    opt: optim.Optimizer,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    writer: SummaryWriter,
) -> None:
    for epoch in range(epochs):
        model.train()

        train_history = []
        for xb, yb in tqdm(train_dl):
            loss, n = loss_batch(model, loss_func, xb, yb, opt)
            train_history.append((loss, n))

        # train loss
        print(f"Epoch: {epoch}")
        losses, nums = zip(*train_history)
        train_loss = calculate_epoch_metric(losses, nums)
        print(f"Train loss: {train_loss}")
        writer.add_scalar("Loss/train", train_loss, epoch)

        model.eval()
        with torch.no_grad():
            # losses, nums = zip(
            #     *[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            # )
            # very slow, I do inference for each metric separate!! REFACTOR
            accuracies, nums = zip(
                *[(binary_accuracy(model(xb), yb), len(xb)) for xb, yb in valid_dl]
            )

        # validation loss
        # val_loss = calculate_epoch_metric(losses, nums)
        val_accuracy = calculate_epoch_metric(accuracies, nums)
        # writer.add_scalar("Loss/val", val_loss, epoch)
        writer.add_scalar("Accuracy/val", val_accuracy, epoch)
        # print(f"Validation loss: {val_loss}")

# "Refined" Represenation

In [4]:
dataset = SequenceDataset(DATASET_PATH, TransformationRefined())

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
)

# from torch.utils.data import DataLoader, SubsetRandomSampler

# fraction = 0.02
# num_samples = int(len(train_dataset) * fraction)
# indices = torch.randperm(len(train_dataset))
# subset_sampler = SubsetRandomSampler(indices[:num_samples])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=subset_sampler)
val_loader = DataLoader(val_dataset, batch_size=(BATCH_SIZE * 2), shuffle=False)

## Baseline

In [5]:
from torch import nn


class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(SEQUENCE_LEN * 2, 1)

    def forward(self, x):  # B, T, C
        x = x.view(x.size(0), -1)  # B, T * C
        x = F.sigmoid(self.linear(x))  # B, 1
        x = x.squeeze()  # B
        x = x.to(torch.float32)
        return x

In [6]:
logistic_regression_model = LogisticRegression()
opt = optim.Adam(logistic_regression_model.parameters(), lr=0.01)

In [7]:
# print(binary_f1_score(output, ground_truth))
# print(binary_accuracy(output, ground_truth))
# print(binary_precision(output, ground_truth))
# print(binary_recall(output, ground_truth))

In [8]:
writer = SummaryWriter("runs/logistic_regression_refined")

In [9]:
fit(
    3,
    logistic_regression_model,
    F.binary_cross_entropy,
    opt,
    train_loader,
    val_loader,
    writer,
)

100%|██████████| 7183/7183 [01:20<00:00, 89.65it/s]


Epoch: 0
Train loss: 0.8062556870663057


100%|██████████| 7183/7183 [01:24<00:00, 84.85it/s]


Epoch: 1
Train loss: 0.7965391815188708


100%|██████████| 7183/7183 [01:31<00:00, 78.56it/s]


Epoch: 2
Train loss: 0.7880991046400913


In [None]:
writer.flush()