In [1]:
import pathlib

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from dataset import SequenceDataset
from sequence_transformations import TransformationRefined
from torch import Tensor
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from torcheval.metrics.functional import (
    binary_accuracy,
    binary_f1_score,
    binary_precision,
    binary_recall,
)

torch.manual_seed(23)

<torch._C.Generator at 0x7f82903f71b0>

In [2]:
DATASET_PATH = pathlib.Path("../data/classification/data.csv")
BATCH_SIZE = 32
SEQUENCE_LEN = 500

In [3]:
from torch import nn
from typing import Callable
import numpy as np
from tqdm import tqdm


def calculate_epoch_metric(metrics: list[float], batch_sizes: list[int]) -> float:
    return np.sum(np.multiply(metrics, batch_sizes)) / np.sum(batch_sizes)


def log_metrics(model, valid_dl) -> None:
    preds = model()


def fit(
    epochs: int,
    model: nn.Module,
    loss_func: Callable[[Tensor, Tensor], Tensor],
    opt: optim.Optimizer,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    writer: SummaryWriter,
) -> None:
    for epoch in range(epochs):
        model.train()

        train_history = []
        for xb, yb in tqdm(train_dl):
            batch_size = len(xb)
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
            train_history.append((loss.item(), batch_size))

        # train loss
        print(f"Epoch: {epoch}")
        losses, nums = zip(*train_history)
        train_loss = calculate_epoch_metric(losses, nums)
        print(f"Train loss: {train_loss}")
        writer.add_scalar("loss/train", train_loss, epoch)

        model.eval()
        with torch.no_grad():
            metrics = {k: [] for k in ["loss", "accuracy", "precision", "recall", "f1"]}
            batch_sizes = []
            for xb, yb in tqdm(valid_dl):
                outputs = model(xb)
                batch_sizes.append(len(xb))
                metrics["loss"].append(loss_func(outputs, yb).item())
                metrics["accuracy"].append(binary_accuracy(outputs, yb).item())
                metrics["precision"].append(binary_precision(outputs, yb).item())
                metrics["recall"].append(
                    binary_recall(outputs.to(torch.int8), yb.to(torch.int8)).item()
                )
                metrics["f1"].append(binary_f1_score(outputs, yb).item())

            for m_name, metrics in metrics.items():
                epoch_metric = calculate_epoch_metric(metrics, batch_sizes)
                writer.add_scalar(f"{m_name}/val", epoch_metric, epoch)
                print(f"{m_name}: {epoch_metric}")

# "Refined" Represenation

In [4]:
dataset = SequenceDataset(DATASET_PATH, TransformationRefined())

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
)

# from torch.utils.data import DataLoader, SubsetRandomSampler

# fraction = 0.02
# num_samples = int(len(train_dataset) * fraction)
# indices = torch.randperm(len(train_dataset))
# subset_sampler = SubsetRandomSampler(indices[:num_samples])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=subset_sampler)
val_loader = DataLoader(val_dataset, batch_size=(BATCH_SIZE * 2), shuffle=False)

## Baseline

In [5]:
from torch import nn


class LogisticRegression(nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(SEQUENCE_LEN * 2, 1)

    def forward(self, x):  # B, T, C
        x = x.view(x.size(0), -1)  # B, T * C
        x = F.sigmoid(self.linear(x))  # B, 1
        x = x.squeeze()  # B
        x = x.to(torch.float32)
        return x

In [6]:
logistic_regression_model = LogisticRegression()
opt = optim.Adam(logistic_regression_model.parameters(), lr=0.01)

In [7]:
writer = SummaryWriter("runs/logistic_regression_refined")

In [8]:
fit(
    3,
    logistic_regression_model,
    F.binary_cross_entropy,
    opt,
    train_loader,
    val_loader,
    writer,
)

100%|██████████| 7183/7183 [03:30<00:00, 34.09it/s]


Epoch: 0
Train loss: 0.8062556870663057


100%|██████████| 898/898 [00:50<00:00, 17.78it/s]


loss: 0.9323835954366491
accuracy: 0.5075787898289403
precision: 0.6364704746944678
recall: 0.0
f1: 0.31374545374862284


100%|██████████| 7183/7183 [03:30<00:00, 34.05it/s]


Epoch: 1
Train loss: 0.7965391815188708


100%|██████████| 898/898 [00:50<00:00, 17.88it/s]


loss: 0.7626403066528733
accuracy: 0.5765623096497564
precision: 0.5904730093843653
recall: 0.0
f1: 0.6430722986073307


100%|██████████| 7183/7183 [03:29<00:00, 34.21it/s]


Epoch: 2
Train loss: 0.7880991046400913


100%|██████████| 898/898 [00:50<00:00, 17.88it/s]

loss: 0.7508740363713853
accuracy: 0.576057637087176
precision: 0.5977548786603166
recall: 0.0
f1: 0.6258367040093713





In [9]:
writer.flush()