In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import PositionalEncoder, load_data, split_data
from models import TrainConfig, RNNClassifier

# Report the compute backend PyTorch can see; every visible CUDA device is listed by name.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    for gpu_index in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cpu


In [3]:
# Only needed if running on Colab: mount Google Drive and point base_dir at it instead.
# from google.colab import drive
# drive.mount('/content/drive')
# base_dir = "/content/drive/MyDrive/data"

# Folder containing the power-*.tsv files; passed to load_data() as folder_path.
base_dir = "data/power/"

In [4]:
# Load data
# Files to read from base_dir. Only the GB file is active; un-comment an entry to add that file.
file_list = [
    "power-gb-train.tsv",
    # "power-ua-train.tsv",
    # "power-fr-train.tsv",
    # "power-nl-train.tsv",
]

full_data = load_data(
    folder_path=base_dir,
    file_list=file_list,
    text_head="text_en",
)

# Two successive splits with the same random_state: first separate the test set,
# then separate the dev set from what remains (test_size=0.2 both times).
train_dev_raw, test_raw = split_data(full_data, test_size=0.2, random_state=0)
train_raw, dev_raw = split_data(train_dev_raw, test_size=0.2, random_state=0)


Load power-gb-train.tsv...


In [5]:

# Fit the text encoder on the training split only. The fitted encoder is later passed
# to the model and to evaluate_model(), which reads train_encoder.vocabulary
# (presumably populated by fit() — TODO confirm in utils.PositionalEncoder).
print("Prepare data encoder...")
train_encoder = PositionalEncoder()
train_encoder.fit(train_raw.texts)

Prepare data encoder...


In [6]:
# Seed PyTorch's global RNG so weight initialisation and the shuffled batch order are
# repeatable across "Restart & Run All" (0 matches the random_state used for the splits).
torch.manual_seed(0)

# One batch size for both loaders, defined once instead of repeated as a magic number.
BATCH_SIZE = 50

# Training batches are reshuffled every epoch. Evaluation order does not affect
# accuracy, so the test loader keeps a fixed order.
train_dataloader = DataLoader(train_raw, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_raw, batch_size=BATCH_SIZE, shuffle=False)

# Prepare baseline config
# NOTE(review): violation_limit presumably only matters when early_stop=True — confirm in TrainConfig.
train_config = TrainConfig(
    optimizer_params = {'lr': 0.01},
    num_epochs       = 10,
    early_stop       = False,
    violation_limit  = 5
)

# Train baseline model
# NOTE(review): 'cuda' is requested unconditionally. The recorded CPU-only run printed
# "CUDA not available. Run model on CPU.", so the model appears to fall back on its own.
baseline_lstm = RNNClassifier(
    rnn_network         = nn.LSTM,
    word_embedding_dim  = 32,
    hidden_dim          = 64,
    bidirectional       = False,
    dropout             = 0,
    encoder             = train_encoder,
    device              = 'cuda'
)

# TODO: Ask Fredrik whether this explanation is correct.
# Training is slow because an LSTM reads a sentence one word at a time. The maximum
# "sentence" length in a batch can be 1000, so each batch needs at least 1000
# sequential matrix multiplications.
baseline_lstm.fit(train_dataloader, train_config, no_progress_bar=False)



CUDA not available. Run model on CPU.


  tokens_sparse = torch.sparse_csr_tensor(crow, col, token_val, size=mat_size, dtype=torch.long)
Epoch 1:   0%|          | 2/429 [00:13<46:22,  6.52s/batch, batch_accuracy=0.52, loss=106]


KeyboardInterrupt: 

In [None]:

def evaluate_model(
        model: nn.Module | RNNClassifier,
        test_dataloader,
        train_encoder
    ) -> float:
    """Compute the classification accuracy of ``model`` over ``test_dataloader``.

    Parameters
    ----------
    model : nn.Module | RNNClassifier
        Trained classifier. It must be callable on an encoded batch and expose a
        ``device`` attribute. Its output is thresholded at 0.5, so it is assumed
        to be one probability per example (TODO confirm against RNNClassifier).
    test_dataloader : iterable
        Yields ``(ids, speakers, raw_inputs, raw_targets)`` batches. Only the raw
        inputs and raw targets are used here.
    train_encoder : PositionalEncoder
        Encoder fitted on the training data. Its ``vocabulary`` is reused to
        encode every evaluation batch.

    Returns
    -------
    float
        Fraction of labels predicted correctly.

    Raises
    ------
    ValueError
        If ``test_dataloader`` yields no labels, in which case accuracy is undefined.
    """
    # Layers such as dropout must be inactive while scoring. Remember the previous
    # mode so a model that is still in training mode is handed back unchanged.
    restore_training = isinstance(model, nn.Module) and model.training
    if restore_training:
        model.eval()

    num_correct = 0
    num_labels = 0
    try:
        # Evaluation needs no gradients; disabling autograd saves memory and time.
        with torch.no_grad():
            for _ids, _speakers, raw_inputs, raw_targets in tqdm(test_dataloader, unit="batch"):
                # A fresh encoder per batch that shares the training vocabulary.
                batch_encoder = PositionalEncoder(vocabulary=train_encoder.vocabulary)
                inputs = batch_encoder.fit_transform(raw_inputs)
                targets = torch.as_tensor(raw_targets, dtype=torch.float).to(model.device)

                # Make prediction
                scores = model(inputs.to(model.device))

                # Flatten both sides so that (batch, 1) scores and (batch,) targets are
                # compared element by element instead of broadcasting to (batch, batch).
                pred = scores.reshape(-1) > 0.5
                flat_targets = targets.reshape(-1)
                num_correct += (pred == flat_targets).sum().item()
                # Count exactly the labels that were compared above.
                num_labels += flat_targets.numel()
    finally:
        if restore_training:
            model.train()

    if num_labels == 0:
        raise ValueError("test_dataloader produced no examples; accuracy is undefined")

    return num_correct / num_labels

# Score the trained baseline on the held-out test batches, then report it next to
# the most recent training accuracy stored on the model.
baseline_lstm_acc = evaluate_model(
    model=baseline_lstm,
    test_dataloader=test_dataloader,
    train_encoder=train_encoder,
)
print(f"Last train accuracy: {baseline_lstm.training_accuracy_[-1] * 100:.1f}%. Test accuracy {baseline_lstm_acc * 100:.1f}%")
