In [1]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from models import NeuralNetwork, TrainConfig, evaluate_nn_model, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

# Report which compute device PyTorch can use; every visible GPU is listed by name.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    for gpu_index in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU


**Overview of the flow:**
1. Load the raw data into a RawParliamentData object containing (text ID, speaker ID, text, label).
2. Split the raw data into train, dev, and test datasets. The data is split so that no speaker appears in more than one set.
3. Prepare a TfidfVectorizer. Fit the vectorizer on the train set only, then use it to transform the train, dev, and test sets. Call the `encode_data()` function on each RawParliamentData object and pass in the fitted encoder, so that the same trained encoder is used on all sets.
4. Build a `NeuralNetwork` model and train it by calling its `fit()` method with the train dataloader and a `TrainConfig`.

**To test different types of train-dev-test sets:**
If you want to use some countries as the **train set**, and some other countries as the **dev & test set**, you will need to load the train, dev, test countries separately. For example: 

```python
train_raw = load_data(folder_path="data/train/power/", file_list=['power-gb-train.tsv'], text_head='text_en')
dev_raw = load_data(folder_path="data/train/power/", file_list=['power-ua-train.tsv'], text_head='text_en')
test_raw = load_data(folder_path="data/train/power/", file_list=['power-cz-train.tsv'], text_head='text_en')
```

In [2]:

# Training files to load; uncomment an entry to include that country as well.
file_list = [
    'power-gb-train.tsv',
    # 'power-ua-train.tsv',
    # 'power-fr-train.tsv',
    # 'power-nl-train.tsv',
]

# Read the listed files from the training folder, taking the text from the 'text_en' column.
full_data = load_data(
    folder_path="data/train/power/",
    file_list=file_list,
    text_head='text_en',
)

# Two-stage split with a fixed seed: first carve off the test portion, then
# split what remains into train and dev (test_size=0.2 at each stage).
train_dev_raw, test_raw = split_data(full_data, test_size=0.2, random_state=0)
train_raw, dev_raw = split_data(train_dev_raw, test_size=0.2, random_state=0)


Load power-gb-train.tsv...


In [2]:
# Files of the separate test folder.
# NOTE(review): this rebinds `file_list`, replacing the training list defined in the
# previous cell. The commented-out entries still carry the "-train" suffix; presumably
# the test folder uses "-test" names — confirm before enabling them.
file_list = [
    'power-gb-test.tsv',
    # 'power-ua-train.tsv',
    # 'power-fr-train.tsv',
    # 'power-nl-train.tsv',
]

# Same loader as for the training data, pointed at the test folder.
test_data = load_data(
    folder_path="data/test/power/",
    file_list=file_list,
    text_head='text_en',
)


In [13]:

print("Prepare data encoder...")
# Alternative character n-gram encoder, kept for experimentation:
# train_encoder = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(1,3))
# The vocabulary is learned from the training texts only; fit() returns the fitted vectorizer.
train_encoder = TfidfVectorizer(max_features=10_000).fit(train_raw.texts)

print("Prepare data...")
# Encode every split with the same fitted vectorizer, in train -> dev -> test order.
train_dataset, dev_dataset, test_dataset = (
    encode_data(raw_split, train_encoder)
    for raw_split in (train_raw, dev_raw, test_raw)
)


Prepare data encoder...
Prepare data...


Train the model.
Training runs fastest with `device='cuda'`, which requires a GPU runtime on Google Colab or a machine with a CUDA-capable graphics card; on a machine without one, use `device='cpu'`.

In [15]:
# Directory holding the cached checkpoint and the saved evaluation metrics.
models_dir = Path('models')
# exist_ok=True makes this a no-op when the directory is already there,
# so no separate exists() check is needed.
models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

# Training batches are reshuffled every epoch; the dev batches keep a fixed
# order because shuffling serves no purpose for evaluation data.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=128, shuffle=False)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
nn_device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_nn = NeuralNetwork(
    # One input feature per term in the fitted TF-IDF vocabulary.
    input_size=len(train_encoder.vocabulary_),
    hidden_size=128,
    device=nn_device,
)

# Reuse a cached checkpoint when present; otherwise train and cache the model.
# NOTE(review): the checkpoint is reused whenever the file exists, regardless of
# which data or encoder settings produced it — delete it after changing those.
# Presumably save_model/load_model resolve the name 'model_nn' to this same
# path — confirm against the `models` module.
if (models_dir / 'model_nn.pt').exists():
    model_nn = load_model(model_nn, 'model_nn')
else:
    model_nn.fit(train_dataloader, train_config)
    save_model(model_nn, "model_nn")

# Evaluate on test_dataset (the held-out split encoded from test_raw) and
# persist the metrics next to the checkpoint.
model_nn_results = evaluate_nn_model(model_nn, test_dataset)
np.save(models_dir / 'model_nn_results.npy', model_nn_results)
print(model_nn_results)



Epoch 1: 100%|██████████| 168/168 [00:01<00:00, 156.39batch/s, batch_accuracy=0.75, loss=68.4]
Epoch 2: 100%|██████████| 168/168 [00:01<00:00, 162.30batch/s, batch_accuracy=0.875, loss=89.9]
Epoch 3: 100%|██████████| 168/168 [00:01<00:00, 165.91batch/s, batch_accuracy=0.85, loss=81.5]
Epoch 4: 100%|██████████| 168/168 [00:01<00:00, 159.18batch/s, batch_accuracy=0.95, loss=70.3]
Epoch 5: 100%|██████████| 168/168 [00:01<00:00, 163.66batch/s, batch_accuracy=0.925, loss=62.3]
Epoch 6: 100%|██████████| 168/168 [00:01<00:00, 159.80batch/s, batch_accuracy=0.975, loss=90.3]
Epoch 7: 100%|██████████| 168/168 [00:01<00:00, 165.74batch/s, batch_accuracy=1, loss=68.2]   
Epoch 8: 100%|██████████| 168/168 [00:01<00:00, 154.43batch/s, batch_accuracy=0.95, loss=64.5]
Epoch 9: 100%|██████████| 168/168 [00:01<00:00, 151.17batch/s, batch_accuracy=1, loss=72.2]   
Epoch 10: 100%|██████████| 168/168 [00:01<00:00, 148.66batch/s, batch_accuracy=1, loss=68.2]   


(0.7215514779090881, 0.8594974279403687, 0.7751919031143188)


In [6]:
# Plot training accuracy and loss side-by-side
# plot_results is imported from the project's `models` module; it receives the
# model, the training configuration and the training dataloader.
# NOTE(review): the training cell only calls fit() when no cached checkpoint
# exists — confirm plot_results still works when the model was loaded from disk.
plot_results(model_nn, train_config, train_dataloader)