In [1]:
# Load packages
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from models import NeuralNetwork, TrainConfig, evaluate_nn_model
from utils import load_data, split_data, encode_data

# Report which compute device(s) PyTorch can see on this machine.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    # One "Device: cuda" header plus the device name per visible GPU.
    for gpu_index in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cpu


**Overview of the flow:**
1. Load the raw data into a RawParliamentData object containing (text ID, speaker ID, text, label).
2. Split the raw data into train, dev, and test datasets. The data is split so that the speakers in each set do not appear in any other set.
3. Prepare a TfidfVectorizer. Fit the vectorizer on the train set only, and use it to transform the train, dev, and test sets. Use the `encode_data()` function on the RawParliamentData objects, and supply the fitted encoder so that the same trained encoder is used on all sets.
4. Create a `NeuralNetwork` model and train it by calling its `fit()` method with the train and dev dataloaders.

**To test different types of train-dev-test sets:**
If you want to use some countries as the **train set**, and some other countries as the **dev & test set**, you will need to load the train, dev, test countries separately. For example: 

```python
train_raw = load_data(folder_path="data/power/", file_list=['power-gb-train.tsv',],text_head='text_en')
dev_raw = load_data(folder_path="data/power/", file_list=['power-ua-train.tsv',],text_head='text_en')
test_raw = load_data(folder_path="data/power/", file_list=['power-cz-train.tsv',],text_head='text_en')
```

In [2]:

#%%
# Files to load from data/power/. Uncomment an entry to include that file as well.
file_list = [
    'power-gb-train.tsv',
    # 'power-ua-train.tsv',
    # 'power-fr-train.tsv',
    # 'power-nl-train.tsv',
]

# Read every listed file, taking the text from the 'text_en' column.
full_data = load_data(
    folder_path="data/power/",
    file_list=file_list,
    text_head='text_en',
)

# Two-stage split: first carve off the test set, then split the remainder
# into train and dev. Both stages use the same fixed random_state.
# NOTE(review): test_size=0.2 is presumably a fraction (20%) — confirm in utils.split_data.
train_dev_raw, test_raw = split_data(
    full_data,
    test_size=0.2,
    random_state=0,
)
train_raw, dev_raw = split_data(
    train_dev_raw,
    test_size=0.2,
    random_state=0,
)


Load power-gb-train.tsv...


In [3]:

print("Prepare data encoder...")
# Character-level TF-IDF over 1- to 3-grams with sublinear term-frequency scaling.
# The encoder is fitted on the training texts only; fit() returns the fitted
# vectorizer itself, so construction and fitting can be chained.
train_encoder = TfidfVectorizer(
    sublinear_tf=True,
    analyzer="char",
    ngram_range=(1, 3),
).fit(train_raw.texts)

print("Prepare data...")
# Encode every split with the same fitted encoder, in train -> dev -> test order.
train_dataset, dev_dataset, test_dataset = (
    encode_data(raw_split, train_encoder)
    for raw_split in (train_raw, dev_raw, test_raw)
)


Prepare data encoder...
Prepare data...


Train the model.
If you use Google Colab or your machine has a CUDA-supported graphics card, you can try setting `device='cuda'` when creating the `NeuralNetwork`.

In [4]:
# Seed PyTorch's global RNG before the model and dataloaders are created so
# that weight initialisation and the shuffled batch order are repeatable
# across "Restart Kernel -> Run All".
torch.manual_seed(0)

# Training hyper-parameters.
# NOTE(review): with num_epochs=1, early stopping (violation_limit=5) presumably
# can never trigger — raise num_epochs for a real training run.
train_config = TrainConfig(
    num_epochs      = 1,
    early_stop      = True,
    violation_limit = 5,
)

model = NeuralNetwork(
    # Input width equals the number of entries in the fitted TF-IDF vocabulary.
    input_size=len(train_encoder.vocabulary_),
    num_classes=2,
    hidden_size=128,
    device='cpu'
)

# Shuffle the training batches each epoch.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# The dev set is only used for evaluation, so keep its batches in a fixed order;
# shuffling it adds randomness without any benefit.
dev_dataloader = DataLoader(dev_dataset, batch_size=128, shuffle=False)

model.fit(
    train_dataloader   = train_dataloader,
    dev_dataloader     = dev_dataloader,
    train_config       = train_config
)


Epoch 1: 100%|██████████| 168/168 [00:03<00:00, 53.81batch/s]


Check model's final performance

In [6]:
# Score the trained model on the held-out test split, then unpack the
# three returned values and print them on one line.
test_scores = evaluate_nn_model(model, test_dataset)
precision, recall, f1 = test_scores
print(precision, recall, f1)



0.7496002190046804 0.7082020798259653 0.708926310652036
