In [2]:
# Load packages
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from models import train_neural_network, evaluate_model
from utils import DataProcessor, CustomDataset

# Report the name of the current CUDA device when one is usable;
# on a CPU-only machine this cell prints nothing.
cuda_usable = torch.cuda.is_available()
if cuda_usable:
    gpu_name = torch.cuda.get_device_name()
    print("GPU:", gpu_name)

Overview of the data creation flow:
1. Load the raw data into a list of tuples (text ID, speaker ID, text, label)
2. Split the raw data into train, dev, and test sets (this step applies when a random split of the loaded data is all you need).
3. Prepare a TfidfVectorizer. Fit the vectorizer on the train set, and use it to transform all train, dev, test sets.
4. Wrap the train, dev, and test sets in `CustomDataset` objects, which are what `train_neural_network` and `evaluate_model` take as input.

What matters is that, in the end, the train, dev, and test sets are all CustomDataset objects. So if you want to use some countries as the **train set** and other countries as the **dev and test sets**, you will need to load the train, dev, and test countries separately.

In [3]:

#%% Load the raw rows and carve out the train / dev / test splits
processor = DataProcessor()

# Input files, resolved relative to the folder passed to load_data below.
file_list = [
    'power-gb-train.tsv',
    'power-ua-train.tsv',
]

raw_data = processor.load_data(
    folder_path="data/power/", file_list=file_list, text_head='text_en'
)

# Two-stage hold-out: the first call sets the test split aside, the second
# takes the dev split out of what remains.
# NOTE(review): test_size=0.2 presumably means a 20% hold-out, and no seed is
# passed here -- confirm in DataProcessor.split_data whether the splits are
# reproducible across runs.
train_dev_raw, test_raw = processor.split_data(raw_data, test_size=0.2)
train_raw, dev_raw = processor.split_data(train_dev_raw, test_size=0.2)

#%% Fit the TF-IDF encoder on the training text only, then wrap every split
print("Prepare data encoder...")
# Position 2 of each record is taken as the text (the notebook overview
# describes records as (text ID, speaker ID, text, label)).
train_texts = [record[2] for record in train_raw]

# Character n-grams of length 1 to 3 with sublinear (1 + log tf) scaling.
train_encoder = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    sublinear_tf=True,
)
# Only the training texts are used for fitting; dev and test reuse this
# fitted encoder.
train_encoder.fit(train_texts)

print("Prepare data...")
# Built in the same order as before: train, then dev, then test.
train_dataset, dev_dataset, test_dataset = (
    CustomDataset(split_rows, train_encoder)
    for split_rows in (train_raw, dev_raw, test_raw)
)


Load power-gb-train.tsv...
Load power-ua-train.tsv...
Prepare data encoder...
Prepare data...


Train model.
If you use Google Colab or your machine has a CUDA-capable graphics card, you can try setting `device='cuda'`.

In [4]:

# Seed torch's global RNG before training so that re-running this cell starts
# from the same random state instead of a fresh one each time.
# NOTE(review): this only covers randomness drawn from torch's global
# generator; whether train_neural_network (or the earlier data split) uses
# other random sources is not visible from this notebook -- confirm.
TRAIN_SEED = 42
torch.manual_seed(TRAIN_SEED)

print("Train model...")
# Train on the train split; the dev split is passed alongside it, and
# early_stop_patience=5 matches the "did not improve for 5 epochs" stop
# message in the recorded output.
model = train_neural_network(
    train_data=train_dataset,
    dev_data=dev_dataset,
    num_classes=2,
    hidden_size=64,
    num_epochs=20,
    early_stop_patience=5,
    device='cpu'
)


Train model...


Epoch 1: 100%|██████████| 443/443 [00:03<00:00, 118.17batch/s]
Epoch 2: 100%|██████████| 443/443 [00:03<00:00, 116.17batch/s]
Epoch 3: 100%|██████████| 443/443 [00:03<00:00, 119.85batch/s]
Epoch 4: 100%|██████████| 443/443 [00:03<00:00, 121.25batch/s]
Epoch 5: 100%|██████████| 443/443 [00:03<00:00, 122.35batch/s]
Epoch 6: 100%|██████████| 443/443 [00:03<00:00, 112.63batch/s]
Epoch 7: 100%|██████████| 443/443 [00:03<00:00, 116.25batch/s]


Validation loss did not improve for 5 epochs. Stopping early.


Check the model's final performance on the test set. The three printed values are precision, recall, and F1, in that order.

In [5]:
# Score the trained model on the test split; the result is unpacked as
# (precision, recall, f1) and echoed on one line, space-separated.
precision, recall, f1 = evaluate_model(
    model,
    test_dataset,
)
print(precision, recall, f1, sep=" ")



0.7163155387764184 0.7146647603415488 0.7152942856591957
