In [1]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from models import NeuralNetwork, TrainConfig, evaluate_nn_model, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

# Report the compute device: for every visible GPU print a "Device: cuda"
# line followed by its name; otherwise report that only the CPU is available.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    for gpu_index in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU


### POC

In [2]:
# POC: TF-IDF features + feed-forward classifier on the GB power data (English text).
#%%
file_list = [
    'power-gb-train.tsv',
    # 'power-ua-train.tsv',
    # 'power-fr-train.tsv',
    # 'power-nl-train.tsv',
]

full_data = load_data(folder_path="data/train/power/", file_list=file_list,text_head='text_en')
# Hold out 20% of the labelled training file for evaluation; fixed random_state for a stable split.
train_raw, test_raw = split_data(full_data, test_size=0.2, random_state=0)

file_list = [
    'power-gb-test.tsv',
    # 'power-ua-train.tsv',
    # 'power-fr-train.tsv',
    # 'power-nl-train.tsv',
]

# NOTE(review): test_data is loaded here but not used anywhere below in this cell;
# the evaluation further down runs on the held-out test_raw split instead.
test_data = load_data(folder_path="data/test/power/", file_list=file_list,text_head='text_en')

print("Prepare data encoder...")
# Alternative encoder (character n-grams):
# train_encoder = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(1,3))
train_encoder = TfidfVectorizer(max_features=10000)
# Fit on the training split only so no test vocabulary leaks into the features.
train_encoder.fit(train_raw.texts)

print("Prepare data...")
train_dataset = encode_data(train_raw, train_encoder)
test_dataset = encode_data(test_raw, train_encoder)

print("Train model")
models_dir = Path('models')
# exist_ok=True already makes this a no-op when the directory is present.
models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

# Seed torch's global RNG so the shuffled batch order (and presumably the
# model's weight initialisation) is repeatable across runs.
torch.manual_seed(0)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Use the GPU when there is one, otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_nn = NeuralNetwork(
    input_size=len(train_encoder.vocabulary_),
    hidden_size=128,
    device=device
)

# Reuse a previously saved model when available; otherwise train and save it.
if (models_dir / 'model_nn.pt').exists():
    model_nn = load_model(model_nn, 'model_nn')
else:
    model_nn.fit(train_dataloader, train_config)
    save_model(model_nn, "model_nn")

model_nn_results = evaluate_nn_model(model_nn, test_dataset)
np.save(models_dir / 'model_nn_results.npy', model_nn_results)
print(model_nn_results)


# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, train_dataloader)

Prepare data encoder...


KeyboardInterrupt: 

# Mass analysis

In [9]:
# Mass testing: one model per country file, trained and evaluated twice --
# once on the English translation ("text_en") and once on the original text ("text").

parent_dir = Path("data/train/power")
file_list = sorted(parent_dir.glob("*.tsv"))

# The result files (and presumably the models written by save_model) live under
# models/; create it here so this cell does not rely on another cell having run first.
Path("models").mkdir(parents=True, exist_ok=True)


def run_country_experiment(file, text_head, suffix):
    """Train (or load from cache) and evaluate the model for one country file.

    Args:
        file: Path of one ``*.tsv`` training file; its parent folder and name
            are passed to ``load_data``.
        text_head: Text column handed to ``load_data`` ("text_en" or "text").
        suffix: Tag used in the cached model / result file names ("en" or "ori").

    Returns:
        The value returned by ``evaluate_nn_model`` for the held-out test split.
    """
    full_data = load_data(folder_path=file.parent, file_list=[file.name], text_head=text_head)

    # 80/20 (train+dev)/test, then 80/20 train/dev. The dev part is not used
    # below, but the second split is kept so the training subset stays the
    # same one that any already-cached models were fitted on.
    train_dev_raw, test_raw = split_data(full_data, test_size=0.2, random_state=0)
    train_raw, _dev_raw = split_data(train_dev_raw, test_size=0.2, random_state=0)

    # Alternative encoder (character n-grams):
    # train_encoder = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(1,3))
    train_encoder = TfidfVectorizer()
    train_encoder.fit(train_raw.texts)

    train_dataset = encode_data(train_raw, train_encoder)
    test_dataset = encode_data(test_raw, train_encoder)

    train_config = TrainConfig(
        num_epochs      = 10,
        early_stop      = False,
        violation_limit = 5,
    )

    # Same seed for every run so batch order (and presumably weight
    # initialisation) does not add noise to the en-vs-original comparison.
    torch.manual_seed(0)
    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    model_nn = NeuralNetwork(
        input_size=len(train_encoder.vocabulary_),
        hidden_size=128,
        device="cpu"
    )

    # Reuse a previously saved model when available; otherwise train and save it.
    model_name = f"model_nn_{file.stem}_{suffix}"
    if Path(f"models/{model_name}.pt").exists():
        model_nn = load_model(model_nn, model_name)
    else:
        model_nn.fit(train_dataloader, train_config)
        save_model(model_nn, model_name)

    model_nn_results = evaluate_nn_model(model_nn, test_dataset)
    np.save(f"models/{model_name}_results.npy", model_nn_results)
    print(file.stem, model_nn_results)
    return model_nn_results


# Mass testing all countries' English text
text_en_result_list = []
for file in file_list:
    # Append per file so partial results survive an interrupted run.
    text_en_result_list.append(run_country_experiment(file, text_head="text_en", suffix="en"))


# Mass testing all countries' original text
text_ori_result_list = []
for file in file_list:
    text_ori_result_list.append(run_country_experiment(file, text_head="text", suffix="ori"))


# Class balance per country file: collect (positive count, number of
# examples, positive share) and print the positive share as a percentage.
parent_dir = Path("data/train/power")
file_list = sorted(parent_dir.glob("*.tsv"))

stats = []
for file in file_list:
    full_data = load_data(folder_path=parent_dir, file_list=[file.name],text_head="text")
    positive = sum(full_data.labels)
    total = len(full_data)
    positive_share = positive / total
    stats.append((positive, total, positive_share))
    print(f"{file.name}: Positive {positive_share * 100:.2f}%")


power-at-train (0.758832573890686, 0.5467289686203003, 0.5984655022621155)
power-ba-train (0.8353909254074097, 0.9308755993843079, 0.9099099040031433)
power-be-train (0.540569007396698, 0.39673912525177, 0.5011441707611084)
power-bg-train (0.6343283653259277, 0.48170730471611023, 0.5632798671722412)
power-cz-train (0.6032568216323853, 0.457337886095047, 0.5)
power-dk-train (0.6287744045257568, 0.5846599340438843, 0.6590538620948792)
power-es-ct-train (0.7605177760124207, 0.8916256427764893, 0.8302752375602722)
power-es-ga-train (0.6823529601097107, 0.8928571343421936, 0.7352941036224365)
power-es-pv-train (0.6978723406791687, 0.5773195624351501, 0.6120218634605408)
power-es-train (0.7452702522277832, 0.7231897115707397, 0.8071611523628235)
power-fi-train (0.5883293151855469, 0.5476190447807312, 0.537286639213562)
power-fr-train (0.699592649936676, 0.35103243589401245, 0.4465290904045105)
power-gb-train (0.7102324366569519, 0.7643880248069763, 0.7466350197792053)
power-gr-train (0.82998

In [10]:
# Plot performance of the two languages

def plot_countries(country_group):
    """Show one bar chart per requested country: English vs. original-text scores.

    Reads the notebook-level ``file_list``, ``text_en_result_list``,
    ``text_ori_result_list`` and ``stats``, which are zipped positionally
    (entry i of each list must belong to ``file_list[i]``).

    Args:
        country_group: Collection of country codes (e.g. ``['ba', 'hr', 'rs']``).
            Files whose code is not in this collection are skipped.
    """
    for file, en, ori, stat in zip(file_list, text_en_result_list, text_ori_result_list, stats):
        country_code = file.stem.replace('power-', '').replace('-train', '')

        # Filter before touching mapping_dict or building any data, so a file
        # whose code is missing from mapping_dict cannot raise KeyError unless
        # that country was actually requested.
        if country_code not in country_group:
            continue

        country_name = mapping_dict[country_code]

        # stat is (positive count, datapoints, positive share); each result is
        # unpacked into the precision / recall / f1 columns.
        data = [
            ('en', stat[2], *en),
            ('ori', stat[2], *ori)
        ]

        results_df = pd.DataFrame(data, columns=["language", "positive_pct", "precision", "recall", 'f1']).melt(id_vars="language")

        result_chart = alt.Chart(results_df).mark_bar().encode(
            x = alt.X('variable:N', axis = alt.Axis(title = '', labels = False, ticks = False), sort = None, ),
            y = alt.Y('value:Q', axis = alt.Axis(title = 'Score'), scale=alt.Scale(domain=(0, 1))),
            column=alt.Column('language:N', title='Language', sort = None),
            color=alt.Color('variable:N', scale=alt.Scale(scheme='category20'), title='Evaluation Metric', sort = None)
        ).properties(
            width=200,
            height=300,
            title = f"{country_code} - {country_name} - {stat[1]} datapoints"
        )

        result_chart.show()

### Result summary
- Austria: lots of data, good precision, low recall. Worse performance on original language
- Bosnia: Not much data, good results overall
- Belgium: medium data, bad result
- Bulgaria: medium data, bad
- Czechia: medium data, Bad results
- Denmark: medium data, bad result
- Catalonia: less data, result quite good, precision < recall. EN and ORI are comparable
- Galicia: same as Catalonia
- Basque: Same with others, but worse result using EN
- Spain: Medium-large data, everything is balanced around 0.7 - 0.8, similar performance in both languages. Lots of positive labels
- Finland: medium data, overall bad performance, lower performance in ORI
- France: medium-large data, very low recall -> cannot capture the positive class. Similar performance in both languages
- GB: large data, balanced results
- Greece: Medium data, good result on en text
- Croatia: Large data, bad results on both. Slightly better in English
- Hungary: less data, quite good results on EN
- Italy: Medium data, good precision, bad recall, low positive percentage
- Latvia: Not much data, bad result
- Netherlands: medium data, slight class imbalance, overall bad result
- Poland: class balance, result is bad / so-so
- Portugal: medium data, Slight class imbalance, high precision but low recall
- Serbia: large data, high class imbalance, high precision, low recall
- Slovenia: Medium-large data ,high class imbalance, high precision, low recall
- Turkey: Large data, balanced classes, good overall result on English
- Ukraine: medium-large data, high class imbalance, high precision, low recall

- Low positive percentage: good precision, bad recall (recall = true positives / all actual positives) -> the model can get most examples right by predicting almost everything as negative, so it misses many true positives -> this is the effect of class imbalance

- Less data = better result?

# Experiment: Test by country groups

- Balkan
  - Bosnia and Herzegovina (ba)
  - Croatia (hr)
  - Serbia (rs)

- Diff
  - Greece (gr)
  - Bulgaria (bg)

- Spanish
  - Spain (es)
  - Catalonia (es-ct)
  - Galicia (es-ga)
  - Basque Country (es-pv) [only power]

- Nordic
  - Denmark (dk) 
  - Finland (fi)
  - Iceland (is) [only political orientation] 
  - Norway (no) [only political orientation] 
  - Sweden (se) [only political orientation] 

- Slavic
  - Poland (pl)
  - Ukraine (ua)
  - Czechia (cz)
  - Serbia (rs)
  - Slovenia (si)

- West Germanic
  - Austria (at)
  - Great Britain (gb)
  - The Netherlands (nl)
  - Norway (no) [only political orientation] 
  - Sweden (se) [only political orientation] 
  - Belgium (be)


- Romance
  - France (fr)
  - Portugal (pt)
  - Italy (it)

- Uralic
  - Estonia (ee) 
  - Hungary (hu)

- Baltic
  - Latvia (lv)
  - Lithuania


- Turkic
  - Turkey (tr)

In [6]:
# Balkan group overview: for each country print its code, name, number of
# examples and the first 100 characters of the first original-language text.
balkans = ['ba', 'hr', 'rs']
parent_dir = Path("data/train/power")

for code in balkans:
    full_data = load_data(folder_path=parent_dir, file_list=[f"power-{code}-train.tsv"],text_head="text")
    row = (
        code, "\t",
        mapping_dict[code], "\t",
        len(full_data), "\t",
        full_data.texts[0][:100],
    )
    print(*row)

ba 	 Bosnia and Herzegovina 	 2531 	 Zahvaljujem gospodo predsjedavajući, dame i gospodo, Sa žaljenjem mogu konstatovati da na sjednici K
hr 	 Croatia 	 10741 	 Gospodine predsjedniče, uvaženi kolega zastupnik Leko i kolega zastupnik Arlović iznosili su neke ar
rs 	 Serbia 	 15114 	 Dame i gospodo, dozvolite da u ime našeg izbornog tela prenesem pozdrave i želju da ova skupština po


In [11]:
# Chart the English vs. original-text scores for the Balkan group (ba, hr, rs).
plot_countries(balkans)

In [12]:
# Try to handle Croatia differently
# How to deal with class imbalance

- Training and testing on the original text seems to have a good effect
- Using character-level tokens seems to work better. A vocabulary of 50000 tokens provides diminishing returns.
- Still need to deal with imbalanced labels

In [3]:
# POC: one joint model for the Balkan group (ba, rs, hr), trained on the
# original-language text with character n-gram TF-IDF features.
#%%
balkan_file_list = [
    'power-ba-train.tsv',
    'power-rs-train.tsv',
    'power-hr-train.tsv',
]

balkan_data = load_data(folder_path="data/train/power/", file_list=balkan_file_list,text_head='text')
# Hold out 20% for evaluation; fixed random_state for a stable split.
train_raw, test_raw = split_data(balkan_data, test_size=0.2, random_state=0)

print(len(train_raw), len(test_raw))
print("Percentage of positive:", sum(train_raw.labels) / len(train_raw), sum(test_raw.labels) / len(test_raw))

print("Prepare data encoder...")
# Alternative encoder (shorter character n-grams, sublinear tf):
# train_encoder = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(1,3))
train_encoder = TfidfVectorizer(max_features=50000, analyzer="char", ngram_range=(3,5))
# Fit on the training split only so no test vocabulary leaks into the features.
train_encoder.fit(train_raw.texts)
print("Vocabulary", len(train_encoder.vocabulary_))

print("Prepare data...")
train_dataset = encode_data(train_raw, train_encoder)
test_dataset = encode_data(test_raw, train_encoder)

print("Train model")
models_dir = Path('models')
# exist_ok=True already makes this a no-op when the directory is present.
models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

# Seed torch's global RNG so the shuffled batch order (and presumably the
# model's weight initialisation) is repeatable across runs.
torch.manual_seed(0)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Use the GPU when there is one, otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_nn_balkan = NeuralNetwork(
    input_size=len(train_encoder.vocabulary_),
    hidden_size=128,
    device=device
)

# Set to True to reuse a previously saved model instead of retraining.
USE_CACHE = False

if USE_CACHE and (models_dir / 'model_nn_balkan.pt').exists():
    model_nn_balkan = load_model(model_nn_balkan, 'model_nn_balkan')
else:
    model_nn_balkan.fit(train_dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn_balkan, "model_nn_balkan")

model_nn_balkan_results = evaluate_nn_model(model_nn_balkan, test_dataset)
np.save(models_dir / 'model_nn_balkan_results.npy', model_nn_balkan_results)
print(model_nn_balkan_results)

# Move the model to CPU before plotting (presumably so plot_results works on CPU data).
model_nn_balkan.cpu()

# Plot training accuracy and loss side-by-side
plot_results(model_nn_balkan, train_config, train_dataloader)

22672 5714
Percentage of positive: 0.3698394495412844 0.364193209660483
Prepare data encoder...
Vocabulary 50000
Prepare data...
Train model


Epoch 1: 100%|██████████| 178/178 [00:02<00:00, 70.13batch/s, batch_accuracy=0.875, loss=16.3]
Epoch 2: 100%|██████████| 178/178 [00:02<00:00, 70.81batch/s, batch_accuracy=0.625, loss=11.5]
Epoch 3: 100%|██████████| 178/178 [00:02<00:00, 68.61batch/s, batch_accuracy=0.812, loss=9.81]
Epoch 4: 100%|██████████| 178/178 [00:02<00:00, 69.61batch/s, batch_accuracy=0.938, loss=6.31]
Epoch 5: 100%|██████████| 178/178 [00:02<00:00, 67.86batch/s, batch_accuracy=1, loss=23.4]   
Epoch 6: 100%|██████████| 178/178 [00:02<00:00, 69.49batch/s, batch_accuracy=1, loss=5.69]   
Epoch 7: 100%|██████████| 178/178 [00:02<00:00, 72.44batch/s, batch_accuracy=1, loss=8.9]    
Epoch 8: 100%|██████████| 178/178 [00:02<00:00, 69.93batch/s, batch_accuracy=1, loss=16.7]   
Epoch 9: 100%|██████████| 178/178 [00:02<00:00, 70.51batch/s, batch_accuracy=1, loss=1.49]   
Epoch 10: 100%|██████████| 178/178 [00:02<00:00, 70.12batch/s, batch_accuracy=1, loss=19.8]   


(0.7394120097160339, 0.6516097784042358, 0.6455605626106262)
