In [12]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import LogisticRegression
import time

from models import NeuralNetwork, TrainConfig, evaluate, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

# Report the compute device(s) torch can see: one "Device: cuda" line plus the
# device name per visible GPU, or a single "Device: cpu" line when CUDA is unavailable.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    for device_index in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(device_index))

Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU


# Prepare

In [2]:
# Proof of concept on the GB dataset (English text column): TF-IDF features
# fed into the project's NeuralNetwork, evaluated on a 20% hold-out split.
data = load_data(folder_path="data/train/power/", file_list=['power-gb-train.tsv'],text_head='text_en')
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)


print("Prepare data encoder...")
# The vocabulary is fitted on the training texts only; the test split is
# merely transformed with it.
tfidf_encoder = TfidfVectorizer(max_features=50000)
tfidf_encoder.fit(train_raw.texts)

# POC
print("Prepare data...")
train_data_nn = encode_data(train_raw, tfidf_encoder)
test_data_nn = encode_data(test_raw, tfidf_encoder)

print("Train model")
models_dir = Path('models/gb')
# exist_ok=True already makes this a no-op when the directory is present,
# so no separate exists() check is needed.
models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

# Set to True to reuse a previously saved model instead of retraining.
USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    # Fall back to the CPU when no GPU is visible instead of hard-coding 'cuda'.
    # NOTE(review): assumes NeuralNetwork accepts any torch device string — confirm.
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


# Inference only: stack the encoded test samples into single tensors on the
# model's device and skip gradient tracking.
with torch.no_grad():
    X_test = torch.stack([sample[0] for sample in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([sample[1] for sample in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)
    y_prob = model_nn.forward(X_test)


result_nn = evaluate(y_test, y_pred, y_prob)

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Prepare data encoder...
Prepare data...
Train model



Epoch 1: 100%|██████████| 209/209 [00:12<00:00, 16.71batch/s, batch_accuracy=0.571, loss=0.563]
Epoch 2: 100%|██████████| 209/209 [00:11<00:00, 17.45batch/s, batch_accuracy=1, loss=0.372]    
Epoch 3: 100%|██████████| 209/209 [00:11<00:00, 17.54batch/s, batch_accuracy=1, loss=0.311]    
Epoch 4: 100%|██████████| 209/209 [00:12<00:00, 17.13batch/s, batch_accuracy=1, loss=0.0111]    
Epoch 5: 100%|██████████| 209/209 [00:12<00:00, 16.17batch/s, batch_accuracy=1, loss=0.0256]    
Epoch 6: 100%|██████████| 209/209 [00:12<00:00, 16.72batch/s, batch_accuracy=1, loss=0.00426]   
Epoch 7: 100%|██████████| 209/209 [00:12<00:00, 16.91batch/s, batch_accuracy=1, loss=0.000161]  
Epoch 8: 100%|██████████| 209/209 [00:11<00:00, 17.69batch/s, batch_accuracy=1, loss=0.000172]  
Epoch 9: 100%|██████████| 209/209 [00:11<00:00, 18.28batch/s, batch_accuracy=1, loss=0.000407]  
Epoch 10: 100%|██████████| 209/209 [00:11<00:00, 17.61batch/s, batch_accuracy=1, loss=3.66e-6]   


Accuracy: 0.7025, Precision: 0.7226, Recall: 0.7587, F1: 0.7402, AUC: 0.7732


In [90]:

def mass_run_models(file_name: str, use_cache: bool = False):
    """Train and evaluate a neural network and a logistic regression on one dataset file.

    Both models share the same character n-gram TF-IDF features (3- to 5-grams,
    at most 50000 features) fitted on the training split of `file_name`.

    Args:
        file_name: Name of a file inside ``data/train/power/``; its ``text``
            column is used as model input.
        use_cache: When True and a saved neural network for this file exists
            under ``models/``, load it instead of retraining. In that case the
            reported neural network ``train_time`` is NaN because nothing was
            trained.

    Returns:
        ``{file_name: {"neural_network": metrics, "logreg": metrics}}`` where
        each ``metrics`` dict is whatever ``evaluate`` returns, extended with
        ``train_time`` and ``pred_time`` (seconds) and ``n_samples`` (size of
        the test split).
    """
    data = load_data(folder_path="data/train/power/", file_list=[file_name],text_head="text")
    train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)

    # Fit the encoder on the training texts only; the test split is just transformed.
    chars_encoder = TfidfVectorizer(max_features=50000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
    chars_encoder.fit(train_raw.texts)

    # ---- Neural network ----
    train_data_nn = encode_data(train_raw, chars_encoder)
    test_data_nn = encode_data(test_raw, chars_encoder)

    models_dir = Path("models/")
    # Make sure the target folder exists before a model is saved into it.
    models_dir.mkdir(parents=True, exist_ok=True)
    dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

    model_nn = NeuralNetwork(
        input_size=len(chars_encoder.vocabulary_),
        hidden_size=512,
        n_linear_layers=3,
        # Fall back to the CPU when no GPU is visible instead of hard-coding "cuda".
        # NOTE(review): assumes NeuralNetwork accepts any torch device string — confirm.
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    train_config = TrainConfig(
        num_epochs      = 10,
        early_stop      = False,
        violation_limit = 5,
        # optimizer_params={"weight_decay": 0.9}
    )

    model_name = f"model_nn_{file_name}"

    # Defined up front so the cached branch cannot leave it unbound
    # (previously a NameError once the cache was actually used).
    nn_train_time = float("nan")
    if use_cache and (models_dir / f"{model_name}.pt").exists():
        model_nn = load_model(model_nn, models_dir, model_name)
    else:
        # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
        nn_train_t0 = time.perf_counter()
        model_nn.fit(dataloader, train_config, disable_progress_bar=True)
        nn_train_time = time.perf_counter() - nn_train_t0
        save_model(model_nn, models_dir, model_name)

    with torch.no_grad():
        X_test = torch.stack([sample[0] for sample in test_data_nn]).cpu()
        y_test = torch.stack([sample[1] for sample in test_data_nn]).cpu()
        # Only predict() is timed; forward() is an extra pass used for the scores passed to evaluate().
        nn_pred_t0 = time.perf_counter()
        y_pred = model_nn.predict(X_test).cpu()
        nn_pred_time = time.perf_counter() - nn_pred_t0
        y_prob = model_nn.forward(X_test).cpu()

    print(f"NN train time: {nn_train_time}, pred time: {nn_pred_time}")
    result_nn = evaluate(y_test, y_pred, y_prob)
    result_nn.update({"train_time": nn_train_time, "pred_time": nn_pred_time ,"n_samples": len(X_test)})

    # Move the network off the GPU and release cached GPU memory before the next model / dataset.
    model_nn.cpu()
    torch.cuda.empty_cache()

    # ---- Logistic regression ----
    X_train_skl = chars_encoder.transform(train_raw.texts)
    X_test_skl = chars_encoder.transform(test_raw.texts)

    model_logreg = LogisticRegression()
    logreg_train_t0 = time.perf_counter()
    model_logreg.fit(X_train_skl, train_raw.labels)
    logreg_train_time = time.perf_counter() - logreg_train_t0

    # As for the network, only predict() is timed; predict_proba() feeds the AUC.
    logreg_pred_t0 = time.perf_counter()
    pred_logreg = model_logreg.predict(X_test_skl)
    logreg_pred_time = time.perf_counter() - logreg_pred_t0
    prob_logreg = model_logreg.predict_proba(X_test_skl)

    print(f"Logreg train time: {logreg_train_time}, pred time: {logreg_pred_time}")
    # Column 1 of predict_proba holds the probability of the second class in classes_.
    result_logreg =  evaluate(test_raw.labels, pred_logreg, prob_logreg[:, 1])
    # Same "n_samples" key as the neural network entry (was misspelled "n_sample").
    result_logreg.update({"train_time": logreg_train_time, "pred_time": logreg_pred_time, "n_samples": X_test_skl.shape[0]})

    return {file_name: {"neural_network": result_nn, "logreg": result_logreg}}

In [94]:
import copy

# Run both models on every entry of the training folder and collect the
# per-file metrics in one dictionary keyed by file name.
data_dir = Path("data/train/power/")
file_list = [entry.name for entry in data_dir.glob("*")]

all_results = {}
for file_name in file_list:
    print("=====", file_name)
    all_results.update(mass_run_models(file_name))

all_results

{'power-at-train.tsv': {'neural_network': {'accuracy': tensor(0.7923),
   'precision': tensor(0.6775),
   'recall': tensor(0.7028),
   'f1': tensor(0.6899),
   'auc': 0.8599337026027076,
   'train_time': 28.403456983,
   'pred_time': 0.168883082,
   'n_samples': 3255},
  'logreg': {'accuracy': 0.8043010752688172,
   'precision': 0.6962828649138713,
   'recall': 0.7177570093457943,
   'f1': 0.7068568798895536,
   'auc': 0.8682606557026455,
   'train_time': 8.909597263,
   'pred_time': 0.025267002,
   'n_sample': 3255}},
 'power-ba-train.tsv': {'neural_network': {'accuracy': tensor(0.8251),
   'precision': tensor(0.9087),
   'recall': tensor(0.8940),
   'f1': tensor(0.9013),
   'auc': 0.630184331797235,
   'train_time': 4.648984209,
   'pred_time': 0.0218023,
   'n_samples': 486},
  'logreg': {'accuracy': 0.8930041152263375,
   'precision': 0.8930041152263375,
   'recall': 1.0,
   'f1': 0.9434782608695652,
   'auc': 0.6422811059907834,
   'train_time': 2.31956522,
   'pred_time': 0.00259

In [121]:
import json
import torch

# Tensors cannot be written by json.dump: unwrap every tensor-valued metric of
# the neural network into a plain Python number (in place), then save everything.
for per_model in all_results.values():
    nn_metrics = per_model.get('neural_network', {})
    for metric_name, metric_value in list(nn_metrics.items()):
        if isinstance(metric_value, torch.Tensor):
            nn_metrics[metric_name] = metric_value.item()

with open("all_results.json", "w") as out_file:
    json.dump(all_results, out_file)

In [5]:
import pandas as pd
# Count the rows of every dataset file in the training folder.
data_dir = Path("data/train/power/")
file_list = [entry.name for entry in data_dir.glob("*")]

# Each file is loaded once and only its length is kept; the counts are in the
# same order as file_list.
data_points = [
    len(load_data(folder_path="data/train/power/", file_list=[name], text_head="text"))
    for name in file_list
]



In [13]:
# Overview table: one row per dataset file with the country it maps to and its size.
# The country code is what remains of the file stem once 'power-' and '-train' are removed.
country_codes = [
    Path(name).stem.replace('power-', '').replace('-train', '')
    for name in file_list
]

df = pd.DataFrame({
    "dataset": file_list,
    "country": [mapping_dict[code] for code in country_codes],
    "data_points": data_points,
})

df

Unnamed: 0,dataset,country,data_points
0,power-at-train.tsv,Austria,15971
1,power-ba-train.tsv,Bosnia and Herzegovina,2531
2,power-be-train.tsv,Belgium,4765
3,power-bg-train.tsv,Bulgaria,6699
4,power-cz-train.tsv,Czechia,6744
5,power-dk-train.tsv,Denmark,5493
6,power-es-ct-train.tsv,Catalonia,1525
7,power-es-ga-train.tsv,Galicia,953
8,power-es-pv-train.tsv,Basque Country,1031
9,power-es-train.tsv,Spain,7198


In [6]:
import json

# Reload the metrics saved by an earlier run; the context manager closes the
# file handle, which the bare open() call left dangling.
with open("all_results.json", "r") as results_file:
    all_results = json.load(results_file)

In [18]:

# Efficiency table: one row per dataset with its sizes and the train / prediction
# timings of both models, written to results_efficiency.csv.
# Quality metrics (accuracy / f1 / auc) are currently left out of this table.

# Look the dataset size up by file name instead of by position: `data_points`
# follows `file_list` (a directory listing) while `all_results` follows the
# order stored in the JSON file, and nothing guarantees those orders match.
# A file missing from `file_list` now fails loudly with a KeyError.
points_by_file = dict(zip(file_list, data_points))


def _rounded_column(model: str, field: str) -> list:
    """Return `field` of `model` for every dataset in `all_results`, rounded to 4 decimals."""
    return [round(values[model][field], 4) for values in all_results.values()]


result_df = pd.DataFrame({
    "dataset": list(all_results.keys()),
    "total_data_size": [points_by_file[name] for name in all_results],
    "test_size": _rounded_column("neural_network", "n_samples"),
    "train_time_nn": _rounded_column("neural_network", "train_time"),
    "train_time_logreg": _rounded_column("logreg", "train_time"),
    "pred_time_nn": _rounded_column("neural_network", "pred_time"),
    "pred_time_logreg": _rounded_column("logreg", "pred_time"),
})
result_df.to_csv("results_efficiency.csv", index=False)

In [101]:
# Plot the performance metrics (accuracy, F1, AUC) of the two models for one dataset

def plot_performance(file_name: str, all_results: dict):
    """Show a faceted bar chart of accuracy, F1 and AUC for both models on one dataset.

    Args:
        file_name: Dataset file name; used as the key into `all_results` and to
            derive the country code shown in the chart title.
        all_results: Mapping ``{file_name: {"neural_network": metrics, "logreg": metrics}}``
            as produced by `mass_run_models` or reloaded from ``all_results.json``.
            Metric values may be 0-d torch tensors, numpy scalars or plain floats.
    """
    # Loaded only to report the number of datapoints in the title.
    data = load_data(folder_path="data/train/power/", file_list=[file_name], text_head='text_en')

    country_code = Path(file_name).stem.replace('power-', '').replace('-train', '')
    country_name = mapping_dict[country_code]

    model_names = ['neural_network', 'logreg']

    def _score(model: str, metric: str) -> float:
        # float() handles 0-d tensors, numpy scalars and plain floats alike.
        # Calling .item() broke once the metrics had been converted to Python
        # floats for the JSON dump or reloaded from the JSON file.
        return round(float(all_results[file_name][model][metric]), 4)

    # Long format (model, variable, value) so each metric becomes its own bar.
    plot_df = pd.DataFrame({
        "model": model_names,
        "accuracy": [_score(model, "accuracy") for model in model_names],
        "f1": [_score(model, "f1") for model in model_names],
        "auc": [_score(model, "auc") for model in model_names],
    }).melt(id_vars="model")

    bar_chart = alt.Chart().mark_bar().encode(
        color=alt.Color('variable:N', scale=alt.Scale(scheme='category20'), title='Evaluation Metric', sort = None)
    ).properties(width = 150, height = 300)

    # Value labels drawn slightly above each bar.
    text = alt.Chart().mark_text(
        align='center',
        yOffset=-10
    ).encode(
        text=alt.Text('value:Q'),
    ).properties(width = 150, height = 300)

    # Bars and labels share the x/y encodings; one facet column per model.
    full_chart = alt.layer(bar_chart, text, data=plot_df).encode(
        x = alt.X('variable:N', axis = alt.Axis(title = '', labels = False, ticks = False), sort = None, ),
        y = alt.Y('value:Q', axis = alt.Axis(title = 'Score'), scale=alt.Scale(domain=(0, 1))),
    ).facet(
        column=alt.Column('model:N', title='Model', sort = None),
    ).properties(
        title = f"{country_code} - {country_name} - {len(data)} datapoints"
    )

    full_chart.show()

In [102]:
# Draw the metric comparison chart for every dataset that has results.
for dataset_file in all_results:
    plot_performance(dataset_file, all_results)

In [113]:
# Plot the timing (efficiency) of the two models for one dataset

def plot_efficiency(file_name: str, all_results: dict, metric: str = "pred_time"):
    """Show a faceted bar chart comparing one timing of both models on one dataset.

    Args:
        file_name: Dataset file name; used as the key into `all_results` and to
            derive the country code shown in the chart title.
        all_results: Mapping ``{file_name: {"neural_network": metrics, "logreg": metrics}}``
            where each ``metrics`` dict holds ``train_time`` and ``pred_time``
            in seconds, and the neural network entry also holds ``n_samples``
            (size of the test split).
        metric: Which timing to plot, ``"pred_time"`` (default) or ``"train_time"``.

    Raises:
        ValueError: If `metric` is not one of the two supported timings.
    """
    if metric not in ("train_time", "pred_time"):
        raise ValueError(f"metric must be 'train_time' or 'pred_time', got {metric!r}")

    test_size = all_results[file_name]["neural_network"]["n_samples"]

    # The title reports the number of samples the plotted timing was measured on:
    # prediction runs on the test split, training on the remaining rows.
    # The title used to show the training size while the prediction time was plotted.
    if metric == "train_time":
        data = load_data(folder_path="data/train/power/", file_list=[file_name], text_head='text_en')
        n_datapoints = len(data) - test_size
    else:
        n_datapoints = test_size

    country_code = Path(file_name).stem.replace('power-', '').replace('-train', '')
    country_name = mapping_dict[country_code]

    model_names = ['neural_network', 'logreg']

    # Long format (model, variable, value), matching the layout used by plot_performance.
    plot_df = pd.DataFrame({
        "model": model_names,
        metric: [round(all_results[file_name][model][metric], 4) for model in model_names],
    }).melt(id_vars="model")

    bar_chart = alt.Chart().mark_bar().encode(
        color=alt.Color('variable:N', scale=alt.Scale(scheme='category20'), title='Evaluation Metric', sort = None)
    ).properties(width = 150, height = 300)

    # Value labels drawn slightly above each bar.
    text = alt.Chart().mark_text(
        align='center',
        yOffset=-10
    ).encode(
        text=alt.Text('value:Q'),
    ).properties(width = 150, height = 300)

    # Bars and labels share the x/y encodings; one facet column per model.
    full_chart = alt.layer(bar_chart, text, data=plot_df).encode(
        x = alt.X('variable:N', axis = alt.Axis(title = '', labels = False, ticks = False), sort = None, ),
        y = alt.Y('value:Q', axis = alt.Axis(title = 'Time (seconds)')),
    ).facet(
        column=alt.Column('model:N', title='Model', sort = None),
    ).properties(
        title = f"{country_code} - {country_name} - {n_datapoints} datapoints"
    )

    full_chart.show()

# Draw the timing comparison chart for every dataset that has results.
for dataset_file in all_results:
    plot_efficiency(dataset_file, all_results)
    