In [6]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import SGDClassifier

from models import NeuralNetwork, TrainConfig, evaluate_nn_model, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

# Report which compute device(s) PyTorch can see on this machine.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    gpu_count = torch.cuda.device_count()
    for gpu_index in range(gpu_count):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU


# Load data

In [2]:
# The three training TSV files that are loaded together for this notebook.
balkan_file_list = ['power-ba-train.tsv', 'power-hr-train.tsv', 'power-rs-train.tsv']

# Read the files from the training folder; 'text' is passed as the text column header.
data = load_data(
    folder_path="data/train/power/",
    file_list=balkan_file_list,
    text_head='text',
)

# Fixed random_state so the train/test partition is the same on every run.
# NOTE(review): presumably an 80/20 split given test_size=0.2 -- confirm in utils.split_data.
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)


# Encode

In [3]:

# Fit the TF-IDF vectorizer on the training texts only (at most 50,000 features).
print("Prepare data encoder...")
tfidf_encoder = TfidfVectorizer(max_features=50000).fit(train_raw.texts)

# Encode both splits with the same fitted vectorizer for the neural network.
print("Prepare data...")
train_data_nn, test_data_nn = (
    encode_data(split, tfidf_encoder) for split in (train_raw, test_raw)
)

Prepare data encoder...
Prepare data...


# Neural Networks

In [4]:
# POC: train NeuralNetwork on the tfidf_encoder features and score it on the test split.

print("Train model")

# Folder the trained model is saved to (and optionally loaded from).
models_dir = Path('models/hr')
if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(num_epochs=10, early_stop=False, violation_limit=5)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

# Set to True to load a previously saved model instead of retraining.
USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu',
)

has_saved_model = (models_dir / 'model_nn.pt').exists()
if not (has_saved_model and USE_CACHE):
    # No usable cache: train from scratch and persist the result.
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")
else:
    model_nn = load_model(model_nn, models_dir, 'model_nn')

# Stack the encoded test samples into single tensors and predict without tracking gradients.
with torch.no_grad():
    eval_device = model_nn.device
    X_test = torch.stack([sample[0] for sample in test_data_nn]).to(eval_device)
    y_test = torch.stack([sample[1] for sample in test_data_nn]).to(eval_device)
    y_pred = model_nn.predict(X_test)

# (precision, recall, F-score, support) for the positive class, then AUC.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(prf_scores)
auc_score = roc_auc_score(y_test, y_pred)
print("AUC", auc_score)

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 178/178 [00:03<00:00, 45.86batch/s, batch_accuracy=0.841, loss=0.513]
Epoch 2: 100%|██████████| 178/178 [00:03<00:00, 47.35batch/s, batch_accuracy=0.89, loss=0.322] 
Epoch 3: 100%|██████████| 178/178 [00:03<00:00, 45.71batch/s, batch_accuracy=1, loss=0.0346]    
Epoch 4: 100%|██████████| 178/178 [00:03<00:00, 45.31batch/s, batch_accuracy=1, loss=0.0119]    
Epoch 5: 100%|██████████| 178/178 [00:03<00:00, 44.52batch/s, batch_accuracy=1, loss=0.000489]   
Epoch 6: 100%|██████████| 178/178 [00:03<00:00, 46.59batch/s, batch_accuracy=1, loss=0.000165]
Epoch 7: 100%|██████████| 178/178 [00:04<00:00, 44.33batch/s, batch_accuracy=1, loss=0.000128]
Epoch 8: 100%|██████████| 178/178 [00:03<00:00, 45.28batch/s, batch_accuracy=1, loss=6.22e-5] 
Epoch 9: 100%|██████████| 178/178 [00:03<00:00, 46.16batch/s, batch_accuracy=1, loss=7.74e-6]
Epoch 10: 100%|██████████| 178/178 [00:03<00:00, 46.41batch/s, batch_accuracy=1, loss=5.11e-6]


(0.6982421875, 0.6296785557023338, 0.6621903218337578, None)
AUC 0.7233379453074891


In [5]:
# Parameters finding: NeuralNetwork (hidden_size=128) trained with explicit optimizer settings.

print("Train model")

# Folder the trained model is saved to (and optionally loaded from).
models_dir = Path('models/hr')
if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

# Optimizer keyword arguments tried in this run.
optimizer_settings = {"lr": 0.0001, "weight_decay": 0.001}
train_config = TrainConfig(
    num_epochs=10,
    early_stop=False,
    violation_limit=5,
    optimizer_params=optimizer_settings,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

# Set to True to load a previously saved model instead of retraining.
USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu',
)

has_saved_model = (models_dir / 'model_nn.pt').exists()
if not (has_saved_model and USE_CACHE):
    # No usable cache: train from scratch and persist the result.
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")
else:
    model_nn = load_model(model_nn, models_dir, 'model_nn')

# Stack the encoded test samples into single tensors and predict without tracking gradients.
with torch.no_grad():
    eval_device = model_nn.device
    X_test = torch.stack([sample[0] for sample in test_data_nn]).to(eval_device)
    y_test = torch.stack([sample[1] for sample in test_data_nn]).to(eval_device)
    y_pred = model_nn.predict(X_test)

# (precision, recall, F-score, support) for the positive class, then AUC.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(prf_scores)
auc_score = roc_auc_score(y_test, y_pred)
print("AUC", auc_score)

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 178/178 [00:04<00:00, 40.90batch/s, batch_accuracy=0.829, loss=0.379]
Epoch 2: 100%|██████████| 178/178 [00:04<00:00, 43.08batch/s, batch_accuracy=0.854, loss=0.39] 
Epoch 3: 100%|██████████| 178/178 [00:03<00:00, 45.38batch/s, batch_accuracy=1, loss=0.0505]    
Epoch 4: 100%|██████████| 178/178 [00:03<00:00, 45.41batch/s, batch_accuracy=1, loss=0.00856]   
Epoch 5: 100%|██████████| 178/178 [00:03<00:00, 45.77batch/s, batch_accuracy=1, loss=0.000763]  
Epoch 6: 100%|██████████| 178/178 [00:03<00:00, 44.68batch/s, batch_accuracy=1, loss=0.000177]
Epoch 7: 100%|██████████| 178/178 [00:03<00:00, 45.26batch/s, batch_accuracy=1, loss=0.000114]
Epoch 8: 100%|██████████| 178/178 [00:03<00:00, 44.82batch/s, batch_accuracy=1, loss=6.93e-6]
Epoch 9: 100%|██████████| 178/178 [00:03<00:00, 46.40batch/s, batch_accuracy=1, loss=4.07e-6]
Epoch 10: 100%|██████████| 178/178 [00:04<00:00, 42.50batch/s, batch_accuracy=1, loss=1.85e-6]


(0.6993006993006993, 0.6164685160722149, 0.6552773227240815, None)
AUC 0.7191018920307773


In [6]:
# Drop out: NeuralNetwork with dropout=0.5 and a stronger weight decay.

print("Train model")

# Folder the trained model is saved to (and optionally loaded from).
models_dir = Path('models/hr')
if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

# Optimizer keyword arguments tried in this run.
optimizer_settings = {"lr": 0.001, "weight_decay": 0.01}
train_config = TrainConfig(
    num_epochs=10,
    early_stop=False,
    violation_limit=5,
    optimizer_params=optimizer_settings,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

# Set to True to load a previously saved model instead of retraining.
USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    dropout=0.5,
    device='cpu',
)

has_saved_model = (models_dir / 'model_nn.pt').exists()
if not (has_saved_model and USE_CACHE):
    # No usable cache: train from scratch and persist the result.
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")
else:
    model_nn = load_model(model_nn, models_dir, 'model_nn')

# Stack the encoded test samples into single tensors and predict without tracking gradients.
with torch.no_grad():
    eval_device = model_nn.device
    X_test = torch.stack([sample[0] for sample in test_data_nn]).to(eval_device)
    y_test = torch.stack([sample[1] for sample in test_data_nn]).to(eval_device)
    y_pred = model_nn.predict(X_test)

# (precision, recall, F-score, support) for the positive class, then AUC.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(prf_scores)
auc_score = roc_auc_score(y_test, y_pred)
print("AUC", auc_score)

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1:   0%|          | 0/178 [00:00<?, ?batch/s]

Epoch 1: 100%|██████████| 178/178 [00:04<00:00, 41.99batch/s, batch_accuracy=0.793, loss=0.503]
Epoch 2: 100%|██████████| 178/178 [00:04<00:00, 42.16batch/s, batch_accuracy=0.878, loss=0.294]
Epoch 3: 100%|██████████| 178/178 [00:03<00:00, 44.68batch/s, batch_accuracy=0.976, loss=0.0952]
Epoch 4: 100%|██████████| 178/178 [00:03<00:00, 46.00batch/s, batch_accuracy=1, loss=0.00309]   
Epoch 5: 100%|██████████| 178/178 [00:04<00:00, 42.29batch/s, batch_accuracy=1, loss=0.00049]   
Epoch 6: 100%|██████████| 178/178 [00:04<00:00, 44.43batch/s, batch_accuracy=1, loss=0.00024]   
Epoch 7: 100%|██████████| 178/178 [00:04<00:00, 43.30batch/s, batch_accuracy=1, loss=0.000525]
Epoch 8: 100%|██████████| 178/178 [00:03<00:00, 45.18batch/s, batch_accuracy=1, loss=0.000444]
Epoch 9: 100%|██████████| 178/178 [00:04<00:00, 44.34batch/s, batch_accuracy=1, loss=0.000269]  
Epoch 10: 100%|██████████| 178/178 [00:04<00:00, 44.14batch/s, batch_accuracy=1, loss=0.000201]


(0.6819457436856876, 0.6420079260237781, 0.6613744613290996, None)
AUC 0.7203228851321141


# Other classifiers

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

"Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers."

## SVC, SVM
Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

LinearSVC with TF-IDF features performed well on the balanced English data.

In [7]:
# LinearSVC on the TF-IDF features (LinearSVC is imported in the first cell).
X_train = tfidf_encoder.transform(train_raw.texts)
X_test_tfidf = tfidf_encoder.transform(test_raw.texts)

print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)

pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(X_test_tfidf)

print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
# Score against test_raw.labels, the same ground truth used for precision/recall,
# rather than the `y_test` tensor left behind by the neural-network cells: that
# variable does not exist on a fresh kernel and ties this cell to the NN encoding.
# AUC is computed from hard class predictions, as in the other cells of this notebook.
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))


Fit model




(0.7292759706190975, 0.6120651695288419, 0.6655494373952597, None)
AUC: 0.7296334139027094


## SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

In [8]:
# SGDClassifier on the TF-IDF training matrix `X_train` built in the LinearSVC cell
# (SGDClassifier is imported in the first cell).
# SGDClassifier shuffles the training data by default, so it is seeded here
# (same seed as the train/test split) to make the reported scores reproducible.
model_SGDClassifier_tfidf = SGDClassifier(random_state=0)
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))

# Last expression: the AUC (from hard class predictions) is shown as the cell output.
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


(0.790434235368156, 0.5530603258476442, 0.6507772020725389, None)


0.7272260468444618

## Naive Bayes

Overall bad performance, not worth pursuing

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the TF-IDF features; the sparse matrices are
# densified with .toarray() right at the call sites so no dense copy is kept around.
model_GaussianNB_tfidf = GaussianNB().fit(X_train.toarray(), train_raw.labels)

X_test_sparse = tfidf_encoder.transform(test_raw.texts)
pred_GaussianNB_tfidf = model_GaussianNB_tfidf.predict(X_test_sparse.toarray())

nb_scores = precision_recall_fscore_support(test_raw.labels, pred_GaussianNB_tfidf, average='binary')
print(nb_scores)
# Last expression: the AUC is shown as the cell output.
roc_auc_score(test_raw.labels, pred_GaussianNB_tfidf)

# Observations

- Neural network is still a good option
- sklearn's SGD is also good

# Test features

## Standard count vectors & scale
Performs poorly with both LinearSVC and SGD.


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Raw token counts followed by variance scaling; with_mean=False keeps the matrix sparse.
encoding_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('scaler', StandardScaler(with_mean=False))
])

# Fit on the training texts only, then encode each split once and reuse the result.
X_train = encoding_pipeline.fit_transform(train_raw.texts)
X_test_counts = encoding_pipeline.transform(test_raw.texts)


print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(X_test_counts)
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
# Score against test_raw.labels instead of the `y_test` tensor left over from the
# neural-network cells, which is undefined on a fresh kernel.
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))

# Seeded because SGDClassifier shuffles the training data by default.
model_SGDClassifier_tfidf = SGDClassifier(random_state=0)
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)
pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(X_test_counts)
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
# Last expression: the SGD AUC is shown as the cell output.
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


## word TFIDF

In [9]:
# Word n-gram (3 to 5 words) TF-IDF with sublinear term frequency, capped at 10,000 features.
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,5), max_features=10000)

# Fit on the training texts only; encode the test texts once and reuse for both models.
X_train = word_tfidf.fit_transform(train_raw.texts)
X_test_word = word_tfidf.transform(test_raw.texts)


print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(X_test_word)
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
# Score against test_raw.labels instead of the `y_test` tensor left over from the
# neural-network cells, which is undefined on a fresh kernel.
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))

print("SGDClassifier")
# Seeded because SGDClassifier shuffles the training data by default.
model_SGDClassifier_tfidf = SGDClassifier(random_state=0)
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(X_test_word)
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
# Last expression: the SGD AUC is shown as the cell output.
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.6410123966942148, 0.5464553060325847, 0.5899690991205134, None)
AUC: 0.6703256690068166
SGDClassifier
(0.7664526484751204, 0.420519594892118, 0.5430764856411715, None)


0.6671742185298611

Using more word TF-IDF features (50,000) improves the results by about 1%, but the transform takes much longer.

In [3]:
# Word n-gram (3 to 5 words) TF-IDF with the feature cap set to 50,000.
word_tfidf = TfidfVectorizer(
    sublinear_tf=True,
    analyzer="word",
    ngram_range=(3, 5),
    max_features=50000,
)

# Learn the vocabulary and build the training matrix in a single pass.
X_train = word_tfidf.fit_transform(train_raw.texts)


FileNotFoundError: [Errno 2] No such file or directory: 'models/tfidf/ngram_word_3to7_50000.npz'

In [7]:

# import scipy
# scipy.sparse.save_npz("models/tfidf/ngram_word_3to7_50000.npz", X_train)

# Evaluate LinearSVC and SGDClassifier on the 50,000-feature word TF-IDF matrix.
# The test texts are transformed once and shared by both models, because this
# transform is the slow step.
X_test_word = word_tfidf.transform(test_raw.texts)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(X_test_word)
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))

print("SGDClassifier")
# Seeded because SGDClassifier shuffles the training data by default.
model_SGDClassifier_tfidf = SGDClassifier(random_state=0)
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(X_test_word)
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
# Last expression: the SGD AUC is shown as the cell output.
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC
(0.6931690929451287, 0.5451343020695729, 0.6103031796894257, None)
AUC: 0.6914300470963796
SGDClassifier
(0.8221343873517787, 0.3663584324086306, 0.5068534876637222, None)


0.6565283426479044

## Char n-gram TF-IDF

In [8]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is available as an attribute.
import scipy.sparse

# Character n-gram (3 to 7 characters) TF-IDF with sublinear term frequency, capped at 50,000 features.
char_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(3,7), max_features=50000)

X_train = char_tfidf.fit_transform(train_raw.texts)

# Save the training matrix to disk. The target folder is created first so the
# save does not fail with FileNotFoundError on a fresh checkout.
CHAR_TFIDF_CACHE = "models/tfidf/ngram_char_3to7_50000.npz"
Path(CHAR_TFIDF_CACHE).parent.mkdir(parents=True, exist_ok=True)
scipy.sparse.save_npz(CHAR_TFIDF_CACHE, X_train)


LinearSVC
(0.7396907216494846, 0.6318802289740203, 0.681548325813346, None)


NameError: name 'y_test' is not defined

In [9]:

# Evaluate LinearSVC and SGDClassifier on the char n-gram TF-IDF matrix.
# The test texts are transformed once and shared by both models.
X_test_char = char_tfidf.transform(test_raw.texts)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(X_test_char)
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))

print("SGDClassifier")
# Seeded because SGDClassifier shuffles the training data by default.
model_SGDClassifier_tfidf = SGDClassifier(random_state=0)
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(X_test_char)
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
# Last expression: the SGD AUC is shown as the cell output.
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC
(0.7396907216494846, 0.6318802289740203, 0.681548325813346, None)
AUC: 1.0
SGDClassifier
(0.7989661114302126, 0.6125055041831792, 0.693419740777667, None)


0.7544316090652349

In [10]:
# Recompute the LinearSVC AUC from the current predictions and print it.
auc_LinearSVC_tfidf = roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf)
print("AUC:", auc_LinearSVC_tfidf)


AUC: 0.7411696081204127


In [11]:
# Test char tfidf feature on NN
train_data_nn = encode_data(train_raw, char_tfidf)
test_data_nn = encode_data(test_raw, char_tfidf)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)


NameError: name 'models_dir' is not defined

In [12]:

# Train NeuralNetwork on the char n-gram TF-IDF features and score it on the test split.

# Set to True to load a previously saved model instead of retraining.
USE_CACHE = False
# Folder the trained model is saved to (and optionally loaded from).
models_dir = Path("models")

model_nn = NeuralNetwork(
    input_size=len(char_tfidf.vocabulary_),
    hidden_size=128,
    device='cpu',
)

has_saved_model = (models_dir / 'model_nn.pt').exists()
if not (has_saved_model and USE_CACHE):
    # No usable cache: train from scratch and persist the result.
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")
else:
    model_nn = load_model(model_nn, models_dir, 'model_nn')

# Stack the encoded test samples into single tensors and predict without tracking gradients.
with torch.no_grad():
    eval_device = model_nn.device
    X_test = torch.stack([sample[0] for sample in test_data_nn]).to(eval_device)
    y_test = torch.stack([sample[1] for sample in test_data_nn]).to(eval_device)
    y_pred = model_nn.predict(X_test)

# (precision, recall, F-score, support) for the positive class, then AUC.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(prf_scores)
auc_score = roc_auc_score(y_test, y_pred)
print("AUC", auc_score)

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)




Epoch 1: 100%|██████████| 178/178 [00:08<00:00, 19.83batch/s, batch_accuracy=0.768, loss=0.474]
Epoch 2: 100%|██████████| 178/178 [00:08<00:00, 20.10batch/s, batch_accuracy=0.829, loss=0.37] 
Epoch 3: 100%|██████████| 178/178 [00:08<00:00, 20.51batch/s, batch_accuracy=0.866, loss=0.353]
Epoch 4: 100%|██████████| 178/178 [00:09<00:00, 19.05batch/s, batch_accuracy=0.951, loss=0.222]
Epoch 5: 100%|██████████| 178/178 [00:08<00:00, 20.49batch/s, batch_accuracy=0.963, loss=0.149] 
Epoch 6: 100%|██████████| 178/178 [00:09<00:00, 19.18batch/s, batch_accuracy=1, loss=0.0364]    
Epoch 7: 100%|██████████| 178/178 [00:09<00:00, 18.84batch/s, batch_accuracy=1, loss=0.0102]    
Epoch 8: 100%|██████████| 178/178 [00:09<00:00, 18.27batch/s, batch_accuracy=1, loss=0.00124]   
Epoch 9: 100%|██████████| 178/178 [00:08<00:00, 20.36batch/s, batch_accuracy=1, loss=0.000476]
Epoch 10: 100%|██████████| 178/178 [00:08<00:00, 20.40batch/s, batch_accuracy=1, loss=0.000152]


(0.7087424029920524, 0.6675473359753412, 0.6875283446712018, None)
AUC 0.7415320334007591
