In [1]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import SGDClassifier

from models import NeuralNetwork, TrainConfig, save_model, load_model, plot_results, evaluate
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(i))
else:
    print("Device: cpu")

Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU


# Load data

In [2]:
balkan_file_list = [
    'power-es-train.tsv',
    'power-es-ct-train.tsv',
    'power-es-ga-train.tsv',
    'power-es-pv-train.tsv'
]
data = load_data(folder_path="data/train/power/", file_list=balkan_file_list,text_head='text')
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)


# Encode

In [4]:

print("Prepare data encoder...")
tfidf_encoder = TfidfVectorizer(max_features=50000)
tfidf_encoder.fit(train_raw.texts)

print("Prepare data...")
train_data_nn = encode_data(train_raw, tfidf_encoder)
test_data_nn = encode_data(test_raw, tfidf_encoder)

Prepare data encoder...
Prepare data...


# Neural Networks

In [5]:
# POC

print("Train model")
models_dir = Path('models/es')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cuda'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)



Train model



Epoch 1: 100%|██████████| 68/68 [00:01<00:00, 55.83batch/s, batch_accuracy=0.692, loss=0.573]
Epoch 2: 100%|██████████| 68/68 [00:01<00:00, 65.83batch/s, batch_accuracy=0.925, loss=0.238]
Epoch 3: 100%|██████████| 68/68 [00:01<00:00, 67.35batch/s, batch_accuracy=0.963, loss=0.108] 
Epoch 4: 100%|██████████| 68/68 [00:01<00:00, 65.93batch/s, batch_accuracy=1, loss=0.0037]    
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 68.66batch/s, batch_accuracy=1, loss=0.00108]   
Epoch 6: 100%|██████████| 68/68 [00:01<00:00, 66.58batch/s, batch_accuracy=1, loss=0.00056] 
Epoch 7: 100%|██████████| 68/68 [00:01<00:00, 65.61batch/s, batch_accuracy=1, loss=0.000561]
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 67.77batch/s, batch_accuracy=1, loss=0.000137]
Epoch 9: 100%|██████████| 68/68 [00:01<00:00, 66.44batch/s, batch_accuracy=1, loss=0.000113]
Epoch 10: 100%|██████████| 68/68 [00:01<00:00, 63.82batch/s, batch_accuracy=1, loss=4.58e-5] 


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [9]:

print(precision_recall_fscore_support(y_test.cpu(), y_pred.cpu(), average='binary'))
print("AUC", roc_auc_score(y_test.cpu(), y_pred.cpu()))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

(0.7920384351407, 0.8911196911196911, 0.8386627906976745, None)
AUC 0.7377409155186933


In [10]:
# Parameters finding

print("Train model")
models_dir = Path('models/es')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.0001, "weight_decay": 0.001, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cuda'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")



Train model



Epoch 1: 100%|██████████| 68/68 [00:01<00:00, 66.30batch/s, batch_accuracy=0.766, loss=0.513]
Epoch 2: 100%|██████████| 68/68 [00:01<00:00, 67.60batch/s, batch_accuracy=0.888, loss=0.299]
Epoch 3: 100%|██████████| 68/68 [00:01<00:00, 64.59batch/s, batch_accuracy=0.981, loss=0.0727]
Epoch 4: 100%|██████████| 68/68 [00:00<00:00, 69.09batch/s, batch_accuracy=1, loss=0.00646]   
Epoch 5: 100%|██████████| 68/68 [00:01<00:00, 64.87batch/s, batch_accuracy=1, loss=0.00153]   
Epoch 6: 100%|██████████| 68/68 [00:00<00:00, 68.21batch/s, batch_accuracy=1, loss=0.000908]
Epoch 7: 100%|██████████| 68/68 [00:01<00:00, 66.47batch/s, batch_accuracy=1, loss=0.000394]
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 67.40batch/s, batch_accuracy=1, loss=0.000119]
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 68.12batch/s, batch_accuracy=1, loss=5.77e-5] 
Epoch 10: 100%|██████████| 68/68 [00:01<00:00, 64.79batch/s, batch_accuracy=1, loss=3.31e-5] 


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [11]:

with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).cpu()
    y_test = torch.stack([test[1] for test in test_data_nn]).cpu()
    y_pred = model_nn.predict(X_test).cpu()


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

(0.7905866302864939, 0.894980694980695, 0.8395508873596523, None)
AUC 0.7369279332242294


In [12]:
# Drop out

print("Train model")
models_dir = Path('models/es')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.001, "weight_decay": 0.01, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    dropout=0.5,
    device='cuda'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).cpu()
    y_test = torch.stack([test[1] for test in test_data_nn]).cpu()
    y_pred = model_nn.predict(X_test).cpu()


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 68/68 [00:01<00:00, 65.46batch/s, batch_accuracy=0.71, loss=0.557] 
Epoch 2: 100%|██████████| 68/68 [00:00<00:00, 68.76batch/s, batch_accuracy=0.925, loss=0.236]
Epoch 3: 100%|██████████| 68/68 [00:01<00:00, 65.81batch/s, batch_accuracy=0.981, loss=0.0583]
Epoch 4: 100%|██████████| 68/68 [00:01<00:00, 64.70batch/s, batch_accuracy=1, loss=0.00669]   
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 69.72batch/s, batch_accuracy=1, loss=0.00225]   
Epoch 6: 100%|██████████| 68/68 [00:01<00:00, 66.71batch/s, batch_accuracy=1, loss=0.000935]
Epoch 7: 100%|██████████| 68/68 [00:01<00:00, 65.51batch/s, batch_accuracy=1, loss=0.00058] 
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 67.36batch/s, batch_accuracy=1, loss=0.000313]
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 69.05batch/s, batch_accuracy=1, loss=0.000333]
Epoch 10: 100%|██████████| 68/68 [00:01<00:00, 65.12batch/s, batch_accuracy=1, loss=0.000172]


(0.7850340136054422, 0.8911196911196911, 0.8347197106690778, None)
AUC 0.7288245917875547


# Other classifiers

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

"Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers."

## SVC, SVM
Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

LinearSVC with TfIdf did good on balanced English

In [13]:
from sklearn.svm import LinearSVC
# LinearSVC, tfidf
X_train = tfidf_encoder.transform(train_raw.texts)
print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)

pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))


Fit model
(0.7828114590273151, 0.9073359073359073, 0.8404864091559371, None)
AUC: 0.7300739893332486


## SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

In [14]:
from sklearn.linear_model import SGDClassifier

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))

roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


(0.7813923227065712, 0.9274131274131274, 0.8481638418079096, None)


0.7332538888094443

## Naive Bayes

Overall bad performance, not worth pursuing

from sklearn.naive_bayes import GaussianNB

model_GaussianNB_tfidf = GaussianNB()
model_GaussianNB_tfidf.fit(X_train.toarray(), train_raw.labels)

pred_GaussianNB_tfidf = model_GaussianNB_tfidf.predict(tfidf_encoder.transform(test_raw.texts).toarray())

print(precision_recall_fscore_support(test_raw.labels, pred_GaussianNB_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_GaussianNB_tfidf)

# Observations

- Neural network is still a good option
- sklearn's SGD is also good

# Test features

## Standard count vectors & scale
Not good on both LinearSVC and SGD


from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

encoding_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('scaler', StandardScaler(with_mean=False))
])

encoding_pipeline.fit(train_raw.texts)

X_train = encoding_pipeline.transform(train_raw.texts)


print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)
pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


## word TFIDF

word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,5), max_features=10000)

X_train = word_tfidf.fit_transform(train_raw.texts)


print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


Use more tfidf word (50000) features improve 1%, but takes much more time to transform

In [15]:
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,5), max_features=50000)

X_train = word_tfidf.fit_transform(train_raw.texts)


# import scipy
# scipy.sparse.save_npz("models/tfidf/ngram_word_3to7_50000.npz", X_train)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(test_raw.labels, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC
(0.7557692307692307, 0.9104247104247104, 0.8259194395796847, None)
AUC: 0.6938954827843717
SGDClassifier
(0.7554278416347382, 0.9135135135135135, 0.8269835721775602, None)


0.6940681422162904

Char ngram

In [16]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(3,7), max_features=50000)

X_train = char_tfidf.fit_transform(train_raw.texts)

import scipy
scipy.sparse.save_npz("models/tfidf/ngram_char_3to7_50000.npz", X_train)



LinearSVC
(0.7997293640054127, 0.9127413127413128, 0.85250631085467, None)
AUC: 0.753352823723194
SGDClassifier
(0.7918313570487484, 0.9281853281853282, 0.8546036260220405, None)


0.7473574103203733

In [23]:

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(char_tfidf.transform(test_raw.texts))
result_LinearSVC_tfidf = evaluate(test_raw.labels, pred_SGDClassifier_tfidf)

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(char_tfidf.transform(test_raw.texts))
result_SGDClassifier_tfidf = evaluate(test_raw.labels, pred_SGDClassifier_tfidf)

LinearSVC
Accuracy: 0.7979, Precision: 0.7918, Recall: 0.9282, F1: 0.8546, AUC: 0.7474
SGDClassifier
Accuracy: 0.7959, Precision: 0.7905, Recall: 0.9266, F1: 0.8532, AUC: 0.7452


In [22]:
def evaluate(y_test, y_pred) -> float:
    
    true_pos = sum([pred == y == 1 for pred, y in zip(y_pred, y_test)])
    true_neg = sum([pred == y == 0 for pred, y in zip(y_pred, y_test)])
    false_pos = sum([(pred == 1) * (y == 0) for pred, y in zip(y_pred, y_test)])
    false_neg = sum([(pred == 0) * (y == 1) for pred, y in zip(y_pred, y_test)])
    total = len(y_test)

    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = 2 * true_pos / (2 * true_pos + false_pos + false_neg)
    auc = roc_auc_score(y_test, y_pred)

    result = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc
    }

    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")
    return result



Accuracy: 0.7979, Precision: 0.7918, Recall: 0.9282, F1: 0.8546, AUC: 0.7474


{'accuracy': 0.7979249011857708,
 'precision': 0.7918313570487484,
 'recall': 0.9281853281853282,
 'f1': 0.8546036260220405,
 'auc': 0.7473574103203733}

In [18]:
# Test char tfidf feature on NN
train_data_nn = encode_data(train_raw, char_tfidf)
test_data_nn = encode_data(test_raw, char_tfidf)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)


USE_CACHE = False
models_dir = Path("models")

model_nn = NeuralNetwork(
    input_size=len(char_tfidf.vocabulary_),
    hidden_size=128,
    device='cuda'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")




Epoch 1: 100%|██████████| 68/68 [00:01<00:00, 65.04batch/s, batch_accuracy=0.813, loss=0.518]
Epoch 2: 100%|██████████| 68/68 [00:01<00:00, 66.21batch/s, batch_accuracy=0.832, loss=0.379]
Epoch 3: 100%|██████████| 68/68 [00:00<00:00, 69.93batch/s, batch_accuracy=0.925, loss=0.309]
Epoch 4: 100%|██████████| 68/68 [00:01<00:00, 66.92batch/s, batch_accuracy=0.972, loss=0.12]  
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 69.10batch/s, batch_accuracy=0.991, loss=0.0586]
Epoch 6: 100%|██████████| 68/68 [00:01<00:00, 65.78batch/s, batch_accuracy=0.991, loss=0.0502]
Epoch 7: 100%|██████████| 68/68 [00:01<00:00, 66.62batch/s, batch_accuracy=1, loss=0.00338]   
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 66.00batch/s, batch_accuracy=1, loss=0.00258]   
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 68.44batch/s, batch_accuracy=1, loss=0.00113]
Epoch 10: 100%|██████████| 68/68 [00:01<00:00, 64.62batch/s, batch_accuracy=1, loss=0.00088] 


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [19]:

with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).cpu()
    y_test = torch.stack([test[1] for test in test_data_nn]).cpu()
    y_pred = model_nn.predict(X_test).cpu()


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

(0.7998585572842999, 0.8733590733590734, 0.8349944629014396, None)
AUC 0.7425780277632129
