In [1]:
# Load packages
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from models import NeuralNetwork, RNNClassifier, TrainConfig, evaluate, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, check_cuda_memory, PositionalEncoder
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

if torch.cuda.is_available():
    print("Device: cuda")
    check_cuda_memory()
else:
    print("Device: cpu")

models_dir = Path('models/gb')


Device: cuda
NVIDIA GeForce RTX 3050 Laptop GPU
Total: 4,091.48 MB
Reserved: 0.00 MB
Allocated: 0.00 MB
Free in reserved: 0.00 MB


# Load data

In [2]:
data = load_data(folder_path="data/train/power/", file_list=['power-gb-train.tsv'],text_head='text_en')
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)
print(f"Data size: {len(data)}, % positive class: {sum(data.labels) / len(data) * 100:.2f}%")

Data size: 33257, % positive class: 56.39%


# Encode

In [3]:

print("Prepare words_encoder...")
words_encoder = TfidfVectorizer(max_features=50000)
words_encoder.fit(train_raw.texts)

print("Prepare chars_encoder...")
chars_encoder = TfidfVectorizer(max_features=50000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
chars_encoder.fit(train_raw.texts)

Prepare words_encoder...
Prepare chars_encoder...


# Neural Networks

### Word feature

In [4]:
print("Prepare data...")
train_nn_words = encode_data(train_raw, words_encoder)
test_nn_words = encode_data(test_raw, words_encoder)

print("Train model")

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    
)

dataloader = DataLoader(train_nn_words, batch_size=128, shuffle=True)

USE_CACHE = True

model_nn_words = NeuralNetwork(
    input_size=len(words_encoder.vocabulary_),
    hidden_size=128,
    device='cuda'
)

if (models_dir / 'model_nn_words.pt').exists() and USE_CACHE:
    model_nn_words = load_model(model_nn_words, models_dir, 'model_nn_words')
else:
    model_nn_words.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn_words, models_dir, "model_nn_words")


# Evaluate
with torch.no_grad():
    X_test_nn = torch.stack([test[0] for test in test_nn_words]).cpu()
    y_test_nn = torch.stack([test[1] for test in test_nn_words]).cpu()
    y_pred_nn_words = model_nn_words.predict(X_test_nn)
    logits_nn_words = model_nn_words.forward(X_test_nn)

result_nn_words = evaluate(y_test_nn.cpu(), y_pred_nn_words.cpu(), logits_nn_words.cpu())

# Plot training accuracy and loss side-by-side
plot_results(model_nn_words, train_config, dataloader)

model_nn_words.to('cpu')
torch.cuda.empty_cache()

Prepare data...
Train model
Accuracy: 0.7030, Precision: 0.7256, Recall: 0.7530, F1: 0.7391, AUC: 0.7781


### Char features

In [5]:
print("Prepare data...")
test_nn_chars = encode_data(test_raw, chars_encoder)

print("Train model")

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    
)



USE_CACHE = True

model_nn_chars = NeuralNetwork(
    input_size=len(chars_encoder.vocabulary_),
    hidden_size=128,
    device='cuda'
)

if (models_dir / 'model_nn_chars.pt').exists() and USE_CACHE:
    model_nn_chars = load_model(model_nn_chars, models_dir, 'model_nn_chars')
else:
    train_nn_chars = encode_data(train_raw, chars_encoder)
    dataloader = DataLoader(train_nn_chars, batch_size=128, shuffle=True)

    model_nn_chars.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn_chars, models_dir, "model_nn_chars")


with torch.no_grad():
    X_test = torch.stack([test[0] for test in test_nn_chars]).to(model_nn_chars.device)
    y_test = torch.stack([test[1] for test in test_nn_chars]).to(model_nn_chars.device)
    y_pred = model_nn_chars.predict(X_test)
    logits = model_nn_chars.forward(X_test)

result_nn_chars = evaluate(y_test.cpu(), y_pred.cpu(), logits.cpu())

# Plot training accuracy and loss side-by-side
plot_results(model_nn_chars, train_config, dataloader)

Prepare data...
Train model
Accuracy: 0.7466, Precision: 0.7354, Recall: 0.8533, F1: 0.7900, AUC: 0.8188


# RNN

### Word feature

In [4]:

print("Prepare data encoder...")
rnn_words_encoder = PositionalEncoder()
rnn_words_encoder.fit(train_raw.texts)

train_dataloader = DataLoader(train_raw, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_raw, batch_size=128, shuffle=False)

# Prepare baseline config
train_config = TrainConfig(
    optimizer_params = {'lr': 0.01},
    num_epochs       = 10,
    early_stop       = False,
    violation_limit  = 5
)

# Train baseline model
model_lstm_words = RNNClassifier(
    rnn_network         = nn.LSTM,
    word_embedding_dim  = 32,
    hidden_dim          = 64,
    bidirectional       = False,
    dropout             = 0,
    encoder             = rnn_words_encoder,
    device              = 'cuda'
)

USE_CACHE = False

if (models_dir / 'model_lstm_words.pt').exists() and USE_CACHE:
    model_lstm_words = load_model(model_lstm_words, 'model_lstm_words')
else:
    model_lstm_words.fit(train_dataloader, train_config, no_progress_bar=False)
    save_model(model_lstm_words, models_dir, "model_lstm_words")

test_dataloader = DataLoader(test_raw, batch_size=128, shuffle=False)

# Evaluate
with torch.no_grad():
    model_lstm_words.device = "cpu"
    model_lstm_words.cpu()

    pred_LSTM_words_lst = []
    probs_LSTM_words_lst = []

    for _, _, raw_inputs, raw_targets in tqdm(test_dataloader, unit="batch", desc="Predicting"):
        batch_encoder = PositionalEncoder(vocabulary=rnn_words_encoder.vocabulary)
        test_inputs = batch_encoder.fit_transform(raw_inputs).cpu()
        test_targets = torch.as_tensor(raw_targets, dtype=torch.float).cpu()
        
        pred_LSTM_words_lst.append(model_lstm_words.predict(test_inputs))
        probs_LSTM_words_lst.append(model_lstm_words._sigmoid(model_lstm_words.forward(test_inputs)).squeeze())


pred_LSTM_words = torch.cat(pred_LSTM_words_lst).long().numpy()
probs_LSTM_words = torch.concat(probs_LSTM_words_lst).numpy()

model_lstm_words_result = evaluate(test_raw.labels, pred_LSTM_words, probs_LSTM_words)

np.save(models_dir / 'model_lstm_words_results.npy', model_lstm_words_result)

model_lstm_words.cpu()
torch.cuda.empty_cache()


Prepare data encoder...


  tokens_sparse = torch.sparse_csr_tensor(crow, col, token_val, size=mat_size, dtype=torch.long)
Epoch 1: 100%|██████████| 209/209 [01:33<00:00,  2.23batch/s, batch_accuracy=0.714, loss=9.66]
Epoch 2: 100%|██████████| 209/209 [01:33<00:00,  2.24batch/s, batch_accuracy=0.714, loss=7.41]
Epoch 3: 100%|██████████| 209/209 [01:33<00:00,  2.23batch/s, batch_accuracy=0.714, loss=2.86]
Epoch 4: 100%|██████████| 209/209 [01:33<00:00,  2.23batch/s, batch_accuracy=1, loss=11.1]   
Epoch 5: 100%|██████████| 209/209 [01:33<00:00,  2.24batch/s, batch_accuracy=1, loss=10.9]   
Epoch 6: 100%|██████████| 209/209 [01:34<00:00,  2.22batch/s, batch_accuracy=1, loss=5.56]   
Epoch 7: 100%|██████████| 209/209 [01:34<00:00,  2.22batch/s, batch_accuracy=0.571, loss=2.64]
Epoch 8: 100%|██████████| 209/209 [01:34<00:00,  2.22batch/s, batch_accuracy=0.857, loss=4.01]
Epoch 9: 100%|██████████| 209/209 [01:34<00:00,  2.21batch/s, batch_accuracy=0.857, loss=4.43]
Epoch 10: 100%|██████████| 209/209 [01:36<00:00,  2

Accuracy: 0.7255, Precision: 0.7036, Recall: 0.8787, F1: 0.7814, AUC: 0.7682





### Char features

In [5]:

print("Prepare data encoder...")
rnn_chars_encoder = PositionalEncoder(tokenizer=chars_encoder.build_tokenizer())
rnn_chars_encoder.fit(train_raw.texts)

train_dataloader = DataLoader(train_raw, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_raw, batch_size=128, shuffle=False)

test_inputs = rnn_chars_encoder.transform(test_raw.texts)

# Prepare baseline config
train_config = TrainConfig(
    optimizer_params = {'lr': 0.01},
    num_epochs       = 10,
    early_stop       = False,
    violation_limit  = 5
)

# Train baseline model
model_lstm_chars = RNNClassifier(
    rnn_network         = nn.LSTM,
    word_embedding_dim  = 32,
    hidden_dim          = 64,
    bidirectional       = False,
    dropout             = 0,
    encoder             = rnn_chars_encoder,
    device              = 'cuda'
)

USE_CACHE = False

if (models_dir / 'model_lstm_chars.pt').exists() and USE_CACHE:
    model_lstm_chars = load_model(model_lstm_chars, 'model_lstm_chars')
else:
    model_lstm_chars.fit(train_dataloader, train_config, no_progress_bar=False)
    save_model(model_lstm_chars, models_dir, "model_lstm_chars")


with torch.no_grad():
    model_lstm_chars.device = "cpu"
    model_lstm_chars.cpu()

    pred_LSTM_chars = []
    logits_LSTM_chars = []

    for _, _, raw_inputs, raw_targets in tqdm(test_dataloader, unit="batch", desc="Predicting"):
        batch_encoder = PositionalEncoder(vocabulary=rnn_chars_encoder.vocabulary)
        test_inputs = batch_encoder.fit_transform(raw_inputs).cpu()
        test_targets = torch.as_tensor(raw_targets, dtype=torch.float).cpu()

        pred_LSTM_chars.append(model_lstm_chars.predict(test_inputs))
        logits_LSTM_chars.append(model_lstm_chars.forward(test_inputs))

pred_LSTM_chars = torch.concat(pred_LSTM_chars).numpy()
logits_LSTM_chars = torch.concat(logits_LSTM_chars).numpy()

model_lstm_chars_result = evaluate(test_raw.labels, pred_LSTM_chars, logits_LSTM_chars)
# print(model_lstm_chars_result)

np.save(models_dir / 'model_lstm_chars_results.npy', model_lstm_chars_result)
model_lstm_words.cpu()
torch.cuda.empty_cache()



Prepare data encoder...


Epoch 1: 100%|██████████| 209/209 [01:34<00:00,  2.20batch/s, batch_accuracy=0.571, loss=7.78]
Epoch 2: 100%|██████████| 209/209 [01:35<00:00,  2.20batch/s, batch_accuracy=0.571, loss=8.04]
Epoch 3: 100%|██████████| 209/209 [01:34<00:00,  2.22batch/s, batch_accuracy=0.571, loss=4.85]
Epoch 4: 100%|██████████| 209/209 [01:36<00:00,  2.17batch/s, batch_accuracy=0.857, loss=11.3]
Epoch 5: 100%|██████████| 209/209 [01:35<00:00,  2.18batch/s, batch_accuracy=0.857, loss=6.47]
Epoch 6: 100%|██████████| 209/209 [01:35<00:00,  2.20batch/s, batch_accuracy=0.714, loss=6.7]
Epoch 7: 100%|██████████| 209/209 [01:35<00:00,  2.19batch/s, batch_accuracy=1, loss=10.8]   
Epoch 8: 100%|██████████| 209/209 [01:35<00:00,  2.19batch/s, batch_accuracy=1, loss=1.44]   
Epoch 9: 100%|██████████| 209/209 [01:36<00:00,  2.16batch/s, batch_accuracy=1, loss=10.8]   
Epoch 10: 100%|██████████| 209/209 [01:36<00:00,  2.17batch/s, batch_accuracy=1, loss=5.61]   


Accuracy: 0.5314, Precision: 0.5581, Recall: 0.7733, F1: 0.6483, AUC: 0.4975


# Other classifiers from sklearn

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

"Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers."

In [6]:
# Prepare train & test set
X_train_skl_words = words_encoder.transform(train_raw.texts)
X_test_skl_words = words_encoder.transform(test_raw.texts)

X_train_skl_chars = chars_encoder.transform(train_raw.texts)
X_test_skl_chars = chars_encoder.transform(test_raw.texts)

## Linear SVC
Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

LinearSVC with TfIdf did good on balanced English

### Word feature

In [20]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# LinearSVC, word


print("Fit model")
base_svc = LinearSVC()
model_LinearSVC_words = CalibratedClassifierCV(estimator=base_svc, cv=5)
model_LinearSVC_words.fit(X_train_skl_words, train_raw.labels)

pred_LinearSVC_words = model_LinearSVC_words.predict(X_test_skl_words)
logits_linearSVC_words = model_LinearSVC_words.predict_proba(X_test_skl_words)

result_linearSVC_words =  evaluate(test_raw.labels, pred_LinearSVC_words, logits_linearSVC_words[:, 1])

Fit model
Accuracy: 0.7468, Precision: 0.7452, Recall: 0.8306, F1: 0.7856, AUC: 0.8251


In [23]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

print("Fit model")
base_svc = LinearSVC()
model_LinearSVC_chars = CalibratedClassifierCV(estimator=base_svc, cv=5)
model_LinearSVC_chars.fit(X_train_skl_chars, train_raw.labels)

pred_LinearSVC_chars = model_LinearSVC_chars.predict(X_test_skl_chars)
logits_linearSVC_chars = model_LinearSVC_chars.predict_proba(X_test_skl_chars)

result_linearSVC_chars =  evaluate(test_raw.labels, pred_LinearSVC_chars, logits_linearSVC_chars[:, 1])

Fit model
Accuracy: 0.7757, Precision: 0.7731, Recall: 0.8471, F1: 0.8084, AUC: 0.8514


# Logistic Regression


In [26]:
# Word features

from sklearn.linear_model import LogisticRegression

print("Fit model")
model_logreg_words = LogisticRegression()
model_logreg_words.fit(X_train_skl_words, train_raw.labels)

pred_logreg_words = model_logreg_words.predict(X_test_skl_words)
logits_logreg_words = model_logreg_words.predict_proba(X_test_skl_words)

result_linearlogreg_words =  evaluate(test_raw.labels, pred_logreg_words, logits_logreg_words[:, 1])

Fit model
Accuracy: 0.7628, Precision: 0.7515, Recall: 0.8595, F1: 0.8019, AUC: 0.8357


In [27]:
# char features

from sklearn.linear_model import LogisticRegression

print("Fit model")
model_logreg_chars = LogisticRegression()
model_logreg_chars.fit(X_train_skl_chars, train_raw.labels)

pred_logreg_chars = model_logreg_chars.predict(X_test_skl_chars)
logits_logreg_chars = model_logreg_chars.predict_proba(X_test_skl_chars)

result_linearlogreg_chars =  evaluate(test_raw.labels, pred_logreg_chars, logits_logreg_chars[:, 1])

Fit model
Accuracy: 0.7800, Precision: 0.7694, Recall: 0.8654, F1: 0.8146, AUC: 0.8573


## SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

In [31]:
# word features

from sklearn.linear_model import SGDClassifier

print("Fit model")
model_sgd_words = SGDClassifier(loss='log_loss')
model_sgd_words.fit(X_train_skl_words, train_raw.labels)

pred_sgd_words = model_sgd_words.predict(X_test_skl_words)
logits_sgd_words = model_sgd_words.predict_proba(X_test_skl_words)

result_linearsgd_words =  evaluate(test_raw.labels, pred_sgd_words, logits_sgd_words[:, 1])

Fit model
Accuracy: 0.7552, Precision: 0.7364, Recall: 0.8749, F1: 0.7997, AUC: 0.8299


In [32]:
# chars features

from sklearn.linear_model import SGDClassifier

print("Fit model")
model_sgd_chars = SGDClassifier(loss='log_loss')
model_sgd_chars.fit(X_train_skl_chars, train_raw.labels)

pred_sgd_chars = model_sgd_chars.predict(X_test_skl_chars)
logits_sgd_chars = model_sgd_chars.predict_proba(X_test_skl_chars)

result_linearsgd_chars =  evaluate(test_raw.labels, pred_sgd_chars, logits_sgd_chars[:, 1])

Fit model
Accuracy: 0.7760, Precision: 0.7663, Recall: 0.8619, F1: 0.8113, AUC: 0.8516


## Naive Bayes

Overall bad performance, not worth pursuing

In [5]:
# words features

from sklearn.naive_bayes import GaussianNB

print("Fit model")
model_gnb_words = GaussianNB()
model_gnb_words.fit(X_train_skl_words.toarray(), train_raw.labels)

pred_gnb_words = model_gnb_words.predict(X_test_skl_words.toarray())
logits_gnb_words = model_gnb_words.predict_proba(X_test_skl_words.toarray())

result_lineargnb_words =  evaluate(test_raw.labels, pred_gnb_words, logits_gnb_words[:, 1])

Fit model
Accuracy: 0.5340, Precision: 0.6202, Recall: 0.4272, F1: 0.5059, AUC: 0.5490


In [6]:
# chars features

from sklearn.naive_bayes import GaussianNB

print("Fit model")
model_gnb_chars = GaussianNB()
model_gnb_chars.fit(X_train_skl_chars.toarray(), train_raw.labels)

pred_gnb_chars = model_gnb_chars.predict(X_test_skl_chars.toarray())
logits_gnb_chars = model_gnb_chars.predict_proba(X_test_skl_chars.toarray())

result_lineargnb_chars =  evaluate(test_raw.labels, pred_gnb_chars, logits_gnb_chars[:, 1])

Fit model
Accuracy: 0.7037, Precision: 0.7322, Recall: 0.7403, F1: 0.7363, AUC: 0.7258


# Xgboost


In [7]:
import xgboost as xgb
import pandas as pd

# Prepare data for xgboost
train_dmat_words = xgb.DMatrix(X_train_skl_words, pd.array(train_raw.labels).astype('category'))
test_dmat_words = xgb.DMatrix(X_test_skl_words, pd.array(test_raw.labels).astype('category'))

train_dmat_chars = xgb.DMatrix(X_train_skl_chars, pd.array(train_raw.labels).astype('category'))
test_dmat_chars = xgb.DMatrix(X_test_skl_chars, pd.array(test_raw.labels).astype('category'))

### Word features

In [7]:
params = {
    "booster": "gbtree",
    "device": "cpu",
    "objective": "binary:logistic",  # there is also binary:hinge but hinge does not output probability
    "tree_method": "hist",  # default to hist
    "device": "cuda",

    # Params for tree booster
    "eta": 0.3,
    "gamma": 0.0,  # Min loss achieved to split the tree
    "max_depth": 6,
    "reg_alpha": 0,
    "reg_lambda": 1,

}
evals_words = [(train_dmat_words, "train")]
iterations = 2000

model_xgb_words = xgb.train(
    params = params,
    dtrain = train_dmat_words,
    num_boost_round = iterations,
    evals = evals_words,
    verbose_eval = 100
)

pred_xgb_words_probs = model_xgb_words.predict(test_dmat_words)
result_xgb_words = evaluate(test_raw.labels, pred_xgb_words_probs > 0.5, pred_xgb_words_probs)

[0]	train-logloss:0.62296
[100]	train-logloss:0.26590
[200]	train-logloss:0.18824
[300]	train-logloss:0.13465
[400]	train-logloss:0.09987
[500]	train-logloss:0.07312
[600]	train-logloss:0.05486
[700]	train-logloss:0.04140
[800]	train-logloss:0.03124
[900]	train-logloss:0.02381
[1000]	train-logloss:0.01857
[1100]	train-logloss:0.01495
[1200]	train-logloss:0.01230
[1300]	train-logloss:0.01008
[1400]	train-logloss:0.00856
[1500]	train-logloss:0.00722
[1600]	train-logloss:0.00625
[1700]	train-logloss:0.00558
[1800]	train-logloss:0.00483
[1900]	train-logloss:0.00438
[1999]	train-logloss:0.00391
Accuracy: 0.7661, Precision: 0.7630, Recall: 0.8430, F1: 0.8010, AUC: 0.8420


In [None]:
# Can use only half of the original max features
xgb_chars_encoder = TfidfVectorizer(max_features=20000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
xgb_chars_encoder.fit(train_raw.texts)

# Prepare train & test set
X_train_xgb_chars = xgb_chars_encoder.transform(train_raw.texts)
X_test_xgb_chars = xgb_chars_encoder.transform(test_raw.texts)

import xgboost as xgb

train_dmat_chars = xgb.DMatrix(X_train_xgb_chars, pd.array(train_raw.labels).astype('category'))
test_dmat_chars = xgb.DMatrix(X_test_xgb_chars, pd.array(test_raw.labels).astype('category'))

params = {
    "booster": "gbtree",
    "device": "cpu",
    "objective": "binary:logistic",  # there is also binary:hinge but hinge does not output probability
    "tree_method": "hist",  # default to hist
    "device": "cuda",

    # Params for tree booster
    "eta": 0.3,
    "gamma": 0.0,  # Min loss achieved to split the tree
    "max_depth": 6,
    "reg_alpha": 0,
    "reg_lambda": 1,

}
evals_chars = [(train_dmat_chars, "train")]
iterations = 2000

model_xgb_chars = xgb.train(
    params = params,
    dtrain = train_dmat_chars,
    num_boost_round = iterations,
    evals = evals_chars,
    verbose_eval = 100
)

pred_xgb_chars_probs = model_xgb_chars.predict(test_dmat_chars)
result_xgb_chars = evaluate(test_raw.labels, pred_xgb_chars_probs > 0.5, pred_xgb_chars_probs)

[0]	train-logloss:0.62296
[100]	train-logloss:0.26590
[200]	train-logloss:0.18824
[300]	train-logloss:0.13465
[400]	train-logloss:0.09987
[500]	train-logloss:0.07312
[600]	train-logloss:0.05486
[700]	train-logloss:0.04140
[800]	train-logloss:0.03124
[900]	train-logloss:0.02381
[1000]	train-logloss:0.01857
[1100]	train-logloss:0.01495
[1200]	train-logloss:0.01230
[1300]	train-logloss:0.01008
[1400]	train-logloss:0.00856
[1500]	train-logloss:0.00722
[1600]	train-logloss:0.00625
[1700]	train-logloss:0.00558
[1800]	train-logloss:0.00483
[1900]	train-logloss:0.00438
[1999]	train-logloss:0.00391
Accuracy: 0.7661, Precision: 0.7630, Recall: 0.8430, F1: 0.8010, AUC: 0.8420
