# Experiment: Classical models vs. Neural Networks on GB dataset

We will compare the following models:
- Neural Network, TF-IDF features, character tokens
- Neural Network, TF-IDF features, word tokens
- RNN, character features
- RNN, word features
- Linear SVC, word and character features
- Logistic regression, word and character features
- SGD classifier, word and character features
- Naive Bayes, word and character features
- XGBoost, word and character features

In [5]:
# Load packages
import sys
from pathlib import Path
PARENT_DIR = Path.cwd().parent.parent.resolve()
sys.path.append(str(PARENT_DIR))

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from lib.data_processing import load_data, split_data, encode_data, PositionalEncoder
from lib.models import NeuralNetwork, RNNClassifier, TrainConfig, save_model, load_model
from lib.evaluation import evaluate, plot_results
from lib.utils import check_cuda_memory
from lib.logger import CustomLogger
from pathlib import Path

from tqdm import tqdm

# Set up device

PREFERRED_DEVICE = 'cuda'

# Use the preferred device only when CUDA is actually usable; otherwise run on CPU.
cuda_ready = torch.cuda.is_available()
DEVICE = PREFERRED_DEVICE if cuda_ready else 'cpu'

if not cuda_ready:
    print(f"CUDA not available. Use: {DEVICE}")
else:
    print(f"CUDA available. Use: {DEVICE}")
    check_cuda_memory()

# Set up dirs

# The project root is two levels above the notebook's working directory.
project_dir = Path.cwd().parent.parent
models_dir = project_dir / 'models' / 'gb'

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

# Set up logger
logger = CustomLogger("experiment_gb", log_to_local=False)

CUDA not available. Use: cpu


# Load data

In [7]:
# Load the GB training file, reading the text from the "text_en" column.
train_file = project_dir / "data/train/power/power-gb-train.tsv"
data = load_data(file_path_list=[train_file], text_head="text_en")

# Fixed-seed 80/20 split, shared by every model in this notebook.
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)

n_samples = len(data)
positive_pct = sum(data.labels) / n_samples * 100
logger.info(f"Data size: {n_samples}, % positive class: {positive_pct:.2f}%")

2024-09-27 17:41:04,079:experiment_gb:INFO:Data size: 33257, % positive class: 56.39%


# Prepare feature encoders

In [None]:

# Both vectorizers are fitted on the training texts only.
logger.info("Prepare words_encoder...")
words_encoder = TfidfVectorizer(max_features=50000).fit(train_raw.texts)

logger.info("Prepare chars_encoder...")
# Character 3- to 5-grams with sublinear term-frequency scaling.
chars_encoder = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=50000,
    use_idf=True,
    sublinear_tf=True,
)
chars_encoder.fit(train_raw.texts)

2024-09-27 17:41:13,318:experiment_gb:INFO:Prepare words_encoder...
2024-09-27 17:41:18,745:experiment_gb:INFO:Prepare chars_encoder...


# Neural Networks

### Word feature

In [9]:
# Feed-forward network on word-level TF-IDF features: encode, train (or load
# a cached model), evaluate on the held-out split, then plot.
logger.info("Prepare data...")
train_nn_words = encode_data(train_raw, words_encoder)
test_nn_words = encode_data(test_raw, words_encoder)

logger.info("Train model...")
train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5
)

dataloader = DataLoader(train_nn_words, batch_size=128, shuffle=True)

# Set to False to force retraining even when a saved model exists.
USE_CACHE = True

model_nn_words = NeuralNetwork(
    input_size=len(words_encoder.vocabulary_),
    hidden_size=128,
    device=DEVICE
)

if (models_dir / 'model_nn_words.pt').exists() and USE_CACHE:
    model_nn_words = load_model(model_nn_words, models_dir, 'model_nn_words')
else:
    model_nn_words.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn_words, models_dir, "model_nn_words")


# Evaluate
with torch.no_grad():
    # Put the test tensors on the model's own device (same as the char-feature
    # cell); forcing them to CPU breaks when the model was built on CUDA.
    X_test_nn = torch.stack([sample[0] for sample in test_nn_words]).to(model_nn_words.device)
    y_test_nn = torch.stack([sample[1] for sample in test_nn_words]).to(model_nn_words.device)
    y_pred_nn_words = model_nn_words.predict(X_test_nn)
    logits_nn_words = model_nn_words.forward(X_test_nn)

result_nn_words = evaluate(y_test_nn.cpu(), y_pred_nn_words.cpu(), logits_nn_words.cpu())

# Plot training accuracy and loss side-by-side
plot_results("model_nn_words - Training accuracy & Loss", model_nn_words, train_config, dataloader)

# Move model to CPU to save CUDA memory
model_nn_words.to('cpu')
torch.cuda.empty_cache()

2024-09-27 17:44:27,861:experiment_gb:INFO:Prepare data...
2024-09-27 17:44:42,568:experiment_gb:INFO:Train model...


Accuracy: 0.7087, Precision: 0.7254, Recall: 0.7701, F1: 0.7471, AUC: 0.7760


### Char features

In [11]:
# Feed-forward network on character n-gram TF-IDF features: encode, train (or
# load a cached model), evaluate on the held-out split, then plot.
logger.info("Prepare data...")
test_nn_chars = encode_data(test_raw, chars_encoder)

# The training set is encoded even when a cached model is loaded, so that
# plot_results below always receives char-encoded batches rather than whatever
# `dataloader` an earlier cell left behind. This costs extra time and memory.
train_nn_chars = encode_data(train_raw, chars_encoder)
dataloader = DataLoader(train_nn_chars, batch_size=128, shuffle=True)

logger.info("Train model")

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

# Set to False to force retraining even when a saved model exists.
USE_CACHE = True

model_nn_chars = NeuralNetwork(
    input_size=len(chars_encoder.vocabulary_),
    hidden_size=128,
    device=DEVICE
)

if (models_dir / 'model_nn_chars.pt').exists() and USE_CACHE:
    model_nn_chars = load_model(model_nn_chars, models_dir, 'model_nn_chars')
else:
    model_nn_chars.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn_chars, models_dir, "model_nn_chars")


# Evaluate
with torch.no_grad():
    X_test = torch.stack([sample[0] for sample in test_nn_chars]).to(model_nn_chars.device)
    y_test = torch.stack([sample[1] for sample in test_nn_chars]).to(model_nn_chars.device)
    y_pred = model_nn_chars.predict(X_test)
    logits = model_nn_chars.forward(X_test)

result_nn_chars = evaluate(y_test.cpu(), y_pred.cpu(), logits.cpu())

# Plot training accuracy and loss side-by-side
plot_results("model_nn_chars - Training accuracy & Loss", model_nn_chars, train_config, dataloader)

# Move model to CPU to save CUDA memory
# (this cell's model; the original moved model_nn_words by mistake)
model_nn_chars.to('cpu')
torch.cuda.empty_cache()

2024-09-27 17:48:22,685:experiment_gb:INFO:Prepare data...
2024-09-27 17:48:47,016:experiment_gb:INFO:Train model


Accuracy: 0.7513, Precision: 0.7564, Recall: 0.8182, F1: 0.7861, AUC: 0.8244


# RNN

### Word feature

In [None]:

# LSTM classifier on word tokens: fit the positional encoder, train (or load a
# cached model), then predict the test split batch by batch on CPU.
logger.info("Prepare data encoder...")
rnn_words_encoder = PositionalEncoder()
rnn_words_encoder.fit(train_raw.texts)

# The loaders wrap the raw datasets; test batches are encoded on the fly below.
# shuffle=False on the test loader keeps predictions aligned with test_raw.labels.
train_dataloader = DataLoader(train_raw, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_raw, batch_size=128, shuffle=False)

# Prepare baseline config
train_config = TrainConfig(
    optimizer_params = {'lr': 0.01},
    num_epochs       = 10,
    early_stop       = False,
    violation_limit  = 5
)

# Train baseline model
model_lstm_words = RNNClassifier(
    rnn_network         = nn.LSTM,
    word_embedding_dim  = 32,
    hidden_dim          = 64,
    bidirectional       = False,
    dropout             = 0,
    encoder             = rnn_words_encoder,
    # Use the device chosen in the setup cell; a hard-coded 'cuda' fails on
    # CPU-only machines.
    device              = DEVICE
)

# Set to False to force retraining even when a saved model exists.
USE_CACHE = True

if (models_dir / 'model_lstm_words.pt').exists() and USE_CACHE:
    # load_model takes (model, directory, name), as in the other cells.
    model_lstm_words = load_model(model_lstm_words, models_dir, 'model_lstm_words')
else:
    # NOTE(review): NeuralNetwork.fit is called with `disable_progress_bar`
    # elsewhere in this notebook; confirm RNNClassifier.fit really names this
    # keyword `no_progress_bar`.
    model_lstm_words.fit(train_dataloader, train_config, no_progress_bar=False)
    save_model(model_lstm_words, models_dir, "model_lstm_words")

# Evaluate
with torch.no_grad():
    # Inference runs on CPU; the device attribute is updated together with
    # the parameters.
    model_lstm_words.device = "cpu"
    model_lstm_words.cpu()

    pred_LSTM_words_lst = []
    probs_LSTM_words_lst = []

    # Each batch unpacks to four fields; only the raw inputs are needed here.
    for _, _, raw_inputs, _ in tqdm(test_dataloader, unit="batch", desc="Predicting"):
        # Reuse the training vocabulary for every batch.
        # Presumably fit_transform only re-derives the per-batch padding length
        # when the vocabulary is fixed; TODO confirm.
        batch_encoder = PositionalEncoder(vocabulary=rnn_words_encoder.vocabulary)
        test_inputs = batch_encoder.fit_transform(raw_inputs).cpu()

        pred_LSTM_words_lst.append(model_lstm_words.predict(test_inputs))
        # reshape(-1) rather than squeeze(): squeeze() turns a one-sample batch
        # into a 0-d tensor, which torch.cat cannot concatenate.
        probs_LSTM_words_lst.append(
            model_lstm_words._sigmoid(model_lstm_words.forward(test_inputs)).reshape(-1)
        )


pred_LSTM_words = torch.cat(pred_LSTM_words_lst).long().numpy()
probs_LSTM_words = torch.cat(probs_LSTM_words_lst).numpy()

model_lstm_words_result = evaluate(test_raw.labels, pred_LSTM_words, probs_LSTM_words)

np.save(models_dir / 'model_lstm_words_results.npy', model_lstm_words_result)

model_lstm_words.cpu()
torch.cuda.empty_cache()


### Char features

In [None]:

# LSTM classifier using the tokenizer taken from chars_encoder: fit the
# positional encoder, train (or load a cached model), then predict the test
# split batch by batch on CPU.
logger.info("Prepare data encoder...")
# NOTE(review): TfidfVectorizer.build_tokenizer() returns the word-level
# token_pattern splitter regardless of analyzer="char". If character n-grams
# are intended here, build_analyzer() is the callable that produces them.
# Confirm the intent before comparing this model as a "char" model.
chars_tokenizer = chars_encoder.build_tokenizer()
rnn_chars_encoder = PositionalEncoder(tokenizer=chars_tokenizer)
rnn_chars_encoder.fit(train_raw.texts)

# The loaders wrap the raw datasets; test batches are encoded on the fly below.
# shuffle=False on the test loader keeps predictions aligned with test_raw.labels.
train_dataloader = DataLoader(train_raw, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_raw, batch_size=128, shuffle=False)

# Prepare baseline config
train_config = TrainConfig(
    optimizer_params = {'lr': 0.01},
    num_epochs       = 10,
    early_stop       = False,
    violation_limit  = 5
)

# Train baseline model
model_lstm_chars = RNNClassifier(
    rnn_network         = nn.LSTM,
    word_embedding_dim  = 32,
    hidden_dim          = 64,
    bidirectional       = False,
    dropout             = 0,
    encoder             = rnn_chars_encoder,
    # Use the device chosen in the setup cell; a hard-coded 'cuda' fails on
    # CPU-only machines.
    device              = DEVICE
)

# False: always retrain this model, even if a saved copy exists.
USE_CACHE = False

if (models_dir / 'model_lstm_chars.pt').exists() and USE_CACHE:
    # load_model takes (model, directory, name), as in the other cells.
    model_lstm_chars = load_model(model_lstm_chars, models_dir, 'model_lstm_chars')
else:
    model_lstm_chars.fit(train_dataloader, train_config, no_progress_bar=False)
    save_model(model_lstm_chars, models_dir, "model_lstm_chars")


with torch.no_grad():
    # Inference runs on CPU; the device attribute is updated together with
    # the parameters.
    model_lstm_chars.device = "cpu"
    model_lstm_chars.cpu()

    pred_LSTM_chars = []
    logits_LSTM_chars = []

    # Each batch unpacks to four fields; only the raw inputs are needed here.
    for _, _, raw_inputs, _ in tqdm(test_dataloader, unit="batch", desc="Predicting"):
        # Encode test batches with the same tokenizer and vocabulary as the
        # training encoder, so train and test texts are tokenized identically.
        batch_encoder = PositionalEncoder(
            tokenizer=chars_tokenizer,
            vocabulary=rnn_chars_encoder.vocabulary,
        )
        test_inputs = batch_encoder.fit_transform(raw_inputs).cpu()

        pred_LSTM_chars.append(model_lstm_chars.predict(test_inputs))
        logits_LSTM_chars.append(model_lstm_chars.forward(test_inputs))

pred_LSTM_chars = torch.concat(pred_LSTM_chars).numpy()
logits_LSTM_chars = torch.concat(logits_LSTM_chars).numpy()

model_lstm_chars_result = evaluate(test_raw.labels, pred_LSTM_chars, logits_LSTM_chars)

np.save(models_dir / 'model_lstm_chars_results.npy', model_lstm_chars_result)
# Release this cell's model (the original moved model_lstm_words by mistake).
model_lstm_chars.cpu()
torch.cuda.empty_cache()



# Other classifiers from sklearn

Models tested:
- LinearSVC
- LogisticRegression
- SGDClassifier
- GaussianNB (Naive Bayes)

In [12]:
# TF-IDF feature matrices for the scikit-learn models: one train/test pair per
# encoder, produced with the vectorizers fitted on the training texts.
X_train_skl_words, X_test_skl_words = (
    words_encoder.transform(texts) for texts in (train_raw.texts, test_raw.texts)
)

X_train_skl_chars, X_test_skl_chars = (
    chars_encoder.transform(texts) for texts in (train_raw.texts, test_raw.texts)
)

# Linear SVC
Effective in high-dimensional spaces.

Still effective when the number of dimensions is greater than the number of samples.

LinearSVC with TF-IDF features performed well on the balanced English data.

### Word feature

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# LinearSVC on word-level TF-IDF features.
# The SVC is wrapped in a 5-fold CalibratedClassifierCV so that predict_proba
# is available for the probability-based metric.
logger.info("Fit model")
base_svc = LinearSVC()
model_LinearSVC_words = CalibratedClassifierCV(estimator=base_svc, cv=5).fit(
    X_train_skl_words, train_raw.labels
)

# Despite the "logits" name, these are calibrated class probabilities.
logits_linearSVC_words = model_LinearSVC_words.predict_proba(X_test_skl_words)
pred_LinearSVC_words = model_LinearSVC_words.predict(X_test_skl_words)

# Column 1 is the score of the second class.
result_linearSVC_words = evaluate(
    test_raw.labels,
    pred_LinearSVC_words,
    logits_linearSVC_words[:, 1],
)

### Char features

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# LinearSVC on character n-gram TF-IDF features.
# The SVC is wrapped in a 5-fold CalibratedClassifierCV so that predict_proba
# is available for the probability-based metric.
logger.info("Fit model")
base_svc = LinearSVC()
model_LinearSVC_chars = CalibratedClassifierCV(estimator=base_svc, cv=5).fit(
    X_train_skl_chars, train_raw.labels
)

# Despite the "logits" name, these are calibrated class probabilities.
logits_linearSVC_chars = model_LinearSVC_chars.predict_proba(X_test_skl_chars)
pred_LinearSVC_chars = model_LinearSVC_chars.predict(X_test_skl_chars)

# Column 1 is the score of the second class.
result_linearSVC_chars = evaluate(
    test_raw.labels,
    pred_LinearSVC_chars,
    logits_linearSVC_chars[:, 1],
)

# Logistic Regression


### Word features

In [None]:
# Word features

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings on word-level TF-IDF features.
logger.info("Fit model")
model_logreg_words = LogisticRegression().fit(X_train_skl_words, train_raw.labels)

# Despite the "logits" name, these are class probabilities.
logits_logreg_words = model_logreg_words.predict_proba(X_test_skl_words)
pred_logreg_words = model_logreg_words.predict(X_test_skl_words)

# Column 1 is the score of the second class.
result_linearlogreg_words = evaluate(
    test_raw.labels,
    pred_logreg_words,
    logits_logreg_words[:, 1],
)

### Char features

In [None]:
# char features

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings on character n-gram TF-IDF features.
logger.info("Fit model")
model_logreg_chars = LogisticRegression().fit(X_train_skl_chars, train_raw.labels)

# Despite the "logits" name, these are class probabilities.
logits_logreg_chars = model_logreg_chars.predict_proba(X_test_skl_chars)
pred_logreg_chars = model_logreg_chars.predict(X_test_skl_chars)

# Column 1 is the score of the second class.
result_linearlogreg_chars = evaluate(
    test_raw.labels,
    pred_logreg_chars,
    logits_logreg_chars[:, 1],
)

# SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

### Word features

In [None]:
# word features

from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier with logistic loss on word-level TF-IDF features.
logger.info("Fit model")
# random_state pins SGD's data shuffling so repeated runs give the same
# numbers, in line with the seeded train/test split.
model_sgd_words = SGDClassifier(loss='log_loss', random_state=0)
model_sgd_words.fit(X_train_skl_words, train_raw.labels)

pred_sgd_words = model_sgd_words.predict(X_test_skl_words)
# Despite the "logits" name, these are class probabilities.
logits_sgd_words = model_sgd_words.predict_proba(X_test_skl_words)

# Column 1 is the score of the second class.
result_linearsgd_words = evaluate(test_raw.labels, pred_sgd_words, logits_sgd_words[:, 1])

### Char features

In [None]:
# chars features

from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier with logistic loss on character n-gram TF-IDF features.
logger.info("Fit model")
# random_state pins SGD's data shuffling so repeated runs give the same
# numbers, in line with the seeded train/test split.
model_sgd_chars = SGDClassifier(loss='log_loss', random_state=0)
model_sgd_chars.fit(X_train_skl_chars, train_raw.labels)

pred_sgd_chars = model_sgd_chars.predict(X_test_skl_chars)
# Despite the "logits" name, these are class probabilities.
logits_sgd_chars = model_sgd_chars.predict_proba(X_test_skl_chars)

# Column 1 is the score of the second class.
result_linearsgd_chars = evaluate(test_raw.labels, pred_sgd_chars, logits_sgd_chars[:, 1])

# Naive Bayes

Overall bad performance, not worth pursuing

### Word features

In [None]:
# words features

from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on word-level TF-IDF features.
# The sparse matrices are converted with .toarray() before being handed to
# GaussianNB; the dense copies are very large.
logger.info("Fit model")
model_gnb_words = GaussianNB()
model_gnb_words.fit(X_train_skl_words.toarray(), train_raw.labels)

# Densify the test matrix once and reuse it for both calls, instead of
# building the same multi-gigabyte array twice.
X_test_dense_words = X_test_skl_words.toarray()
pred_gnb_words = model_gnb_words.predict(X_test_dense_words)
# Despite the "logits" name, these are class probabilities.
logits_gnb_words = model_gnb_words.predict_proba(X_test_dense_words)
del X_test_dense_words  # release the dense copy as soon as it is no longer needed

# Column 1 is the score of the second class.
result_lineargnb_words = evaluate(test_raw.labels, pred_gnb_words, logits_gnb_words[:, 1])

### Char features

In [None]:
# chars features

from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on character n-gram TF-IDF features.
# The sparse matrices are converted with .toarray() before being handed to
# GaussianNB; the dense copies are very large.
logger.info("Fit model")
model_gnb_chars = GaussianNB()
model_gnb_chars.fit(X_train_skl_chars.toarray(), train_raw.labels)

# Densify the test matrix once and reuse it for both calls, instead of
# building the same multi-gigabyte array twice.
X_test_dense_chars = X_test_skl_chars.toarray()
pred_gnb_chars = model_gnb_chars.predict(X_test_dense_chars)
# Despite the "logits" name, these are class probabilities.
logits_gnb_chars = model_gnb_chars.predict_proba(X_test_dense_chars)
del X_test_dense_chars  # release the dense copy as soon as it is no longer needed

# Column 1 is the score of the second class.
result_lineargnb_chars = evaluate(test_raw.labels, pred_gnb_chars, logits_gnb_chars[:, 1])

# Xgboost


In [7]:
import xgboost as xgb
import pandas as pd

# Prepare data for xgboost
def _as_dmatrix(features, labels):
    """Wrap a feature matrix and its labels (cast to pandas 'category') in an xgb.DMatrix."""
    return xgb.DMatrix(features, pd.array(labels).astype('category'))


# One train/test DMatrix pair per feature type.
train_dmat_words = _as_dmatrix(X_train_skl_words, train_raw.labels)
test_dmat_words = _as_dmatrix(X_test_skl_words, test_raw.labels)

train_dmat_chars = _as_dmatrix(X_train_skl_chars, train_raw.labels)
test_dmat_chars = _as_dmatrix(X_test_skl_chars, test_raw.labels)

### Word features

In [None]:
# Gradient-boosted trees on word-level TF-IDF features.
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",  # there is also binary:hinge but hinge does not output probability
    "tree_method": "hist",  # default to hist
    # Single "device" entry taken from the setup cell. The original dict listed
    # the key twice ("cpu", then "cuda"), so the later "cuda" silently won and
    # failed on CPU-only machines.
    "device": DEVICE,

    # Params for tree booster
    "eta": 0.3,
    "gamma": 0.0,  # Min loss achieved to split the tree
    "max_depth": 6,
    "reg_alpha": 0,
    "reg_lambda": 1,

}
# Only the training set is monitored during boosting; its metric is printed
# every 100 rounds.
evals_words = [(train_dmat_words, "train")]
iterations = 2000

model_xgb_words = xgb.train(
    params = params,
    dtrain = train_dmat_words,
    num_boost_round = iterations,
    evals = evals_words,
    verbose_eval = 100
)

# binary:logistic predictions are probabilities; threshold at 0.5 for the class label.
pred_xgb_words_probs = model_xgb_words.predict(test_dmat_words)
result_xgb_words = evaluate(test_raw.labels, pred_xgb_words_probs > 0.5, pred_xgb_words_probs)

### Char features

In [None]:
# Gradient-boosted trees on character n-gram TF-IDF features.
import pandas as pd
import xgboost as xgb

# Use a reduced character vocabulary for xgboost: 20000 features, versus the
# 50000 used by chars_encoder.
xgb_chars_encoder = TfidfVectorizer(max_features=20000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
xgb_chars_encoder.fit(train_raw.texts)

# Prepare train & test set
X_train_xgb_chars = xgb_chars_encoder.transform(train_raw.texts)
X_test_xgb_chars = xgb_chars_encoder.transform(test_raw.texts)

# These replace any char DMatrix objects built earlier from the larger encoder.
train_dmat_chars = xgb.DMatrix(X_train_xgb_chars, pd.array(train_raw.labels).astype('category'))
test_dmat_chars = xgb.DMatrix(X_test_xgb_chars, pd.array(test_raw.labels).astype('category'))

params = {
    "booster": "gbtree",
    "objective": "binary:logistic",  # there is also binary:hinge but hinge does not output probability
    "tree_method": "hist",  # default to hist
    # Single "device" entry taken from the setup cell. The original dict listed
    # the key twice ("cpu", then "cuda"), so the later "cuda" silently won and
    # failed on CPU-only machines.
    "device": DEVICE,

    # Params for tree booster
    "eta": 0.3,
    "gamma": 0.0,  # Min loss achieved to split the tree
    "max_depth": 6,
    "reg_alpha": 0,
    "reg_lambda": 1,

}
# Only the training set is monitored during boosting; its metric is printed
# every 100 rounds.
evals_chars = [(train_dmat_chars, "train")]
iterations = 2000

model_xgb_chars = xgb.train(
    params = params,
    dtrain = train_dmat_chars,
    num_boost_round = iterations,
    evals = evals_chars,
    verbose_eval = 100
)

# binary:logistic predictions are probabilities; threshold at 0.5 for the class label.
pred_xgb_chars_probs = model_xgb_chars.predict(test_dmat_chars)
result_xgb_chars = evaluate(test_raw.labels, pred_xgb_chars_probs > 0.5, pred_xgb_chars_probs)