In [1]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.svm import SVC, NuSVC, LinearSVC

from models import NeuralNetwork, TrainConfig, evaluate_nn_model, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(i))
else:
    print("Device: cpu")

Device: cpu


# Load data

In [2]:
data = load_data(folder_path="data/train/power/", file_list=['power-gb-train.tsv'],text_head='text_en')
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)


# Encode

In [3]:

print("Prepare data encoder...")
word_encoder = TfidfVectorizer(max_features=50000)
word_encoder.fit(train_raw.texts)

Prepare data encoder...


# Neural Networks

In [11]:
# POC
print("Prepare data...")
train_data_nn = encode_data(train_raw, tfidf_encoder)
test_data_nn = encode_data(test_raw, tfidf_encoder)

print("Train model")
models_dir = Path('models/gb')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 209/209 [00:05<00:00, 40.12batch/s, batch_accuracy=1, loss=0.22]     
Epoch 2: 100%|██████████| 209/209 [00:04<00:00, 42.84batch/s, batch_accuracy=1, loss=0.28]     
Epoch 3: 100%|██████████| 209/209 [00:04<00:00, 42.65batch/s, batch_accuracy=1, loss=0.127]    
Epoch 4: 100%|██████████| 209/209 [00:04<00:00, 44.02batch/s, batch_accuracy=1, loss=0.0512]    
Epoch 5: 100%|██████████| 209/209 [00:04<00:00, 43.34batch/s, batch_accuracy=1, loss=0.00795]   
Epoch 6: 100%|██████████| 209/209 [00:04<00:00, 43.90batch/s, batch_accuracy=1, loss=0.281]     
Epoch 7: 100%|██████████| 209/209 [00:05<00:00, 41.06batch/s, batch_accuracy=1, loss=0.000372]  
Epoch 8: 100%|██████████| 209/209 [00:05<00:00, 41.46batch/s, batch_accuracy=1, loss=0.000976]   
Epoch 9: 100%|██████████| 209/209 [00:04<00:00, 42.94batch/s, batch_accuracy=1, loss=9.73e-5]    
Epoch 10: 100%|██████████| 209/209 [00:05<00:00, 40.96batch/s, batch_accuracy=1, loss=0.000106]  


(0.7205994411988824, 0.7665495811942719, 0.7428646242471851, None)
AUC 0.6952406025629478


In [4]:
# Parameters finding

print("Train model")
models_dir = Path('models/gb')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.0001, "weight_decay": 0.001, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 209/209 [00:04<00:00, 44.74batch/s, batch_accuracy=1, loss=0.259]    
Epoch 2: 100%|██████████| 209/209 [00:04<00:00, 46.15batch/s, batch_accuracy=0.857, loss=0.224]
Epoch 3: 100%|██████████| 209/209 [00:04<00:00, 45.29batch/s, batch_accuracy=1, loss=0.156]     
Epoch 4: 100%|██████████| 209/209 [00:04<00:00, 45.85batch/s, batch_accuracy=1, loss=0.442]     
Epoch 5: 100%|██████████| 209/209 [00:04<00:00, 44.90batch/s, batch_accuracy=0.857, loss=0.309] 
Epoch 6: 100%|██████████| 209/209 [00:04<00:00, 43.80batch/s, batch_accuracy=1, loss=0.000741]  
Epoch 7: 100%|██████████| 209/209 [00:04<00:00, 45.52batch/s, batch_accuracy=1, loss=0.000174]  
Epoch 8: 100%|██████████| 209/209 [00:04<00:00, 46.16batch/s, batch_accuracy=1, loss=0.0255]    
Epoch 9: 100%|██████████| 209/209 [00:04<00:00, 45.34batch/s, batch_accuracy=1, loss=3.27e-5]   
Epoch 10: 100%|██████████| 209/209 [00:04<00:00, 45.90batch/s, batch_accuracy=1, loss=4.23e-6]  


(0.7226652945716491, 0.7589840583626047, 0.7403795466526094, None)
AUC 0.695218524907798


In [5]:
# Drop out

print("Train model")
models_dir = Path('models/gb')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.001, "weight_decay": 0.01, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    dropout=0.5,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 209/209 [00:05<00:00, 41.70batch/s, batch_accuracy=0.714, loss=0.424]
Epoch 2: 100%|██████████| 209/209 [00:04<00:00, 46.36batch/s, batch_accuracy=1, loss=0.123]    
Epoch 3: 100%|██████████| 209/209 [00:04<00:00, 45.63batch/s, batch_accuracy=0.857, loss=0.685]
Epoch 4: 100%|██████████| 209/209 [00:04<00:00, 45.54batch/s, batch_accuracy=1, loss=0.0587]    
Epoch 5: 100%|██████████| 209/209 [00:04<00:00, 45.10batch/s, batch_accuracy=1, loss=0.00202]   
Epoch 6: 100%|██████████| 209/209 [00:04<00:00, 45.45batch/s, batch_accuracy=1, loss=0.00921]   
Epoch 7: 100%|██████████| 209/209 [00:04<00:00, 45.36batch/s, batch_accuracy=1, loss=8.01e-5]   
Epoch 8: 100%|██████████| 209/209 [00:04<00:00, 45.52batch/s, batch_accuracy=1, loss=0.00048]  
Epoch 9: 100%|██████████| 209/209 [00:04<00:00, 45.43batch/s, batch_accuracy=1, loss=0.000123]  
Epoch 10: 100%|██████████| 209/209 [00:04<00:00, 44.88batch/s, batch_accuracy=1, loss=0.0189]    


(0.7170138888888888, 0.7811402323696298, 0.7477046424414846, None)
AUC 0.6955273811420799


# Other classifiers

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

"Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers."

## SVC, SVM
Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

LinearSVC with TfIdf did good on balanced English

In [7]:
from sklearn.svm import LinearSVC
# LinearSVC, tfidf
X_train = tfidf_encoder.transform(train_raw.texts)
print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)

pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))


Fit model




(0.749000999000999, 0.8103215347203458, 0.7784555483452303, None)
AUC: 0.7333658955653011


## SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

In [10]:
from sklearn.linear_model import SGDClassifier

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))

roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


(0.7422609493235497, 0.8746284787895163, 0.803026544281816, None)


0.7451774872580061

## Naive Bayes

Overall bad performance, not worth pursuing

In [11]:
from sklearn.naive_bayes import GaussianNB

model_GaussianNB_tfidf = GaussianNB()
model_GaussianNB_tfidf.fit(X_train.toarray(), train_raw.labels)

pred_GaussianNB_tfidf = model_GaussianNB_tfidf.predict(tfidf_encoder.transform(test_raw.texts).toarray())

print(precision_recall_fscore_support(test_raw.labels, pred_GaussianNB_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_GaussianNB_tfidf)

(0.620243232640251, 0.427181842745204, 0.50592, None)


0.5481208359025165

# Observations

- Neural network is still a good option
- sklearn's SGD is also good

# Test features

## Standard count vectors & scale
Not good on both LinearSVC and SGD

In [14]:

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

encoding_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('scaler', StandardScaler(with_mean=False))
])

encoding_pipeline.fit(train_raw.texts)

X_train = encoding_pipeline.transform(train_raw.texts)


print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)
pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


Fit model




(0.689257969461559, 0.6952175087814104, 0.6922249125638956, None)
AUC: 0.6493181561001069


## Character-level TFIDF

In [23]:
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,5), max_features=10000)

X_train = word_tfidf.fit_transform(train_raw.texts)


print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.753175591531756, 0.8170764658200487, 0.7838258164852255, None)
AUC: 0.7391365235083149
SGDClassifier
(0.7359676695105523, 0.8857065657930289, 0.8039239730226855, None)


0.7418276418708734

Use more tfidf word (50000) features improve 1%, but takes much more time to transform

In [24]:
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,7), max_features=50000)

X_train = word_tfidf.fit_transform(train_raw.texts)

import scipy
scipy.sparse.save_npz("models/tfidf/ngram_word_3to7_50000.npz", X_train)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.7683168316831683, 0.8386922453390976, 0.801963570598114, None)
AUC: 0.7593461226695487
SGDClassifier
(0.7429515418502203, 0.9113753039718995, 0.8185899769445456, None)


0.7562004724987703

Char ngram is slower and offers the same performance

In [35]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(3,7), max_features=50000)

X_train = char_tfidf.fit_transform(train_raw.texts)

import scipy
scipy.sparse.save_npz("models/tfidf/ngram_char_3to7_50000.npz", X_train)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(char_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(char_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.7789526686807654, 0.8359902728992165, 0.8064642252052652, None)
AUC: 0.7679096663641382
SGDClassifier
(0.7625231910946196, 0.88840853823291, 0.8206664170722576, None)


0.7691615340737199

In [37]:
# Test char tfidf feature on NN
train_data_nn = encode_data(train_raw, char_tfidf)
test_data_nn = encode_data(test_raw, char_tfidf)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False


model_nn = NeuralNetwork(
    input_size=len(char_tfidf.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)




Epoch 1: 100%|██████████| 209/209 [00:04<00:00, 45.44batch/s, batch_accuracy=1, loss=0.22]     
Epoch 2: 100%|██████████| 209/209 [00:03<00:00, 58.35batch/s, batch_accuracy=1, loss=0.205]    
Epoch 3: 100%|██████████| 209/209 [00:03<00:00, 57.01batch/s, batch_accuracy=1, loss=0.112]    
Epoch 4: 100%|██████████| 209/209 [00:03<00:00, 53.92batch/s, batch_accuracy=1, loss=0.428]    
Epoch 5: 100%|██████████| 209/209 [00:03<00:00, 54.05batch/s, batch_accuracy=1, loss=0.0821]   
Epoch 6: 100%|██████████| 209/209 [00:03<00:00, 54.59batch/s, batch_accuracy=1, loss=0.135]     
Epoch 7: 100%|██████████| 209/209 [00:03<00:00, 53.61batch/s, batch_accuracy=1, loss=0.0163]    
Epoch 8: 100%|██████████| 209/209 [00:03<00:00, 56.02batch/s, batch_accuracy=1, loss=0.0147]    
Epoch 9: 100%|██████████| 209/209 [00:04<00:00, 49.90batch/s, batch_accuracy=1, loss=0.000198]   
Epoch 10: 100%|██████████| 209/209 [00:04<00:00, 51.04batch/s, batch_accuracy=0.857, loss=0.711]


(0.7721052631578947, 0.7927587138611186, 0.782295693907479, None)
AUC 0.7483451688963714


# Xgboost


In [8]:
tfidf_features = tfidf_encoder.transform(train_raw.texts)

In [11]:
import xgboost as xgb
import pandas as pd

# Prepare data for xgboost
train_dmat = xgb.DMatrix(tfidf_encoder.transform(train_raw.texts).todense(), pd.array(train_raw.labels).astype('category'))
test_dmat = xgb.DMatrix(tfidf_encoder.transform(test_raw.texts).todense(), pd.array(test_raw.labels).astype('category'))

In [13]:
params = {
    "booster": "gbtree",
    "device": "cpu",
    "objective": "binary:logistic",  # there is also binary:hinge
    "tree_method": "auto",  # default to hist

    # Params for tree booster
    "eta": 0.3,
    "gamma": 0.0,  # Min loss achieved to split the tree
    "max_depth": 6,
    "reg_alpha": 0,
    "reg_lambda": 1,

}
evals = [(train_dmat, "train")]
iterations = 5000

model_xgb = xgb.train(
    params = params,
    dtrain = train_dmat,
    num_boost_round = iterations,
    evals = evals,
    verbose_eval = 250
)

[0]	train-logloss:0.62286
