In [1]:
# Load packages
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.svm import SVC, NuSVC, LinearSVC

from models import NeuralNetwork, TrainConfig, evaluate_nn_model, save_model, load_model, plot_results
from utils import load_data, split_data, encode_data, mapping_dict
from pathlib import Path
import altair as alt
import pandas as pd

from tqdm import tqdm

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(i))
else:
    print("Device: cpu")

Device: cpu


# Load data

In [2]:
data = load_data(folder_path="data/train/power/", file_list=['power-hr-train.tsv'],text_head='text')
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)


# Encode

In [3]:

print("Prepare data encoder...")
tfidf_encoder = TfidfVectorizer(max_features=50000)
tfidf_encoder.fit(train_raw.texts)

print("Prepare data...")
train_data_nn = encode_data(train_raw, tfidf_encoder)
test_data_nn = encode_data(test_raw, tfidf_encoder)

Prepare data encoder...
Prepare data...


# Neural Networks

In [5]:
# POC

print("Train model")
models_dir = Path('models/hr')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1:   0%|          | 0/68 [00:00<?, ?batch/s]

Epoch 1: 100%|██████████| 68/68 [00:00<00:00, 81.29batch/s, batch_accuracy=0.675, loss=0.602]
Epoch 2: 100%|██████████| 68/68 [00:00<00:00, 83.85batch/s, batch_accuracy=0.818, loss=0.51] 
Epoch 3: 100%|██████████| 68/68 [00:00<00:00, 89.02batch/s, batch_accuracy=0.935, loss=0.281]
Epoch 4: 100%|██████████| 68/68 [00:00<00:00, 84.95batch/s, batch_accuracy=0.987, loss=0.123] 
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 88.68batch/s, batch_accuracy=0.974, loss=0.0692]
Epoch 6: 100%|██████████| 68/68 [00:00<00:00, 78.36batch/s, batch_accuracy=1, loss=0.00678]   
Epoch 7: 100%|██████████| 68/68 [00:00<00:00, 72.39batch/s, batch_accuracy=1, loss=0.00374]   
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 65.91batch/s, batch_accuracy=1, loss=0.00127]    
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 73.32batch/s, batch_accuracy=1, loss=0.000529]
Epoch 10: 100%|██████████| 68/68 [00:00<00:00, 79.38batch/s, batch_accuracy=1, loss=0.000966]


(0.5334207077326344, 0.5145385587863464, 0.5238095238095238, None)
AUC 0.6200294952759797


In [6]:
# Parameters finding

print("Train model")
models_dir = Path('models/hr')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.0001, "weight_decay": 0.001, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 68/68 [00:00<00:00, 83.39batch/s, batch_accuracy=0.766, loss=0.574]
Epoch 2: 100%|██████████| 68/68 [00:00<00:00, 87.64batch/s, batch_accuracy=0.831, loss=0.482]
Epoch 3: 100%|██████████| 68/68 [00:00<00:00, 77.28batch/s, batch_accuracy=0.935, loss=0.228]
Epoch 4: 100%|██████████| 68/68 [00:00<00:00, 80.56batch/s, batch_accuracy=0.987, loss=0.0843]
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 75.62batch/s, batch_accuracy=1, loss=0.0257]    
Epoch 6: 100%|██████████| 68/68 [00:00<00:00, 78.07batch/s, batch_accuracy=1, loss=0.0077]    
Epoch 7: 100%|██████████| 68/68 [00:00<00:00, 83.77batch/s, batch_accuracy=1, loss=0.00416]   
Epoch 8: 100%|██████████| 68/68 [00:00<00:00, 79.57batch/s, batch_accuracy=1, loss=0.00167]   
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 71.88batch/s, batch_accuracy=1, loss=0.000639]
Epoch 10: 100%|██████████| 68/68 [00:00<00:00, 84.18batch/s, batch_accuracy=1, loss=0.000394]


(0.5217948717948718, 0.5145385587863464, 0.5181413112667091, None)
AUC 0.6134759100793722


In [7]:
# Drop out

print("Train model")
models_dir = Path('models/hr')

if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
    optimizer_params= {"lr": 0.001, "weight_decay": 0.01, }
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False

model_nn = NeuralNetwork(
    input_size=len(tfidf_encoder.vocabulary_),
    hidden_size=128,
    dropout=0.5,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)

Train model



Epoch 1: 100%|██████████| 68/68 [00:00<00:00, 78.29batch/s, batch_accuracy=0.494, loss=0.686]
Epoch 2: 100%|██████████| 68/68 [00:00<00:00, 77.96batch/s, batch_accuracy=0.844, loss=0.453]
Epoch 3: 100%|██████████| 68/68 [00:00<00:00, 72.24batch/s, batch_accuracy=0.922, loss=0.238]
Epoch 4: 100%|██████████| 68/68 [00:00<00:00, 77.01batch/s, batch_accuracy=0.974, loss=0.0925]
Epoch 5: 100%|██████████| 68/68 [00:00<00:00, 80.66batch/s, batch_accuracy=1, loss=0.0164]    
Epoch 6: 100%|██████████| 68/68 [00:00<00:00, 73.72batch/s, batch_accuracy=1, loss=0.00961]   
Epoch 7: 100%|██████████| 68/68 [00:00<00:00, 78.78batch/s, batch_accuracy=1, loss=0.00853]   
Epoch 8: 100%|██████████| 68/68 [00:00<00:00, 79.81batch/s, batch_accuracy=1, loss=0.004]     
Epoch 9: 100%|██████████| 68/68 [00:00<00:00, 74.57batch/s, batch_accuracy=1, loss=0.00379]  
Epoch 10: 100%|██████████| 68/68 [00:00<00:00, 70.09batch/s, batch_accuracy=1, loss=0.0015]    


(0.5389876880984952, 0.49810366624525915, 0.5177398160315374, None)
AUC 0.6191366442251739


# Other classifiers

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

"Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers."

## SVC, SVM
Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

LinearSVC with TfIdf did good on balanced English

In [8]:
from sklearn.svm import LinearSVC
# LinearSVC, tfidf
X_train = tfidf_encoder.transform(train_raw.texts)
print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)

pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))


Fit model




(0.5743145743145743, 0.5031605562579013, 0.5363881401617251, None)
AUC: 0.6378562997172315


## SGDClassifier
SGD requires a number of hyperparameters such as the regularization parameter and the number of iterations.

SGD is sensitive to feature scaling.

In [9]:
from sklearn.linear_model import SGDClassifier

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(tfidf_encoder.transform(test_raw.texts))

print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))

roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


(0.6208695652173913, 0.45132743362831856, 0.5226939970717424, None)


0.6416236242929566

## Naive Bayes

Overall bad performance, not worth pursuing

In [10]:
from sklearn.naive_bayes import GaussianNB

model_GaussianNB_tfidf = GaussianNB()
model_GaussianNB_tfidf.fit(X_train.toarray(), train_raw.labels)

pred_GaussianNB_tfidf = model_GaussianNB_tfidf.predict(tfidf_encoder.transform(test_raw.texts).toarray())

print(precision_recall_fscore_support(test_raw.labels, pred_GaussianNB_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_GaussianNB_tfidf)

(0.4217687074829932, 0.5486725663716814, 0.47692307692307695, None)


0.5449608013045762

# Observations

- Neural network is still a good option
- sklearn's SGD is also good

# Test features

## Standard count vectors & scale
Not good on both LinearSVC and SGD

In [11]:

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

encoding_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('scaler', StandardScaler(with_mean=False))
])

encoding_pipeline.fit(train_raw.texts)

X_train = encoding_pipeline.transform(train_raw.texts)


print("Fit model")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)
pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(encoding_pipeline.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


Fit model




(0.4963054187192118, 0.5094816687737042, 0.5028072364316906, None)
AUC: 0.5970692846567055
(0.48759305210918114, 0.4968394437420986, 0.49217282404508456, None)


0.5892061520946422

## word TFIDF

In [12]:
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,5), max_features=10000)

X_train = word_tfidf.fit_transform(train_raw.texts)


print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.5041322314049587, 0.46270543615676357, 0.48253131179960446, None)
AUC: 0.5925709139149277
SGDClassifier
(0.5388429752066116, 0.41213653603034134, 0.4670487106017192, None)


0.5985123697884936

Use more tfidf word (50000) features improve 1%, but takes much more time to transform

In [13]:
word_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="word", ngram_range=(3,7), max_features=50000)

X_train = word_tfidf.fit_transform(train_raw.texts)

import scipy
scipy.sparse.save_npz("models/tfidf/ngram_word_3to7_50000.npz", X_train)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(word_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.5390505359877489, 0.4450063211125158, 0.48753462603878117, None)
AUC: 0.6064661520751476
SGDClassifier
(0.5610169491525424, 0.41845764854614415, 0.47936278059377263, None)


0.6093830262776981

Char ngram

In [14]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, analyzer="char", ngram_range=(3,7), max_features=50000)

X_train = char_tfidf.fit_transform(train_raw.texts)

import scipy
scipy.sparse.save_npz("models/tfidf/ngram_char_3to7_50000.npz", X_train)

print("LinearSVC")
model_LinearSVC_tfidf = LinearSVC()
model_LinearSVC_tfidf.fit(X_train, train_raw.labels)
pred_LinearSVC_tfidf = model_LinearSVC_tfidf.predict(char_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_LinearSVC_tfidf, average='binary'))
print("AUC:", roc_auc_score(y_test, pred_LinearSVC_tfidf))

print("SGDClassifier")
model_SGDClassifier_tfidf = SGDClassifier()
model_SGDClassifier_tfidf.fit(X_train, train_raw.labels)

pred_SGDClassifier_tfidf = model_SGDClassifier_tfidf.predict(char_tfidf.transform(test_raw.texts))
print(precision_recall_fscore_support(test_raw.labels, pred_SGDClassifier_tfidf, average='binary'))
roc_auc_score(test_raw.labels, pred_SGDClassifier_tfidf)


LinearSVC




(0.5706371191135734, 0.5208596713021492, 0.5446133509583608, None)
AUC: 0.6409232820658781
SGDClassifier
(0.6167247386759582, 0.4475347661188369, 0.5186813186813187, None)


0.6389562805150854

In [15]:
# Test char tfidf feature on NN
train_data_nn = encode_data(train_raw, char_tfidf)
test_data_nn = encode_data(test_raw, char_tfidf)

train_config = TrainConfig(
    num_epochs      = 10,
    early_stop      = False,
    violation_limit = 5,
)

dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

USE_CACHE = False


model_nn = NeuralNetwork(
    input_size=len(char_tfidf.vocabulary_),
    hidden_size=128,
    device='cpu'
)

if (models_dir / 'model_nn.pt').exists() and USE_CACHE:
    model_nn = load_model(model_nn, models_dir, 'model_nn')
else:
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    save_model(model_nn, models_dir, "model_nn")


with torch.no_grad():
    # X_test = torch.stack([dta[0] for dta in test])
    X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
    y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
    y_pred = model_nn.predict(X_test)


print(precision_recall_fscore_support(y_test, y_pred, average='binary'))
print("AUC", roc_auc_score(y_test, y_pred))

# Plot training accuracy and loss side-by-side
plot_results(model_nn, train_config, dataloader)




Epoch 1: 100%|██████████| 68/68 [00:01<00:00, 50.77batch/s, batch_accuracy=0.623, loss=0.663]
Epoch 2: 100%|██████████| 68/68 [00:01<00:00, 57.77batch/s, batch_accuracy=0.87, loss=0.382] 
Epoch 3: 100%|██████████| 68/68 [00:01<00:00, 62.57batch/s, batch_accuracy=0.896, loss=0.286]
Epoch 4: 100%|██████████| 68/68 [00:01<00:00, 62.21batch/s, batch_accuracy=0.948, loss=0.213]
Epoch 5: 100%|██████████| 68/68 [00:01<00:00, 59.00batch/s, batch_accuracy=0.974, loss=0.126] 
Epoch 6: 100%|██████████| 68/68 [00:01<00:00, 61.50batch/s, batch_accuracy=0.987, loss=0.0883]
Epoch 7: 100%|██████████| 68/68 [00:01<00:00, 59.31batch/s, batch_accuracy=1, loss=0.045]     
Epoch 8: 100%|██████████| 68/68 [00:01<00:00, 59.67batch/s, batch_accuracy=1, loss=0.019]     
Epoch 9: 100%|██████████| 68/68 [00:01<00:00, 63.09batch/s, batch_accuracy=1, loss=0.00419]   
Epoch 10: 100%|██████████| 68/68 [00:01<00:00, 62.57batch/s, batch_accuracy=1, loss=0.00185]   


(0.546975546975547, 0.5372945638432364, 0.5420918367346939, None)
AUC 0.6329495178506852
