# Load packages

In [1]:
import sys
from pathlib import Path

PARENT_DIR = Path.cwd().parent.parent
sys.path.append(str(PARENT_DIR))

import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

from lib.models import TrainConfig, NeuralNetwork, save_model
from lib.data_processing import load_data, split_data, encode_data

from tqdm import tqdm

# Report which compute device PyTorch can use: "cpu" when CUDA is unavailable,
# otherwise one "Device: cuda" line plus the device name for every visible GPU.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    gpu_count = torch.cuda.device_count()
    for gpu_index in range(gpu_count):
        print("Device: cuda")
        print(torch.cuda.get_device_name(gpu_index))

Device: cuda
NVIDIA GeForce RTX 3050 Ti Laptop GPU


# Load data

In [2]:
# Location of the training TSV, relative to the repository root held in PARENT_DIR.
train_tsv_path = f"{PARENT_DIR}/data/power/power-gb-train.tsv"

# load_data takes a list of file paths; only the single training file is passed here.
data = load_data(file_path_list=[train_tsv_path])

# Split into train/test portions with a fixed random_state so the split is repeatable.
# NOTE(review): test_size=0.2 is presumably a 20% hold-out fraction — confirm in lib.data_processing.
train_raw, test_raw = split_data(data, random_state=0, test_size=0.2)

# Set training configurations

In [3]:
# Training settings shared by every experiment below: 10 epochs, early stopping off.
# NOTE(review): violation_limit presumably only matters when early_stop is True —
# confirm against lib.models.TrainConfig.
train_config = TrainConfig(
    num_epochs=10,
    early_stop=False,
    violation_limit=5,
)

# Experiment

In [4]:
# Sweep the Bag-of-Words vocabulary cap (CountVectorizer's max_features).
# For each value: fit a vectorizer on the training texts, encode both splits,
# train a fresh NeuralNetwork, save it, and print metrics on the held-out split.
max_features = [1000, 5000, 10000, 20000, 50000]

# PARENT_DIR is already a Path, so join with "/" instead of string concatenation.
models_dir = PARENT_DIR / "models" / "feature_engineering"
# exist_ok=True makes this a no-op when the directory already exists,
# so no separate exists() check is needed.
models_dir.mkdir(parents=True, exist_ok=True)

# Fall back to the CPU so the sweep also runs on machines without a GPU
# (the first cell already anticipates that case).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Re-seeding torch with the same value before every run makes weight
# initialisation and DataLoader shuffling repeatable, so differences between
# runs come from max_features rather than from random state.
seed = 0

for i, n_features in enumerate(max_features):
    print(f"Experiment {i+1}: Bag-of-Words with max_features {n_features}")

    torch.manual_seed(seed)

    # Encode
    print("Prepare data...")
    # Fit on the training texts only, so the vocabulary never sees the test split.
    bow_vectorizer = CountVectorizer(max_features=n_features)
    bow_vectorizer.fit(train_raw.texts)

    train_data_nn = encode_data(train_raw, bow_vectorizer)
    test_data_nn = encode_data(test_raw, bow_vectorizer)

    dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

    # Train
    print("Train model...")
    # The fitted vocabulary can be smaller than the requested cap, so size the
    # input layer from the vocabulary actually learned.
    model_nn = NeuralNetwork(
        input_size=len(bow_vectorizer.vocabulary_),
        hidden_size=128,
        device=device,
    )
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)

    save_model(model_nn, models_dir, f"model_bow_max_features_{n_features}.pt")

    # Test
    # Gradients are not needed for inference; the whole test split is scored
    # in one batch built from the (features, label) pairs of the dataset.
    with torch.no_grad():
        X_test = torch.stack([sample[0] for sample in test_data_nn]).to(model_nn.device)
        y_test = torch.stack([sample[1] for sample in test_data_nn]).to(model_nn.device)
        y_pred = model_nn.predict(X_test)

    # Evaluate
    # scikit-learn metrics work on host memory, so move both tensors off the device.
    y_test = y_test.cpu()
    y_pred = y_pred.cpu()
    print("Evaluation results")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {fscore:.4f}")

    # NOTE(review): y_pred is also fed to accuracy_score, so it holds hard class
    # labels; an AUC computed from hard labels reflects a single operating point
    # rather than the full ROC curve. If the model can expose scores or
    # probabilities, pass those to roc_auc_score instead.
    auc = roc_auc_score(y_test, y_pred)
    print(f"AUC: {auc:.4f}")
    print()

Experiment 1: Bag-of-Words with max_features 1000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:01<00:00, 179.56batch/s, batch_accuracy=0.714, loss=0.541]
Epoch 2: 100%|██████████| 209/209 [00:00<00:00, 241.61batch/s, batch_accuracy=0.714, loss=0.789]
Epoch 3: 100%|██████████| 209/209 [00:00<00:00, 247.37batch/s, batch_accuracy=1, loss=0.252]    
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 203.70batch/s, batch_accuracy=0.857, loss=0.615]
Epoch 5: 100%|██████████| 209/209 [00:00<00:00, 229.70batch/s, batch_accuracy=1, loss=0.12]     
Epoch 6: 100%|██████████| 209/209 [00:00<00:00, 217.48batch/s, batch_accuracy=1, loss=0.0549]   
Epoch 7: 100%|██████████| 209/209 [00:00<00:00, 214.79batch/s, batch_accuracy=1, loss=0.315]    
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 189.18batch/s, batch_accuracy=1, loss=0.0361]    
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 175.56batch/s, batch_accuracy=1, loss=0.024]     
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 163.43batch/s, batch_accuracy=1, loss=0.0279]    


Evaluation results
Accuracy: 0.7282
Precision: 0.7448
Recall: 0.7809
F1: 0.7624
AUC: 0.7212

Experiment 2: Bag-of-Words with max_features 5000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:01<00:00, 147.60batch/s, batch_accuracy=1, loss=0.347]    
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 199.62batch/s, batch_accuracy=0.857, loss=0.332]
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 192.75batch/s, batch_accuracy=1, loss=0.201]    
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 190.23batch/s, batch_accuracy=1, loss=0.187]    
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 187.53batch/s, batch_accuracy=0.857, loss=0.856] 
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 179.46batch/s, batch_accuracy=1, loss=0.078]     
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 183.43batch/s, batch_accuracy=1, loss=0.00176]   
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 172.39batch/s, batch_accuracy=1, loss=0.0193]    
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 173.28batch/s, batch_accuracy=1, loss=0.0035]    
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 173.76batch/s, batch_accuracy=1, loss=0.000325]  


Evaluation results
Accuracy: 0.7401
Precision: 0.7360
Recall: 0.8338
F1: 0.7819
AUC: 0.7277

Experiment 3: Bag-of-Words with max_features 10000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:01<00:00, 110.72batch/s, batch_accuracy=0.857, loss=0.509]
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 159.59batch/s, batch_accuracy=0.857, loss=0.349]
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 155.53batch/s, batch_accuracy=0.857, loss=0.523]
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 152.59batch/s, batch_accuracy=0.857, loss=0.323]
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 150.37batch/s, batch_accuracy=1, loss=0.0226]    
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 148.94batch/s, batch_accuracy=1, loss=0.0332]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 148.56batch/s, batch_accuracy=1, loss=0.00159]   
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 147.87batch/s, batch_accuracy=1, loss=0.00701]   
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 147.90batch/s, batch_accuracy=1, loss=6.79e-5]   
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 140.76batch/s, batch_accuracy=1, loss=0.0032]    


Evaluation results
Accuracy: 0.7344
Precision: 0.7259
Recall: 0.8427
F1: 0.7799
AUC: 0.7200

Experiment 4: Bag-of-Words with max_features 20000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:02<00:00, 85.67batch/s, batch_accuracy=0.714, loss=0.699] 
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 115.22batch/s, batch_accuracy=0.857, loss=0.612]
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 114.12batch/s, batch_accuracy=1, loss=0.198]    
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 114.95batch/s, batch_accuracy=1, loss=0.265]     
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 112.03batch/s, batch_accuracy=1, loss=0.0227]    
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 109.41batch/s, batch_accuracy=1, loss=0.0613]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 106.49batch/s, batch_accuracy=1, loss=0.00408]   
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 106.60batch/s, batch_accuracy=1, loss=0.00673]   
Epoch 9: 100%|██████████| 209/209 [00:02<00:00, 97.45batch/s, batch_accuracy=1, loss=0.000897]   
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 111.53batch/s, batch_accuracy=1, loss=0.107]     


Evaluation results
Accuracy: 0.7273
Precision: 0.7471
Recall: 0.7736
F1: 0.7601
AUC: 0.7211

Experiment 5: Bag-of-Words with max_features 50000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:04<00:00, 49.28batch/s, batch_accuracy=0.714, loss=0.631]
Epoch 2: 100%|██████████| 209/209 [00:03<00:00, 57.18batch/s, batch_accuracy=1, loss=0.145]    
Epoch 3: 100%|██████████| 209/209 [00:03<00:00, 59.71batch/s, batch_accuracy=0.857, loss=0.461]
Epoch 4: 100%|██████████| 209/209 [00:03<00:00, 59.06batch/s, batch_accuracy=1, loss=0.209]     
Epoch 5: 100%|██████████| 209/209 [00:03<00:00, 63.58batch/s, batch_accuracy=1, loss=0.00215]   
Epoch 6: 100%|██████████| 209/209 [00:03<00:00, 63.57batch/s, batch_accuracy=1, loss=0.00718]   
Epoch 7: 100%|██████████| 209/209 [00:03<00:00, 63.87batch/s, batch_accuracy=1, loss=5.2e-6]    
Epoch 8: 100%|██████████| 209/209 [00:03<00:00, 61.70batch/s, batch_accuracy=1, loss=0.00919]   
Epoch 9: 100%|██████████| 209/209 [00:03<00:00, 60.63batch/s, batch_accuracy=1, loss=0.000508]  
Epoch 10: 100%|██████████| 209/209 [00:03<00:00, 63.05batch/s, batch_accuracy=1, loss=0.00249]    


Evaluation results
Accuracy: 0.7261
Precision: 0.7301
Recall: 0.8084
F1: 0.7673
AUC: 0.7152

