# Load packages

In [1]:
import sys
from pathlib import Path

PARENT_DIR = Path.cwd().parent.parent
sys.path.append(str(PARENT_DIR))

import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

from lib.models import TrainConfig, NeuralNetwork, save_model
from lib.data_processing import load_data, split_data, encode_data

from tqdm import tqdm

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print("Device: cuda")
        print(torch.cuda.get_device_name(i))
else:
    print("Device: cpu")

Device: cuda
NVIDIA GeForce RTX 3050 Ti Laptop GPU


# Load data

In [2]:
data = load_data(file_path_list=[str(PARENT_DIR) + "/data/power/power-gb-train.tsv"])
train_raw, test_raw = split_data(data, test_size=0.2, random_state=0)

# Set training configurations

In [3]:
train_config = TrainConfig(num_epochs=10,early_stop=False,violation_limit=5)

# Experiment

In [4]:
max_features = [1000, 5000, 10000, 20000, 50000]
models_dir = Path(str(PARENT_DIR) + '/models/feature_engineering')
if not models_dir.exists():
    models_dir.mkdir(parents=True, exist_ok=True)
    
for i in range(len(max_features)):
    print(f"Experiment {i+1}: Bag-of-Words with max_features {max_features[i]}")
    
    # Encode
    print("Prepare data...")
    bow_vectorizer = CountVectorizer(max_features=max_features[i])
    bow_vectorizer.fit(train_raw.texts)
    
    train_data_nn = encode_data(train_raw, bow_vectorizer)
    test_data_nn = encode_data(test_raw, bow_vectorizer)
    
    dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)
    
    # Train
    print("Train model...")
    model_nn = NeuralNetwork(input_size=len(bow_vectorizer.vocabulary_), hidden_size=128, device="cuda")
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)
    
    save_model(model_nn, models_dir, f"model_bow_max_features_{max_features[i]}.pt")
    
    # Test
    with torch.no_grad():
        X_test = torch.stack([test[0] for test in test_data_nn]).to(model_nn.device)
        y_test = torch.stack([test[1] for test in test_data_nn]).to(model_nn.device)
        y_pred = model_nn.predict(X_test)
    
    # Evaluate
    y_test = y_test.cpu()
    y_pred = y_pred.cpu()
    print("Evaluation results")
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {fscore:.4f}")
    
    auc = roc_auc_score(y_test, y_pred)
    print(f"AUC: {auc:.4f}")
    print()

Experiment 1: Bag-of-Words with max_features 1000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:02<00:00, 77.64batch/s, batch_accuracy=0.857, loss=0.454] 
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 121.01batch/s, batch_accuracy=0.571, loss=0.919]
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 123.22batch/s, batch_accuracy=0.714, loss=0.601]
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 120.61batch/s, batch_accuracy=1, loss=0.292]    
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 126.05batch/s, batch_accuracy=0.857, loss=0.272]
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 119.04batch/s, batch_accuracy=1, loss=0.221]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 116.62batch/s, batch_accuracy=1, loss=0.138]    
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 120.54batch/s, batch_accuracy=1, loss=0.348]     
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 113.39batch/s, batch_accuracy=1, loss=0.125]     
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 108.78batch/s, batch_accuracy=1, loss=0.109]     


Evaluation results
Accuracy: 0.7360
Precision: 0.7421
Recall: 0.8084
F1: 0.7738
AUC: 0.7264

Experiment 2: Bag-of-Words with max_features 5000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:03<00:00, 68.36batch/s, batch_accuracy=1, loss=0.409]    
Epoch 2: 100%|██████████| 209/209 [00:02<00:00, 100.30batch/s, batch_accuracy=1, loss=0.572]    
Epoch 3: 100%|██████████| 209/209 [00:02<00:00, 91.32batch/s, batch_accuracy=1, loss=0.208]     
Epoch 4: 100%|██████████| 209/209 [00:02<00:00, 83.80batch/s, batch_accuracy=0.857, loss=0.245]
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 107.69batch/s, batch_accuracy=1, loss=0.0255]    
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 125.91batch/s, batch_accuracy=1, loss=0.0745]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 109.13batch/s, batch_accuracy=1, loss=0.0723]    
Epoch 8: 100%|██████████| 209/209 [00:02<00:00, 101.22batch/s, batch_accuracy=1, loss=0.0445]    
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 117.89batch/s, batch_accuracy=1, loss=0.00195]   
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 104.81batch/s, batch_accuracy=1, loss=0.0298]    


Evaluation results
Accuracy: 0.7231
Precision: 0.7693
Recall: 0.7201
F1: 0.7439
AUC: 0.7235

Experiment 3: Bag-of-Words with max_features 10000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:03<00:00, 69.03batch/s, batch_accuracy=1, loss=0.341]    
Epoch 2: 100%|██████████| 209/209 [00:02<00:00, 81.39batch/s, batch_accuracy=0.857, loss=0.429]
Epoch 3: 100%|██████████| 209/209 [00:02<00:00, 79.34batch/s, batch_accuracy=1, loss=0.0486]   
Epoch 4: 100%|██████████| 209/209 [00:02<00:00, 79.06batch/s, batch_accuracy=1, loss=0.109]     
Epoch 5: 100%|██████████| 209/209 [00:02<00:00, 85.80batch/s, batch_accuracy=1, loss=0.0214]    
Epoch 6: 100%|██████████| 209/209 [00:02<00:00, 84.20batch/s, batch_accuracy=1, loss=0.025]     
Epoch 7: 100%|██████████| 209/209 [00:02<00:00, 85.29batch/s, batch_accuracy=1, loss=0.027]     
Epoch 8: 100%|██████████| 209/209 [00:02<00:00, 93.22batch/s, batch_accuracy=1, loss=0.00314]    
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 111.55batch/s, batch_accuracy=1, loss=9.66e-5]   
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 113.29batch/s, batch_accuracy=1, loss=0.000224]  


Evaluation results
Accuracy: 0.7341
Precision: 0.7701
Recall: 0.7468
F1: 0.7583
AUC: 0.7324

Experiment 4: Bag-of-Words with max_features 20000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:02<00:00, 75.21batch/s, batch_accuracy=0.714, loss=0.764]
Epoch 2: 100%|██████████| 209/209 [00:02<00:00, 93.68batch/s, batch_accuracy=1, loss=0.141]    
Epoch 3: 100%|██████████| 209/209 [00:02<00:00, 87.59batch/s, batch_accuracy=1, loss=0.0953]   
Epoch 4: 100%|██████████| 209/209 [00:02<00:00, 84.98batch/s, batch_accuracy=1, loss=0.155]     
Epoch 5: 100%|██████████| 209/209 [00:02<00:00, 99.73batch/s, batch_accuracy=1, loss=0.0119]     
Epoch 6: 100%|██████████| 209/209 [00:02<00:00, 94.74batch/s, batch_accuracy=1, loss=0.00871]    
Epoch 7: 100%|██████████| 209/209 [00:02<00:00, 103.66batch/s, batch_accuracy=1, loss=0.0142]    
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 105.12batch/s, batch_accuracy=1, loss=0.00806]   
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 107.40batch/s, batch_accuracy=1, loss=0.00201]    
Epoch 10: 100%|██████████| 209/209 [00:02<00:00, 97.48batch/s, batch_accuracy=1, loss=0.00111]   


Evaluation results
Accuracy: 0.7237
Precision: 0.7667
Recall: 0.7263
F1: 0.7459
AUC: 0.7233

Experiment 5: Bag-of-Words with max_features 50000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:04<00:00, 43.86batch/s, batch_accuracy=1, loss=0.387]    
Epoch 2: 100%|██████████| 209/209 [00:03<00:00, 54.87batch/s, batch_accuracy=0.857, loss=0.475]
Epoch 3: 100%|██████████| 209/209 [00:03<00:00, 58.77batch/s, batch_accuracy=1, loss=0.174]    
Epoch 4: 100%|██████████| 209/209 [00:03<00:00, 61.03batch/s, batch_accuracy=1, loss=0.0779]    
Epoch 5: 100%|██████████| 209/209 [00:03<00:00, 60.34batch/s, batch_accuracy=1, loss=0.424]     
Epoch 6: 100%|██████████| 209/209 [00:04<00:00, 45.82batch/s, batch_accuracy=1, loss=0.00723]   
Epoch 7: 100%|██████████| 209/209 [00:03<00:00, 52.59batch/s, batch_accuracy=1, loss=0.000196]  
Epoch 8: 100%|██████████| 209/209 [00:03<00:00, 53.25batch/s, batch_accuracy=1, loss=0.00139]   
Epoch 9: 100%|██████████| 209/209 [00:03<00:00, 60.51batch/s, batch_accuracy=1, loss=0.000528]  
Epoch 10: 100%|██████████| 209/209 [00:03<00:00, 56.13batch/s, batch_accuracy=1, loss=0.00104]    


Evaluation results
Accuracy: 0.7360
Precision: 0.7568
Recall: 0.7771
F1: 0.7668
AUC: 0.7306

