# Load packages

In [1]:
import sys
from pathlib import Path

# Directory two levels above the notebook's working directory. It is put on
# sys.path so the local `lib` package can be imported, and it is the base for
# the data and model paths used further down.
PARENT_DIR = Path.cwd().parent.parent

# Guard the append so re-running this cell does not pile up duplicate entries.
if str(PARENT_DIR) not in sys.path:
    sys.path.append(str(PARENT_DIR))

import torch
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

from lib.models import TrainConfig, NeuralNetwork, save_model
from lib.data_processing import load_data, split_data, encode_data

from tqdm import tqdm

# Report which compute device(s) PyTorch can see: one "Device: cuda" line plus
# the device name per visible GPU, or a single "Device: cpu" line otherwise.
if not torch.cuda.is_available():
    print("Device: cpu")
else:
    gpu_names = [
        torch.cuda.get_device_name(gpu_index)
        for gpu_index in range(torch.cuda.device_count())
    ]
    for gpu_name in gpu_names:
        print("Device: cuda")
        print(gpu_name)

Device: cuda
NVIDIA GeForce RTX 3050 Ti Laptop GPU


# Load data

In [2]:
# Location of the GB "power" training file under the project's data folder.
train_tsv_path = str(PARENT_DIR) + "/data/power/power-gb-train.tsv"
data = load_data(file_path_list=[train_tsv_path])

# Split into train/test parts with a fixed random_state so the split is
# repeatable across runs.
# NOTE(review): test_size=0.2 presumably holds out 20% of the rows — confirm
# against lib.data_processing.split_data.
train_raw, test_raw = split_data(
    data,
    test_size=0.2,
    random_state=0,
)

# Set training configurations

In [3]:
# Training settings shared by every run; passed to NeuralNetwork.fit below.
train_config = TrainConfig(
    num_epochs=10,
    early_stop=False,
    # NOTE(review): presumably only consulted when early_stop is True —
    # confirm in lib.models.TrainConfig.
    violation_limit=5,
)

# Experiment

In [4]:
# Vocabulary caps to compare; each value drives one independent
# encode -> train -> save -> evaluate run.
max_features = [1000, 5000, 10000, 20000, 50000]

# Folder that receives one checkpoint per run. mkdir(exist_ok=True) is already
# a no-op when the folder exists, so no separate exists() check is needed.
models_dir = PARENT_DIR / "models" / "feature_engineering"
models_dir.mkdir(parents=True, exist_ok=True)

# Fall back to CPU so this cell also runs on machines without a CUDA device
# (the first cell already reports that case).
# NOTE(review): assumes NeuralNetwork accepts device="cpu" — confirm in lib.models.
device = "cuda" if torch.cuda.is_available() else "cpu"

for i, n_features in enumerate(max_features):
    print(f"Experiment {i+1}: TF-IDF with max_features {n_features}")

    # Drop references to the previous run's encoded data, model and tensors
    # BEFORE building the next (larger) encoding, so both never have to fit in
    # memory at once. Resetting at the top of the loop (rather than deleting at
    # the bottom) keeps the last run's objects available after the loop.
    # NOTE(review): the saved output of this notebook shows the 50000-feature
    # run failing with a CPU out-of-memory error while preparing data. This
    # lowers peak memory, but may not be sufficient if encode_data materialises
    # dense tensors — TODO confirm in lib.data_processing.
    train_data_nn = test_data_nn = dataloader = model_nn = None
    X_test = y_test = y_pred = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Encode: the vectorizer is fitted on the training texts only and then
    # applied to both splits.
    print("Prepare data...")
    tfidf_vectorizer = TfidfVectorizer(max_features=n_features)
    tfidf_vectorizer.fit(train_raw.texts)

    train_data_nn = encode_data(train_raw, tfidf_vectorizer)
    test_data_nn = encode_data(test_raw, tfidf_vectorizer)

    dataloader = DataLoader(train_data_nn, batch_size=128, shuffle=True)

    # Train. The input size is the fitted vocabulary size, which can be
    # smaller than the requested cap when the corpus has fewer distinct terms.
    print("Train model...")
    model_nn = NeuralNetwork(
        input_size=len(tfidf_vectorizer.vocabulary_),
        hidden_size=128,
        device=device,
    )
    model_nn.fit(dataloader, train_config, disable_progress_bar=False)

    save_model(model_nn, models_dir, f"model_tfidf_max_features_{n_features}.pt")

    # Test: stack the per-sample (features, label) pairs into two tensors on
    # the model's device and predict without tracking gradients.
    with torch.no_grad():
        X_test = torch.stack([sample[0] for sample in test_data_nn]).to(model_nn.device)
        y_test = torch.stack([sample[1] for sample in test_data_nn]).to(model_nn.device)
        y_pred = model_nn.predict(X_test)

    # Evaluate: scikit-learn metrics need the tensors back on the CPU.
    y_test = y_test.cpu()
    y_pred = y_pred.cpu()
    print("Evaluation results")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {fscore:.4f}")

    # NOTE(review): roc_auc_score receives the output of predict(). If that is
    # hard 0/1 labels rather than scores/probabilities, this value is the AUC
    # of a single operating point, not of the full ROC curve — consider
    # passing the model's positive-class scores instead.
    auc = roc_auc_score(y_test, y_pred)
    print(f"AUC: {auc:.4f}")
    print()

Experiment 1: TF-IDF with max_features 1000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:01<00:00, 128.91batch/s, batch_accuracy=0.857, loss=0.527]
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 171.71batch/s, batch_accuracy=1, loss=0.28]     
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 206.54batch/s, batch_accuracy=0.857, loss=0.549]
Epoch 4: 100%|██████████| 209/209 [00:00<00:00, 212.41batch/s, batch_accuracy=0.571, loss=0.599]
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 202.14batch/s, batch_accuracy=0.857, loss=0.376]
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 199.01batch/s, batch_accuracy=1, loss=0.286]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 187.64batch/s, batch_accuracy=0.857, loss=0.267]
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 143.50batch/s, batch_accuracy=1, loss=0.0225]   
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 140.58batch/s, batch_accuracy=0.857, loss=0.291]
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 145.63batch/s, batch_accuracy=1, loss=0.0463]    


Evaluation results
Accuracy: 0.7368
Precision: 0.7508
Recall: 0.7914
F1: 0.7706
AUC: 0.7296
Cleared GPU cache.


Experiment 2: TF-IDF with max_features 5000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:01<00:00, 120.32batch/s, batch_accuracy=1, loss=0.266]    
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 159.32batch/s, batch_accuracy=1, loss=0.175]    
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 174.19batch/s, batch_accuracy=1, loss=0.357]    
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 179.17batch/s, batch_accuracy=0.857, loss=0.234]
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 173.06batch/s, batch_accuracy=0.714, loss=0.573]
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 174.16batch/s, batch_accuracy=1, loss=0.0474]   
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 171.34batch/s, batch_accuracy=1, loss=0.0622]    
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 166.79batch/s, batch_accuracy=1, loss=0.0296]    
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 165.29batch/s, batch_accuracy=1, loss=0.0122]    
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 165.03batch/s, batch_accuracy=1, loss=0.000791]  


Evaluation results
Accuracy: 0.7344
Precision: 0.7525
Recall: 0.7814
F1: 0.7667
AUC: 0.7281
Cleared GPU cache.


Experiment 3: TF-IDF with max_features 10000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:02<00:00, 98.05batch/s, batch_accuracy=1, loss=0.28]      
Epoch 2: 100%|██████████| 209/209 [00:01<00:00, 135.97batch/s, batch_accuracy=1, loss=0.201]    
Epoch 3: 100%|██████████| 209/209 [00:01<00:00, 145.67batch/s, batch_accuracy=1, loss=0.167]    
Epoch 4: 100%|██████████| 209/209 [00:01<00:00, 144.77batch/s, batch_accuracy=1, loss=0.227]    
Epoch 5: 100%|██████████| 209/209 [00:01<00:00, 143.33batch/s, batch_accuracy=1, loss=0.395]    
Epoch 6: 100%|██████████| 209/209 [00:01<00:00, 142.89batch/s, batch_accuracy=1, loss=0.0121]    
Epoch 7: 100%|██████████| 209/209 [00:01<00:00, 138.21batch/s, batch_accuracy=1, loss=0.0311]    
Epoch 8: 100%|██████████| 209/209 [00:01<00:00, 141.81batch/s, batch_accuracy=1, loss=0.000887]  
Epoch 9: 100%|██████████| 209/209 [00:01<00:00, 138.73batch/s, batch_accuracy=1, loss=0.000762]  
Epoch 10: 100%|██████████| 209/209 [00:01<00:00, 138.85batch/s, batch_accuracy=1, loss=0.000244]  


Evaluation results
Accuracy: 0.7181
Precision: 0.7374
Recall: 0.7693
F1: 0.7530
AUC: 0.7113
Cleared GPU cache.


Experiment 4: TF-IDF with max_features 20000
Prepare data...
Train model...



Epoch 1: 100%|██████████| 209/209 [00:02<00:00, 75.86batch/s, batch_accuracy=0.857, loss=0.637]
Epoch 2: 100%|██████████| 209/209 [00:02<00:00, 101.79batch/s, batch_accuracy=0.714, loss=0.624]
Epoch 3: 100%|██████████| 209/209 [00:02<00:00, 99.13batch/s, batch_accuracy=1, loss=0.323]     
Epoch 4: 100%|██████████| 209/209 [00:02<00:00, 99.01batch/s, batch_accuracy=1, loss=0.0886]    
Epoch 5: 100%|██████████| 209/209 [00:02<00:00, 86.98batch/s, batch_accuracy=1, loss=0.132]     
Epoch 6: 100%|██████████| 209/209 [00:02<00:00, 96.08batch/s, batch_accuracy=1, loss=0.062]      
Epoch 7: 100%|██████████| 209/209 [00:02<00:00, 101.69batch/s, batch_accuracy=1, loss=0.00407]   
Epoch 8: 100%|██████████| 209/209 [00:02<00:00, 101.42batch/s, batch_accuracy=1, loss=0.00185]    
Epoch 9: 100%|██████████| 209/209 [00:02<00:00, 100.65batch/s, batch_accuracy=1, loss=0.00113]    
Epoch 10: 100%|██████████| 209/209 [00:02<00:00, 101.61batch/s, batch_accuracy=1, loss=0.000287]  


Evaluation results
Accuracy: 0.7057
Precision: 0.7291
Recall: 0.7528
F1: 0.7408
AUC: 0.6995
Cleared GPU cache.


Experiment 5: TF-IDF with max_features 50000
Prepare data...


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 5326200000 bytes.