In [4]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# 1. Setup & Data Loading

In [5]:
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))
DATA_DIR = os.path.join(BASE_DIR, "data/processed")
RESULTS_DIR = os.path.join(BASE_DIR, "results/tables")
MODEL_DIR = os.path.join(BASE_DIR, "models/deep_learning")
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# 2. Label Encoding

In [6]:
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
joblib.dump(label_encoder, os.path.join(MODEL_DIR, "label_encoder.pkl"))
print(f"Encoded labels: {label_encoder.classes_}")

Encoded labels: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


# 3. BERT Tokenizer and Model

In [7]:
print("Loading BERT tokenizer and model...")
model_name = 'bert-base-uncased'  # Or any other BERT variant
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
max_seq_length = 128  # Or adjust as needed

Loading BERT tokenizer and model...


# 4. Tokenize and Prepare Data

In [8]:
def tokenize_and_pad(texts, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # Returns PyTorch tensors
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

print("Tokenizing train data...")
train_input_ids, train_attention_masks = tokenize_and_pad(train_df['clean_text'].tolist(), tokenizer, max_seq_length)
print("Tokenizing val data...")
val_input_ids, val_attention_masks = tokenize_and_pad(val_df['clean_text'].tolist(), tokenizer, max_seq_length)
print("Tokenizing test data...")
test_input_ids, test_attention_masks = tokenize_and_pad(test_df['clean_text'].tolist(), tokenizer, max_seq_length)

y_train = train_df['label_enc'].values
y_val = val_df['label_enc'].values
y_test = test_df['label_enc'].values

Tokenizing train data...
Tokenizing val data...
Tokenizing test data...


# 5. Data Loaders

In [9]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train, dtype=torch.long))
val_dataset   = TensorDataset(val_input_ids, val_attention_masks, torch.tensor(y_val, dtype=torch.long))
test_dataset  = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 15997 samples
Validation set: 2000 samples
Test set: 2000 samples


# 6. Model Definition

In [10]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, output_dim, dropout_rate=0.5):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Use pooler_output, or try outputs.last_hidden_state[:, 0, :] for [CLS] token
        x = self.dropout(pooled_output)
        x = self.fc(x)
        return x

# 7. Training and Evaluation

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20, patience=5, model_name="Model"):
    model = model.to(device)
    best_val_loss = float('inf')
    best_model_state = None
    best_val_acc = 0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - {model_name}", leave=False)

        for input_ids, attention_mask, y_batch in progress_bar: #unpack tensors
            input_ids, attention_mask, y_batch = input_ids.to(device), attention_mask.to(device), y_batch.to(device) #Move data to device

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

            progress_bar.set_postfix(loss=loss.item(), acc=100 * correct / total)

        # Validation
        model.eval()
        val_correct, val_total = 0, 0
        val_predictions = []
        val_true_labels = []
        total_val_loss = 0.0
        with torch.no_grad():
            for input_ids, attention_mask, y_val_batch in val_loader: #unpack tensors
                input_ids, attention_mask, y_val_batch = input_ids.to(device), attention_mask.to(device), y_val_batch.to(device) #Move data to device
                val_outputs = model(input_ids, attention_mask)
                loss = criterion(val_outputs, y_val_batch)
                total_val_loss += loss.item()
                _, val_predicted = torch.max(val_outputs, 1)
                val_total += y_val_batch.size(0)
                val_correct += (val_predicted == y_val_batch).sum().item()
                val_predictions.extend(val_predicted.cpu().numpy())
                val_true_labels.extend(y_val_batch.cpu().numpy())

        val_acc = 100 * val_correct / val_total
        avg_val_loss = total_val_loss / len(val_loader)
        val_report = classification_report(val_true_labels, val_predictions, output_dict=True, zero_division=0)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader):.4f} - Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.2f}% - {model_name}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1} - {model_name}")
            break

    model.load_state_dict(best_model_state)
    print(f"Best Validation Accuracy: {best_val_acc:.2f}% - {model_name}")
    return model, best_val_acc/100, val_report

hàm đánh giá

In [12]:
def evaluate_model(model, test_loader, model_name="Model"):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, y_batch in test_loader: #unpack tensors
            input_ids, attention_mask, y_batch = input_ids.to(device), attention_mask.to(device), y_batch.to(device) #Move data to device
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    test_accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)
    print(f"Test Accuracy: {test_accuracy:.2f}% - {model_name}")
    print(classification_report(true_labels, predictions))
    return test_accuracy, report

lưu kết quả

In [13]:
def save_results(model_name, test_accuracy, report, val_accuracy, val_report):
    results = {
        "Model": model_name,
        "Test_Accuracy": test_accuracy,
        "Test_Precision": report['weighted avg']['precision'],
        "Test_Recall": report['weighted avg']['recall'],
        "Test_F1-Score": report['weighted avg']['f1-score'],
        "Val_Accuracy": val_accuracy,
        "Val_Precision": val_report['weighted avg']['precision'],
        "Val_Recall": val_report['weighted avg']['recall'],
        "Val_F1-Score": val_report['weighted avg']['f1-score']
    }
    results_df = pd.DataFrame([results])
    results_file = os.path.join(RESULTS_DIR, "deep_models_results.csv")
    if not os.path.exists(results_file):
        results_df.to_csv(results_file, index=False, header=True)
    else:
        results_df.to_csv(results_file, index=False, header=False, mode='a')
    print(f"Deep Learning results saved to: {results_file}")

# 8. Model Training and Evaluation

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Initialise Models
bert_model_classifier = BERTClassifier(bert_model, num_classes)

#Define optimizers and Criterion
criterion = nn.CrossEntropyLoss().to(device)
bert_optimizer = optim.Adam(bert_model_classifier.parameters(), lr=0.001)

#Run Models
bert_model_classifier, val_accuracy_bert, val_report_bert = train_model(bert_model_classifier, train_loader, val_loader, criterion, bert_optimizer, epochs=5, model_name="BERT")
test_accuracy_bert, report_bert = evaluate_model(bert_model_classifier, test_loader, model_name="BERT")
save_results("BERTClassifier", test_accuracy_bert, report_bert, val_accuracy_bert, val_report_bert)

                                                                                        

Epoch 1/5 - Loss: 3.1730 - Val Loss: 3.0611 - Val Acc: 5.00% - BERT


                                                                                        

KeyboardInterrupt: 

# 9. Save Models

In [None]:
torch.save(bert_model_classifier.state_dict(), os.path.join(MODEL_DIR, "bert_model.pth"))

Encoded labels: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']
Loading BERT tokenizer and model...
Tokenizing train data...
Tokenizing val data...
Tokenizing test data...
Train set: 15997 samples
Validation set: 2000 samples
Test set: 2000 samples


                                                                                        

KeyboardInterrupt: 