In [7]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

In [9]:
# Thư mục làm việc chính trong Colab là /content/
BASE_DIR = "/content/"

# Các file dữ liệu .csv nằm trực tiếp trong BASE_DIR
DATA_DIR = BASE_DIR # Hoặc bạn có thể ghi rõ là "/content/"

# Định nghĩa thư mục lưu kết quả và model bên trong BASE_DIR
# Giữ nguyên cấu trúc thư mục con như trong notebook gốc nếu muốn
RESULTS_DIR = os.path.join(BASE_DIR, "results", "tables")
MODEL_DIR = os.path.join(BASE_DIR, "models", "deep_learning")

# Tạo các thư mục này nếu chúng chưa tồn tại
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Đọc file CSV trực tiếp từ DATA_DIR (chính là /content/)
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [10]:
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
joblib.dump(label_encoder, os.path.join(MODEL_DIR, "label_encoder.pkl"))
print(f"Encoded labels: {label_encoder.classes_}")

Encoded labels: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


In [38]:
print("Loading BERT tokenizer and model...")
model_name = 'bert-base-uncased'  # Or any other BERT variant
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
max_seq_length = 128  # Or adjust as needed

for param in bert_model.parameters():
    param.requires_grad = True

Loading BERT tokenizer and model...


In [12]:
def tokenize_and_pad(texts, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # Returns PyTorch tensors
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

print("Tokenizing train data...")
train_input_ids, train_attention_masks = tokenize_and_pad(train_df['clean_text'].tolist(), tokenizer, max_seq_length)
print("Tokenizing val data...")
val_input_ids, val_attention_masks = tokenize_and_pad(val_df['clean_text'].tolist(), tokenizer, max_seq_length)
print("Tokenizing test data...")
test_input_ids, test_attention_masks = tokenize_and_pad(test_df['clean_text'].tolist(), tokenizer, max_seq_length)

y_train = train_df['label_enc'].values
y_val = val_df['label_enc'].values
y_test = test_df['label_enc'].values

Tokenizing train data...
Tokenizing val data...
Tokenizing test data...


In [13]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train, dtype=torch.long))
val_dataset   = TensorDataset(val_input_ids, val_attention_masks, torch.tensor(y_val, dtype=torch.long))
test_dataset  = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 15997 samples
Validation set: 2000 samples
Test set: 2000 samples


In [34]:
print("Train label distribution:")
print(train_df['label'].value_counts(normalize=True))
print("Val label distribution:")
print(val_df['label'].value_counts(normalize=True))

Train label distribution:
label
rec.autos                   0.050009
comp.windows.x              0.050009
sci.crypt                   0.050009
alt.atheism                 0.050009
rec.motorcycles             0.050009
comp.graphics               0.050009
talk.politics.mideast       0.050009
comp.sys.ibm.pc.hardware    0.050009
talk.politics.guns          0.050009
sci.electronics             0.050009
misc.forsale                0.050009
sci.med                     0.050009
sci.space                   0.050009
rec.sport.hockey            0.050009
comp.sys.mac.hardware       0.050009
rec.sport.baseball          0.050009
talk.politics.misc          0.050009
comp.os.ms-windows.misc     0.050009
talk.religion.misc          0.050009
soc.religion.christian      0.049822
Name: proportion, dtype: float64
Val label distribution:
label
soc.religion.christian      0.05
comp.sys.ibm.pc.hardware    0.05
rec.sport.baseball          0.05
rec.autos                   0.05
talk.politics.guns          0.05


In [35]:
# Kiểm tra một vài mẫu đã tokenize
sample_idx = 0
print("Sample input:")
print("Text:", train_df['clean_text'].iloc[sample_idx])
print("Label:", train_df['label'].iloc[sample_idx])
print("Tokenized IDs:", train_input_ids[sample_idx][:10], "...")

Sample input:
Text: path cantaloupe srv c cmu edu magnesium club cc cmu edu news sei cmu edu ci ohio state edu zaphod mp ohio state edu sdd hp com foxtail blkhole vllyoak jp jp vllyoak resun com jeff perry newsgroups rec auto subject mustang message id date fri apr pdt reference organization private site san marcos california line jmh hopper virginia edu jeffrey hoffmeister writes article virginia edu blad got remind yes right somtime fall ford granddaddy car introducing new mega cool way fast accord driver mustang supposed streamlined looking similar mach iii concept car ford came around january wait anyone hear anything recently everything read correct ford nothing skinning existing mustang minor suspension modification picture seen indicate good job new mustang nothing cycle year old car saw picture mustang popular mechanic disappointment bombarded picture mach iii jp
Label: rec.autos
Tokenized IDs: tensor([ 101, 4130, 2064, 9080, 7140, 5051, 5034, 2615, 1039, 4642]) ...


In [36]:
print(train_df['label_enc'].value_counts())


label_enc
7     800
5     800
11    800
0     800
8     800
1     800
17    800
3     800
16    800
12    800
6     800
13    800
14    800
10    800
4     800
9     800
18    800
2     800
19    800
15    797
Name: count, dtype: int64


In [39]:
print("Train NaNs:", train_df['label_enc'].isna().sum())
print("Val NaNs:", val_df['label_enc'].isna().sum())
print("Test NaNs:", test_df['label_enc'].isna().sum())


Train NaNs: 0
Val NaNs: 0
Test NaNs: 0


In [40]:
print("Train label_enc min:", train_df['label_enc'].min())
print("Train label_enc max:", train_df['label_enc'].max())
print("Number of classes:", num_classes)
print("Unique labels in train:", train_df['label_enc'].unique())


Train label_enc min: 0
Train label_enc max: 19
Number of classes: 20
Unique labels in train: [ 7  5 11  0  8  1 17  3 16 12  6 15 13 10  4  9 14  2 18 19]


In [41]:
print("Train label distribution:")
print(train_df['label_enc'].value_counts().sort_index())

print("Val label distribution:")
print(val_df['label_enc'].value_counts().sort_index())

print("Test label distribution:")
print(test_df['label_enc'].value_counts().sort_index())


Train label distribution:
label_enc
0     800
1     800
2     800
3     800
4     800
5     800
6     800
7     800
8     800
9     800
10    800
11    800
12    800
13    800
14    800
15    797
16    800
17    800
18    800
19    800
Name: count, dtype: int64
Val label distribution:
label_enc
0     100
1     100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
11    100
12    100
13    100
14    100
15    100
16    100
17    100
18    100
19    100
Name: count, dtype: int64
Test label distribution:
label_enc
0     100
1     100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
11    100
12    100
13    100
14    100
15    100
16    100
17    100
18    100
19    100
Name: count, dtype: int64


In [42]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, output_dim, dropout_rate=0.1):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask, return_dict=True)
        # Sử dụng biểu diễn token [CLS]
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        x = self.fc(x)
        return x

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=3, patience=1, model_name="Model"):
    model = model.to(device)
    best_val_loss = float('inf')
    best_model_state = None
    best_val_acc = 0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - {model_name}", leave=False)

        for input_ids, attention_mask, y_batch in progress_bar: #unpack tensors
            input_ids, attention_mask, y_batch = input_ids.to(device), attention_mask.to(device), y_batch.to(device) #Move data to device

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

            progress_bar.set_postfix(loss=loss.item(), acc=100 * correct / total)

        # Validation
        model.eval()
        val_correct, val_total = 0, 0
        val_predictions = []
        val_true_labels = []
        total_val_loss = 0.0
        with torch.no_grad():
            for input_ids, attention_mask, y_val_batch in val_loader: #unpack tensors
                input_ids, attention_mask, y_val_batch = input_ids.to(device), attention_mask.to(device), y_val_batch.to(device) #Move data to device
                val_outputs = model(input_ids, attention_mask)
                loss = criterion(val_outputs, y_val_batch)
                total_val_loss += loss.item()
                _, val_predicted = torch.max(val_outputs, 1)
                val_total += y_val_batch.size(0)
                val_correct += (val_predicted == y_val_batch).sum().item()
                val_predictions.extend(val_predicted.cpu().numpy())
                val_true_labels.extend(y_val_batch.cpu().numpy())

        val_acc = 100 * val_correct / val_total
        avg_val_loss = total_val_loss / len(val_loader)
        val_report = classification_report(val_true_labels, val_predictions, output_dict=True, zero_division=0)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader):.4f} - Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.2f}% - {model_name}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1} - {model_name}")
            break

    model.load_state_dict(best_model_state)
    print(f"Best Validation Accuracy: {best_val_acc:.2f}% - {model_name}")
    return model, best_val_acc/100, val_report

In [44]:
def evaluate_model(model, test_loader, model_name="Model"):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, y_batch in test_loader: #unpack tensors
            input_ids, attention_mask, y_batch = input_ids.to(device), attention_mask.to(device), y_batch.to(device) #Move data to device
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    test_accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)
    print(f"Test Accuracy: {test_accuracy:.2f}% - {model_name}")
    print(classification_report(true_labels, predictions))
    return test_accuracy, report

In [45]:
def save_results(model_name, test_accuracy, report, val_accuracy, val_report):
    results = {
        "Model": model_name,
        "Test_Accuracy": test_accuracy,
        "Test_Precision": report['weighted avg']['precision'],
        "Test_Recall": report['weighted avg']['recall'],
        "Test_F1-Score": report['weighted avg']['f1-score'],
        "Val_Accuracy": val_accuracy,
        "Val_Precision": val_report['weighted avg']['precision'],
        "Val_Recall": val_report['weighted avg']['recall'],
        "Val_F1-Score": val_report['weighted avg']['f1-score']
    }
    results_df = pd.DataFrame([results])
    results_file = os.path.join(RESULTS_DIR, "deep_models_results.csv")
    if not os.path.exists(results_file):
        results_df.to_csv(results_file, index=False, header=True)
    else:
        results_df.to_csv(results_file, index=False, header=False, mode='a')
    print(f"Deep Learning results saved to: {results_file}")

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

#Initialise Models
bert_model_classifier = BERTClassifier(bert_model, num_classes)

#Define optimizers and Criterion
criterion = nn.CrossEntropyLoss().to(device)
bert_optimizer = optim.Adam(bert_model_classifier.parameters(), lr=3e-5)

#Run Models
bert_model_classifier, val_accuracy_bert, val_report_bert = train_model(bert_model_classifier, train_loader, val_loader, criterion, bert_optimizer, epochs=3, model_name="BERT")
test_accuracy_bert, report_bert = evaluate_model(bert_model_classifier, test_loader, model_name="BERT")
save_results("BERTClassifier", test_accuracy_bert, report_bert, val_accuracy_bert, val_report_bert)

cuda




Epoch 1/3 - Loss: 0.3392 - Val Loss: 0.0791 - Val Acc: 97.05% - BERT




Epoch 2/3 - Loss: 0.0749 - Val Loss: 0.0592 - Val Acc: 97.10% - BERT




Epoch 3/3 - Loss: 0.0684 - Val Loss: 0.0576 - Val Acc: 97.10% - BERT
Best Validation Accuracy: 97.10% - BERT
Test Accuracy: 0.97% - BERT
              precision    recall  f1-score   support

           0       1.00      0.83      0.91       100
           1       0.99      1.00      1.00       100
           2       1.00      0.96      0.98       100
           3       1.00      1.00      1.00       100
           4       1.00      1.00      1.00       100
           5       0.98      0.99      0.99       100
           6       0.99      1.00      1.00       100
           7       1.00      0.98      0.99       100
           8       1.00      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       1.00      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       0.98      1.00      0.99       100
          13       0.99      1.00      1.00       100
          14       1.00      1.00      1.00       10