In [4]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer, GPT2Model
import joblib
from tqdm import tqdm

In [5]:
# Directories
BASE_DIR = "/kaggle/working/"
DATA_DIR = "/kaggle/input/processed/"
RESULTS_DIR = os.path.join(BASE_DIR, "results", "tables")
MODEL_DIR = os.path.join(BASE_DIR, "models", "deep_learning")
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

In [6]:
# Load data
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [1]:
# Label encoding
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
joblib.dump(label_encoder, os.path.join(MODEL_DIR, "label_encoder_gpt.pkl"))

NameError: name 'LabelEncoder' is not defined

In [34]:
# Load tokenizer + model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 không có [PAD], dùng eos_token thay thế
gpt_model = GPT2Model.from_pretrained(model_name)

# Freeze GPT2 nếu muốn chỉ huấn luyện classifier
# for param in gpt_model.parameters():
#     param.requires_grad = False

In [28]:
y_train = train_df['label_enc'].values
y_val = val_df['label_enc'].values
y_test = test_df['label_enc'].values

In [29]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train, dtype=torch.long))
val_dataset   = TensorDataset(val_input_ids, val_attention_masks, torch.tensor(y_val, dtype=torch.long))
test_dataset  = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

Train: 15997, Val: 2000, Test: 2000


In [21]:
print("Train label distribution:")
print(train_df['label'].value_counts(normalize=True))
print("Val label distribution:")
print(val_df['label'].value_counts(normalize=True))

Train label distribution:
label
rec.autos                   0.050009
comp.windows.x              0.050009
sci.crypt                   0.050009
alt.atheism                 0.050009
rec.motorcycles             0.050009
comp.graphics               0.050009
talk.politics.mideast       0.050009
comp.sys.ibm.pc.hardware    0.050009
talk.politics.guns          0.050009
sci.electronics             0.050009
misc.forsale                0.050009
sci.med                     0.050009
sci.space                   0.050009
rec.sport.hockey            0.050009
comp.sys.mac.hardware       0.050009
rec.sport.baseball          0.050009
talk.politics.misc          0.050009
comp.os.ms-windows.misc     0.050009
talk.religion.misc          0.050009
soc.religion.christian      0.049822
Name: proportion, dtype: float64
Val label distribution:
label
soc.religion.christian      0.05
comp.sys.ibm.pc.hardware    0.05
rec.sport.baseball          0.05
rec.autos                   0.05
talk.politics.guns          0.05


In [22]:
# Kiểm tra một vài mẫu đã tokenize
sample_idx = 0
print("Sample input:")
print("Text:", train_df['clean_text'].iloc[sample_idx])
print("Label:", train_df['label'].iloc[sample_idx])
print("Tokenized IDs:", train_input_ids[sample_idx][:10], "...")

Sample input:
Text: path cantaloupe srv c cmu edu magnesium club cc cmu edu news sei cmu edu ci ohio state edu zaphod mp ohio state edu sdd hp com foxtail blkhole vllyoak jp jp vllyoak resun com jeff perry newsgroups rec auto subject mustang message id date fri apr pdt reference organization private site san marcos california line jmh hopper virginia edu jeffrey hoffmeister writes article virginia edu blad got remind yes right somtime fall ford granddaddy car introducing new mega cool way fast accord driver mustang supposed streamlined looking similar mach iii concept car ford came around january wait anyone hear anything recently everything read correct ford nothing skinning existing mustang minor suspension modification picture seen indicate good job new mustang nothing cycle year old car saw picture mustang popular mechanic disappointment bombarded picture mach iii jp
Label: rec.autos
Tokenized IDs: tensor([ 6978, 18548,   282,   280,   431, 19677,    85,   269, 12067,    84]) ...


In [23]:
print(train_df['label_enc'].value_counts())

label_enc
7     800
5     800
11    800
0     800
8     800
1     800
17    800
3     800
16    800
12    800
6     800
13    800
14    800
10    800
4     800
9     800
18    800
2     800
19    800
15    797
Name: count, dtype: int64


In [24]:
print("Train label_enc min:", train_df['label_enc'].min())
print("Train label_enc max:", train_df['label_enc'].max())
print("Number of classes:", num_classes)
print("Unique labels in train:", train_df['label_enc'].unique())

Train label_enc min: 0
Train label_enc max: 19
Number of classes: 20
Unique labels in train: [ 7  5 11  0  8  1 17  3 16 12  6 15 13 10  4  9 14  2 18 19]


In [39]:
# Define model
class GPTClassifier(nn.Module):
    def __init__(self, gpt_model, output_dim, dropout_rate=0.1):
        super(GPTClassifier, self).__init__()
        self.gpt = gpt_model
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(gpt_model.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)
        # Mean pooling over all tokens
        last_hidden = outputs.last_hidden_state
        pooled = torch.mean(last_hidden, dim=1)
        x = self.dropout(pooled)
        self.fc(x)
        return x

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = GPTClassifier(gpt_model, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # nếu fine-tune GPT thì giảm lr

cuda


In [17]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=3, patience=1):
    best_model_state = None
    best_val_loss = float('inf')
    best_val_acc = 0
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for input_ids, attention_mask, labels in loop:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            loop.set_postfix(loss=loss.item(), acc=100 * correct / total)

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        val_preds, val_trues = [], []

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)
                val_preds.extend(preds.cpu().numpy())
                val_trues.extend(labels.cpu().numpy())

        val_acc = val_correct / val_total
        avg_val_loss = val_loss / len(val_loader)

        print(f"[Epoch {epoch+1}] Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc*100:.2f}%")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping.")
                break

    model.load_state_dict(best_model_state)
    return model, best_val_acc



In [18]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_trues = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_trues.extend(labels.cpu().numpy())

    acc = accuracy_score(all_trues, all_preds)
    report = classification_report(all_trues, all_preds, output_dict=True, zero_division=0)
    print(f"Test Accuracy: {acc*100:.2f}%")
    print(classification_report(all_trues, all_preds))
    return acc, report

In [19]:
def save_results(model_name, test_acc, test_report, val_acc):
    results = {
        "Model": model_name,
        "Test_Accuracy": test_acc,
        "Test_F1": test_report["weighted avg"]["f1-score"],
        "Val_Accuracy": val_acc
    }
    df = pd.DataFrame([results])
    out_file = os.path.join(RESULTS_DIR, "deep_models_results.csv")
    if os.path.exists(out_file):
        df.to_csv(out_file, mode='a', index=False, header=False)
    else:
        df.to_csv(out_file, index=False)
    print(f"Saved results to {out_file}")

In [45]:
# Run training
model, val_acc = train_model(model, train_loader, val_loader, criterion, optimizer, epochs=3)
test_acc, test_report = evaluate_model(model, test_loader)
save_results("GPT2Classifier", test_acc, test_report, val_acc)

Epoch 1/3: 100%|██████████| 500/500 [07:13<00:00,  1.15it/s, acc=81.5, loss=0.104] 


[Epoch 1] Val Loss: 0.1196, Val Acc: 97.10%


Epoch 2/3: 100%|██████████| 500/500 [07:11<00:00,  1.16it/s, acc=87.2, loss=0.471] 


[Epoch 2] Val Loss: 0.1040, Val Acc: 97.10%


Epoch 3/3: 100%|██████████| 500/500 [07:13<00:00,  1.15it/s, acc=87.2, loss=0.258] 


[Epoch 3] Val Loss: 0.0969, Val Acc: 97.05%
Test Accuracy: 97.25%
              precision    recall  f1-score   support

           0       1.00      0.83      0.91       100
           1       0.99      1.00      1.00       100
           2       1.00      0.98      0.99       100
           3       1.00      1.00      1.00       100
           4       1.00      1.00      1.00       100
           5       0.99      0.99      0.99       100
           6       0.99      1.00      1.00       100
           7       1.00      0.98      0.99       100
           8       1.00      1.00      1.00       100
           9       1.00      1.00      1.00       100
          10       1.00      1.00      1.00       100
          11       0.99      1.00      1.00       100
          12       0.98      1.00      0.99       100
          13       1.00      1.00      1.00       100
          14       1.00      1.00      1.00       100
          15       1.00      1.00      1.00       100
          16   