In [None]:
# ===========================
# 1. Cài đặt thư viện
# ===========================
!pip install -q transformers datasets evaluate accelerate torch torchvision torchaudio gradio_client

# ===========================
# 2. Check for a GPU and import libraries
# ===========================
import torch

# Report CUDA availability first, then either the device name or a hint on
# how to enable a GPU runtime.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("❌ Không tìm thấy GPU. Vào Runtime → Change runtime type → GPU.")
else:
    print("GPU name:", torch.cuda.get_device_name(0))

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification, DistilBertTokenizer
import evaluate
import numpy as np
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import get_scheduler
from gradio_client import Client

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCUDA available: True
GPU name: Tesla T4


In [None]:
# ===========================
# 3. Load and preprocess the dataset
# ===========================
dataset = load_dataset("go_emotions", "simplified")
label_names = dataset["train"].features["labels"].feature.names

# Sentiment grouping of the emotion labels. This follows the positive/negative
# grouping published alongside GoEmotions, so that clearly negative emotions
# (e.g. "grief", "remorse") and clearly positive ones (e.g. "amusement",
# "optimism") no longer fall through to the neutral bucket.
NEGATIVE = {
    "anger", "annoyance", "disappointment", "disapproval", "disgust",
    "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness",
}
POSITIVE = {
    "admiration", "amusement", "approval", "caring", "desire", "excitement",
    "gratitude", "joy", "love", "optimism", "pride", "relief",
}
ALL_LABELS = set(label_names)

# A misspelled label would never match and would silently end up neutral,
# so fail loudly here instead.
_unknown_labels = (NEGATIVE | POSITIVE) - ALL_LABELS
assert not _unknown_labels, f"Unknown emotion labels in sentiment groups: {sorted(_unknown_labels)}"
assert not (NEGATIVE & POSITIVE), "A label cannot be both negative and positive"

# Everything that is neither negative nor positive is treated as neutral.
NEUTRAL = ALL_LABELS - NEGATIVE - POSITIVE

# Emotion name -> sentiment class id (0 = negative, 1 = neutral, 2 = positive).
label2sentiment = {}
label2sentiment.update(dict.fromkeys(NEGATIVE, 0))
label2sentiment.update(dict.fromkeys(NEUTRAL, 1))
label2sentiment.update(dict.fromkeys(POSITIVE, 2))

def map_to_sentiment(example):
    """Replace an example's emotion label list with one sentiment class id.

    Only the first entry of ``example["labels"]`` is considered: it is looked
    up in the module-level ``label_names`` and then in ``label2sentiment``.
    An empty label list is mapped to 1 (neutral). The example is modified in
    place and returned.
    """
    emotion_ids = example["labels"]
    if not emotion_ids:
        # No emotion annotated: treat as neutral.
        example["labels"] = 1
        return example

    first_emotion = label_names[emotion_ids[0]]
    example["labels"] = label2sentiment[first_emotion]
    return example

# Apply map_to_sentiment to every example, overwriting its "labels" field.
dataset = dataset.map(map_to_sentiment)

# Tokenizer for the student; the same checkpoint name is used when the student
# model itself is loaded.
tokenizer_name = "distilbert-base-uncased"
student_tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the student tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    texts = examples["text"]
    return student_tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# Tokenize the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Drop the raw text from the tokenized copy only; `dataset` is not reassigned
# here and still carries the "text" column.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# ===========================
# 4. Load the student model and define the Dataset
# ===========================
# Student: DistilBERT with a 3-class classification head
# (class ids match `label_map` below: 0 negative, 1 neutral, 2 positive).
student_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3
)
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
student_model.to(device)

# Initialise the client for the Gradio-hosted teacher model.
# Only ImportError is handled here; any other error raised while creating the
# client propagates unchanged.
try:
    from gradio_client import Client
    teacher_client = Client("dataguychill/sentiment")
except ImportError:
    raise ImportError("Vui lòng cài đặt gradio_client: pip install gradio_client")

# Convert the string labels coming from Gradio into integer class ids.
label_map = {"negative": 0, "neutral": 1, "positive": 2}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded as API: https://dataguychill-sentiment.hf.space ✔


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

class HardLabelDistillationDataset(Dataset):
    """Torch dataset pairing tokenized inputs with hard labels from a teacher.

    For every item the raw text is sent to ``teacher_client.predict`` (unless a
    label for that exact text is already cached) and the returned label is
    converted to an integer class id through ``label_map``.

    Args:
        tokenized_datasets: Mapping of split name -> indexable collection whose
            items provide ``"input_ids"`` and ``"attention_mask"``.
        original_datasets: Mapping of split name -> indexable collection whose
            items provide the raw ``"text"``; must be aligned index-by-index
            with ``tokenized_datasets``.
        teacher_client: Object exposing ``predict(text, api_name="/predict")``.
        label_map: Mapping from teacher label to integer class id.
        split: Name of the split to read from both mappings (default "train").

    Notes:
        * Teacher labels are cached per text in ``hard_labels_cache``. A failed
          teacher call is NOT cached, so the text is retried the next time it
          is requested instead of being stuck on the fallback label.
        * The teacher's response format is not defined in this file. Plain
          values, ``{"label": ...}`` dicts and non-empty sequences whose first
          element is the label are accepted, and string labels are also matched
          after stripping whitespace and lower-casing — confirm against the
          actual API of the teacher.
    """

    # Class id used when the teacher fails or returns an unrecognised label.
    FALLBACK_LABEL = 1  # neutral

    def __init__(self, tokenized_datasets, original_datasets, teacher_client, label_map, split="train"):
        self.tokenized = tokenized_datasets[split]
        self.original = original_datasets[split]
        self.teacher_client = teacher_client
        self.label_map = label_map
        # text -> integer label obtained from a successful teacher call
        self.hard_labels_cache = {}

    def __len__(self):
        return len(self.tokenized)

    @staticmethod
    def _as_text(text_data):
        """Normalise the stored text field to a plain string."""
        if isinstance(text_data, str):
            return text_data
        if isinstance(text_data, list):
            # Use the first element; an empty list becomes an empty string
            # instead of raising IndexError.
            return str(text_data[0]) if text_data else ""
        return str(text_data)

    def _label_from_response(self, result):
        """Convert a teacher response into an integer class id.

        The raw response is tried first (exact lookup in ``label_map``), then
        the unwrapped label of a dict/sequence response. Strings are matched
        exactly and then after ``strip().lower()``. Unrecognised responses map
        to ``FALLBACK_LABEL``.
        """
        candidates = [result]
        if isinstance(result, dict):
            candidates.append(result.get("label"))
        elif isinstance(result, (list, tuple)) and result:
            candidates.append(result[0])

        for candidate in candidates:
            try:
                if candidate in self.label_map:
                    return self.label_map[candidate]
            except TypeError:
                # Unhashable response (e.g. a dict); try the next candidate.
                continue
            if isinstance(candidate, str):
                normalised = candidate.strip().lower()
                if normalised in self.label_map:
                    return self.label_map[normalised]
        return self.FALLBACK_LABEL

    def __getitem__(self, idx):
        item = self.tokenized[idx]
        text = self._as_text(self.original[idx]["text"])

        # Take the hard label from the cache, or ask the teacher.
        if text in self.hard_labels_cache:
            hard_label = self.hard_labels_cache[text]
        else:
            try:
                result = self.teacher_client.predict(text, api_name="/predict")
            except Exception as e:
                print(f"Warn: teacher call failed for text[:50]={text[:50]}..., err={e}")
                # Fall back for this access only; do not cache the failure so
                # a later access can retry the teacher.
                hard_label = self.FALLBACK_LABEL
            else:
                hard_label = self._label_from_response(result)
                self.hard_labels_cache[text] = hard_label

        # Return types that the default DataLoader collation can batch.
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            # Plain int (not a 0-dim tensor)
            "labels": int(hard_label)
        }

# ==== Usage/check ====
# Build the distillation dataset and a shuffled DataLoader with batches of 32.
distillation_train_dataset = HardLabelDistillationDataset(tokenized_datasets, dataset, teacher_client, label_map)
train_dataloader = DataLoader(distillation_train_dataset, shuffle=True, batch_size=32)

# Quick sanity check of the shapes (fetch one batch).
# Note: fetching a batch goes through the dataset's __getitem__, which queries
# the teacher for every text that is not cached yet.
batch = next(iter(train_dataloader))
print("batch keys:", batch.keys())
print("input_ids shape:", batch["input_ids"].shape)        # (B, seq_len)
print("attention_mask shape:", batch["attention_mask"].shape)
print("labels shape:", batch["labels"].shape)              # (B,)


batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([32, 128])
attention_mask shape: torch.Size([32, 128])
labels shape: torch.Size([32])


In [None]:
from transformers import get_scheduler
import torch.nn.functional as F
import torch

# Train the student on the teacher's hard labels with plain cross-entropy.
# Use the GPU when available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
student_model.to(device)

optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)
num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

progress_bar = tqdm(range(num_training_steps))
student_model.train()

for epoch in range(num_epochs):
    # Sum of per-batch losses, averaged over the number of batches at the end of the epoch.
    epoch_loss = 0.0
    for batch in train_dataloader:
        # batch["labels"] is tensor shaped (B,) if DataLoader collated ints -> tensor
        batch = {k: v.to(device) for k, v in batch.items()}
        # Remove the labels so the model only returns logits; the loss is computed explicitly below.
        labels = batch.pop("labels")  # shape (B,)
        outputs = student_model(**batch)
        logits = outputs.logits  # (B, C)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        epoch_loss += loss.item()
        progress_bar.update(1)

    print(f"Epoch {epoch+1} loss: {epoch_loss/len(train_dataloader):.4f}")


  0%|          | 0/4071 [00:00<?, ?it/s]