In [4]:
import pandas as pd
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForTokenClassification
from torch.optim import AdamW

from transformers import get_scheduler
from seqeval.metrics import classification_report
import numpy as np
from tqdm import tqdm


In [2]:
df = pd.read_csv('Train.csv')

sentences = []
aspect_terms = []

for sent_id, group in df.groupby('id'):
    sent = group['Sentence'].iloc[0]
    aspects = group['Aspect Term'].tolist()
    sentences.append(sent)
    aspect_terms.append(aspects)

# --- 3. Initialize tokenizer ---
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# --- 4. Create BIO labels ---
def encode_tags(tags, encodings):
    labels = []
    idx = 0
    for word_ids in encodings.word_ids():
        if word_ids is None:
            labels.append(-100)  
        else:
            labels.append(tags[word_ids])
    return labels

def get_bio_tags(sentence, aspects):
    words = sentence.split()
    tags = ['O'] * len(words)

    for asp in aspects:
        asp_words = asp.split()
        for i in range(len(words) - len(asp_words) + 1):
            if words[i:i+len(asp_words)] == asp_words:
                tags[i] = 'B-ASP'
                for j in range(1, len(asp_words)):
                    tags[i+j] = 'I-ASP'

    tag2id = {'O':0, 'B-ASP':1, 'I-ASP':2}
    tag_ids = [tag2id[tag] for tag in tags]
    return tag_ids

# --- 5. Prepare dataset class ---
class AspectTermDataset(Dataset):
    def __init__(self, sentences, aspect_terms, tokenizer):
        self.sentences = sentences
        self.aspect_terms = aspect_terms
        self.tokenizer = tokenizer
        self.tag2id = {'O':0, 'B-ASP':1, 'I-ASP':2}

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        aspects = self.aspect_terms[idx]
        words = sentence.split()

        tags = ['O'] * len(words)
        for asp in aspects:
            asp_words = asp.split()
            for i in range(len(words) - len(asp_words) + 1):
                if words[i:i+len(asp_words)] == asp_words:
                    tags[i] = 'B-ASP'
                    for j in range(1, len(asp_words)):
                        tags[i+j] = 'I-ASP'
        encoding = self.tokenizer(sentence.split(),
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=128,
                                  return_tensors="pt")

        labels = []
        word_ids = encoding.word_ids(batch_index=0)  
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)  
            elif word_idx != previous_word_idx:
                labels.append(self.tag2id[tags[word_idx]])
            else:
                labels.append(self.tag2id[tags[word_idx]] if tags[word_idx].startswith('I') else -100)
            previous_word_idx = word_idx

        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels)
        return item

# --- 6. Prepare train/test split ---
train_sentences, val_sentences, train_aspects, val_aspects = train_test_split(
    sentences, aspect_terms, test_size=0.1, random_state=42
)

train_dataset = AspectTermDataset(train_sentences, train_aspects, tokenizer)
val_dataset = AspectTermDataset(val_sentences, val_aspects, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)



In [6]:
# --- 1. Chuẩn bị device ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- 2. Load model ---
num_labels = 3  # Ví dụ 3 nhãn: B, I, O
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)

# --- 3. Optimizer và scheduler ---
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# --- 4. Training loop ---
from sklearn.metrics import accuracy_score

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'offset_mapping'}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        logits = outputs.logits 
        preds = torch.argmax(logits, dim=-1)  
        labels = batch["labels"]  
        mask = labels != -100
        preds = preds[mask].detach().cpu().numpy()
        labels = labels[mask].detach().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1} - Avg Loss: {avg_loss:.4f} - Accuracy: {acc:.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|███████████████| 152/152 [32:44<00:00, 12.92s/it, loss=0.225]

Epoch 1 - Avg Loss: 0.2029 - Accuracy: 0.9249





In [13]:
# --- 10. Evaluation / Prediction helper ---
def predict_aspect_terms(sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    encoding = tokenizer(sentence.split(), is_split_into_words=True,
                         return_offsets_mapping=True, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    predictions = predictions[0].cpu().numpy()

    word_ids = encoding.word_ids(batch_index=0)

    aspect_terms = []
    current_term = []
    for idx, pred in enumerate(predictions):
        word_idx = word_ids[idx]
        if word_idx is None:
            continue
        label = pred
        token = tokens[idx-1] if idx > 0 else tokens[idx]
        if label == 1:  # B-ASP
            if current_term:
                aspect_terms.append(tokenizer.convert_tokens_to_string(current_term))
                current_term = []
            current_term = [token]
        elif label == 2 and current_term:  # I-ASP
            current_term.append(token)
        else:
            if current_term:
                aspect_terms.append(tokenizer.convert_tokens_to_string(current_term))
                current_term = []
    # Add last term
    if current_term:
        aspect_terms.append(tokenizer.convert_tokens_to_string(current_term))

    return aspect_terms

# --- 11. Test predict ---
test_sentences = [
    # Chủ đề phim ảnh
    "The acting was phenomenal, but the storyline was predictable.",
    "I loved the cinematography, but the pacing was too slow.",
    "The soundtrack was amazing and the dialogue felt natural.",
    "The plot twists were unexpected, but the ending was disappointing.",
    "The characters were well-developed and very relatable.",
    "The visual effects were stunning, but the script was weak.",
    "The movie had a great message, but the direction lacked focus.",
    "The performance by the lead actor was top-notch.",
    "The editing was smooth and the scenes transitioned well.",
    "The humor was forced, and the romance felt unnecessary.",

    # Chủ đề công nghệ
    "The battery life of this smartphone is incredible, but the camera quality is mediocre.",
    "I really appreciate the fast processor, but the device heats up quickly.",
    "The screen resolution is crystal clear, yet the speaker sound is disappointing.",
    "Charging is fast but the charger cable feels fragile.",
    "The software update fixed many bugs but introduced new ones.",

    # Chủ đề dịch vụ khách hàng
    "The customer service was very helpful and resolved my issue quickly.",
    "Waiting time was too long, but the representative was polite.",
    "They responded promptly, but the solution was not satisfactory.",
    "The support team was unprofessional and rude.",
    "I appreciated the follow-up calls after the purchase.",

    # Chủ đề ẩm thực
    "The pizza crust was crispy and delicious, but the toppings were sparse.",
    "Service was quick, but the waiter forgot our drinks.",
    "The dessert was heavenly, and the coffee was perfectly brewed.",
    "Portions were generous but the main dish lacked flavor.",
    "The ambiance was cozy, and the music set the perfect mood.",

    # Chủ đề du lịch
    "The hotel room was spacious and clean, but the Wi-Fi connection was poor.",
    "I loved the guided tour, but the transportation was uncomfortable.",
    "The beach was pristine and beautiful, though a bit crowded.",
    "The local food was delicious, but the prices were a bit high.",
    "The museum had an impressive collection but lacked clear explanations."
]


for sent in test_sentences:
    aspects = predict_aspect_terms(sent)
    print(f"Sentence: {sent}")
    print(f"Extracted Aspects: {aspects}")
    print('-'*60)


Sentence: The acting was phenomenal, but the storyline was predictable.
Extracted Aspects: ['acting', 'storyline']
------------------------------------------------------------
Sentence: I loved the cinematography, but the pacing was too slow.
Extracted Aspects: ['cinematography', 'pacing']
------------------------------------------------------------
Sentence: The soundtrack was amazing and the dialogue felt natural.
Extracted Aspects: ['soundtrack', 'dialogue']
------------------------------------------------------------
Sentence: The plot twists were unexpected, but the ending was disappointing.
Extracted Aspects: ['plot twists', 'ending']
------------------------------------------------------------
Sentence: The characters were well-developed and very relatable.
Extracted Aspects: ['characters']
------------------------------------------------------------
Sentence: The visual effects were stunning, but the script was weak.
Extracted Aspects: ['visual effects', 'script']
-------------

In [7]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')