In [2]:
train_directory = '/kaggle/input/vlsp2021-fixed/Train-Conll'
dev_directory = '/kaggle/input/vlsp2021-fixed/Dev-Conll'

import os
import copy
import json
import logging

import torch
from torch.utils.data import Dataset

from datasets import Dataset as HFDataset  # Lưu ý phân biệt Dataset của HuggingFace với torch.utils.data
from sklearn.metrics import classification_report


train_sentences, train_labels = read_conll_directory(train_directory)
dev_sentences, dev_labels = read_conll_directory(dev_directory)

# Xây dựng tập dữ liệu Train/Dev dưới dạng dict
train_dataset_dict = prepare_dataset(train_sentences, train_labels)
dev_dataset_dict = prepare_dataset(dev_sentences, dev_labels)

# Chuyển tokens và labels về dạng list
train_dataset_dict = process_string_to_array(train_dataset_dict)
dev_dataset_dict = process_string_to_array(dev_dataset_dict)

# Chuyển sang Hugging Face Dataset
train_dataset = HFDataset.from_dict(train_dataset_dict)
dev_dataset = HFDataset.from_dict(dev_dataset_dict)

Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/giaitri_0072.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/giaitri_0002.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/kinhte_0059.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/thegioi_0073.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/00_add_0047.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0067.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0027.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0013.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/thethao_0005.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/vanhoa_0050.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/00_add_0233.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/xahoi_0024.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/khcn_0005.conll
Processi

In [15]:
print(model.config)

T5Config {
  "_name_or_path": "VietAI/vit5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 36096
}



In [18]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [19]:
import os
import copy
import json
import logging
import torch
from datasets import Dataset
import wandb
from transformers import (
    AutoTokenizer, 
    T5ForTokenClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import classification_report
import numpy as np

def read_conll(file_path):
    sentences = []
    sentence_labels = []
    unique_labels = set()

    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence_tokens = []
        current_sentence_labels = []

        for line in file:
            line = line.strip()
            if not line:
                if current_sentence_tokens:
                    sentences.append(' '.join(current_sentence_tokens))
                    sentence_labels.append(' '.join(current_sentence_labels))
                current_sentence_tokens = []
                current_sentence_labels = []
            else:
                line_parts = line.split()
                current_sentence_tokens.append(line_parts[0])
                if len(line_parts) >= 4:
                    current_sentence_labels.append(line_parts[3])
                    unique_labels.add(line_parts[3])
                else:
                    current_sentence_labels.append('O')

    if current_sentence_tokens:
        sentences.append(' '.join(current_sentence_tokens))
        sentence_labels.append(' '.join(current_sentence_labels))

    return sentences, sentence_labels

def read_conll_directory(directory_path):
    all_sentences = []
    all_sentence_labels = []

    for file_name in os.listdir(directory_path):
        if file_name.endswith('.conll'):
            file_path = os.path.join(directory_path, file_name)
            print(f"Processing file: {file_path}")
            sentences, sentence_labels = read_conll(file_path)
            all_sentences.extend(sentences)
            all_sentence_labels.extend(sentence_labels)

    return all_sentences, all_sentence_labels

def extract_labels(label_sentences):
    unique_labels = set()
    for sentence in label_sentences:
        labels = sentence.split()
        unique_labels.update(labels)
    return unique_labels

# Load data
train_directory = '/kaggle/input/vlsp2021-fixed/Train-Conll'  # Replace with your path
dev_directory = '/kaggle/input/vlsp2021-fixed/Dev-Conll'      # Replace with your path

train_sentences, train_labels = read_conll_directory(train_directory)
dev_sentences, dev_labels = read_conll_directory(dev_directory)

# Get unique labels and create label mapping
train_unique_labels = extract_labels(train_labels)
dev_unique_labels = extract_labels(dev_labels)
label_list = sorted(train_unique_labels.union(dev_unique_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(f"Number of labels: {len(label_list)}")
print(f"Train size: {len(train_sentences)}")
print(f"Dev size: {len(dev_sentences)}")

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
model = T5ForTokenClassification.from_pretrained(
    "VietAI/vit5-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)
for param in model.parameters(): param.data = param.data.contiguous()

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Prepare datasets
train_dataset = Dataset.from_dict({
    "text": train_sentences,
    "labels": [label.split() for label in train_labels]
})

dev_dataset = Dataset.from_dict({
    "text": dev_sentences,
    "labels": [label.split() for label in dev_labels]
})

# Tokenize datasets
tokenized_train = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_dev = dev_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dev_dataset.column_names
)

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=2)

    # Map predictions and labels to their string values (ignore padding tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Flatten the true and predicted labels for the classification report
    true_labels_flat = [label for sublist in true_labels for label in sublist]
    pred_labels_flat = [label for sublist in true_predictions for label in sublist]

    # Generate classification report
    report = classification_report(
        true_labels_flat,
        pred_labels_flat,
        labels=label_list,
        zero_division=0,
        output_dict=True
    )

    # Print detailed classification report for per-class metrics
    print("\nDetailed Classification Report:")
    print(classification_report(
        true_labels_flat,
        pred_labels_flat,
        labels=label_list,
        zero_division=0
    ))

    # Extract micro-averaged metrics
    precision = report["micro avg"]["precision"]
    recall = report["micro avg"]["recall"]
    f1 = report["micro avg"]["f1-score"]

    # Return results as a dictionary
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Training arguments
training_args = TrainingArguments(
    output_dir="./t5-token-classification",
    evaluation_strategy="epoch",
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    fp16=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer)
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./final-model")





Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/giaitri_0072.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/giaitri_0002.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/kinhte_0059.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/thegioi_0073.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/00_add_0047.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0067.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0027.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/doisong_0013.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/thethao_0005.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/vanhoa_0050.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/00_add_0233.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/xahoi_0024.conll
Processing file: /kaggle/input/vlsp2021-fixed/Train-Conll/khcn_0005.conll
Processi

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at VietAI/vit5-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12887 [00:00<?, ? examples/s]

Map:   0%|          | 0/6806 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,4.4895,0.613986,0.92416,0.92416,0.92416
2,0.9034,0.54196,0.928382,0.928382,0.928382



Detailed Classification Report:
                       precision    recall  f1-score   support

            B-ADDRESS       0.00      0.00      0.00         4
           B-DATETIME       0.57      0.52      0.54      1158
      B-DATETIME-DATE       0.50      0.40      0.44       363
 B-DATETIME-DATERANGE       0.00      0.00      0.00       111
  B-DATETIME-DURATION       0.73      0.58      0.65       436
       B-DATETIME-SET       0.00      0.00      0.00        16
      B-DATETIME-TIME       0.19      0.30      0.23        37
 B-DATETIME-TIMERANGE       0.05      0.01      0.02        80
              B-EMAIL       0.00      0.00      0.00         1
              B-EVENT       0.16      0.07      0.10       109
          B-EVENT-CUL       0.08      0.02      0.03        58
     B-EVENT-GAMESHOW       0.08      0.04      0.05        48
      B-EVENT-NATURAL       0.00      0.00      0.00        17
        B-EVENT-SPORT       0.15      0.49      0.23        43
                 B-IP




Detailed Classification Report:
                       precision    recall  f1-score   support

            B-ADDRESS       0.00      0.00      0.00         4
           B-DATETIME       0.56      0.52      0.54      1158
      B-DATETIME-DATE       0.55      0.52      0.53       363
 B-DATETIME-DATERANGE       0.00      0.00      0.00       111
  B-DATETIME-DURATION       0.66      0.77      0.71       436
       B-DATETIME-SET       0.00      0.00      0.00        16
      B-DATETIME-TIME       0.21      0.59      0.31        37
 B-DATETIME-TIMERANGE       0.44      0.05      0.09        80
              B-EMAIL       0.00      0.00      0.00         1
              B-EVENT       0.20      0.09      0.13       109
          B-EVENT-CUL       0.00      0.00      0.00        58
     B-EVENT-GAMESHOW       0.15      0.21      0.17        48
      B-EVENT-NATURAL       1.00      0.06      0.11        17
        B-EVENT-SPORT       0.19      0.63      0.29        43
                 B-IP

In [24]:
def predict_ner(text, model, tokenizer, max_length=256):
    # Prepare input
    inputs = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    
    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
    
    # Convert predictions to labels
    input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    raw_predictions = [(token, label_list[pred.item()]) 
                      for token, pred in zip(input_tokens, predictions[0])
                      if token not in [tokenizer.pad_token, tokenizer.eos_token, tokenizer.bos_token]]
    
    # Clean up predictions
    cleaned_predictions = []
    i = 0
    while i < len(raw_predictions):
        token, label = raw_predictions[i]
        
        if token.startswith('▁') and (label.startswith('B-') or label.startswith('I-')):
            # Start of a new word
            full_token = token.replace('▁', '')
            full_label = label
            
            # Look ahead for continuation of the same entity
            j = i + 1
            while j < len(raw_predictions) and not raw_predictions[j][0].startswith('▁'):
                full_token += raw_predictions[j][0]
                j += 1
            i = j - 1
            
            cleaned_predictions.append((full_token, full_label))
        elif not token.startswith('▁') and label != 'O':
            # Handle non-word-initial subwords
            continue
        else:
            # Handle other tokens
            if label != 'O':
                cleaned_predictions.append((token.replace('▁', ''), label))
        i += 1
    
    return cleaned_predictions



# Test the model with sample text
test_text = """Sofascore là hãng thống kê hàng đầu thế giới, do người Croatia sáng lập năm 2010 
và có hàng chục triệu người dùng. Theo mô hình của hãng, Xuân Son nhận điểm 10 và 
dĩ nhiên là cầu thủ hay nhất trận. Vĩ Hào đứng thứ hai với 8,4 điểm, nhờ một bàn 
và một đường kiến tạo, còn Nguyễn Quang Hải nhận 8,2 điểm với một bàn."""

# Get predictions
predictions = predict_ner(test_text, model, tokenizer)

# Print results in a readable format
print("\nNER Predictions:")
print("-" * 50)
for token, label in predictions:
    if label != "O":
        print(f"Token: {token:} Label: {label}")


NER Predictions:
--------------------------------------------------
Token: Sofascore Label: B-ORGANIZATION
Token: Croatia Label: B-LOCATION-GPE
Token: năm Label: B-DATETIME
Token: 2010 Label: I-DATETIME
Token: Xuân Label: B-PERSON
Token: Son Label: I-PERSON
Token: 10 Label: B-QUANTITY-NUM
Token: Vĩ Label: B-PERSON
Token: Hào Label: I-PERSON
Token: hai Label: I-QUANTITY-ORD
Token: 8,4 Label: B-QUANTITY-NUM
Token: một Label: B-QUANTITY-NUM
Token: Nguyễn Label: B-PERSON
Token: Quang Label: I-PERSON
Token: Hải Label: I-PERSON
Token: 8,2 Label: B-QUANTITY-NUM
Token: một Label: B-QUANTITY-NUM


In [41]:
###################################
# 1. Read Test data and check labels
###################################
test_directory = '/kaggle/input/vlsp-2021-test'
test_sentences, test_labels = read_conll_directory(test_directory)

# Check for unknown labels
test_unique_labels = extract_labels(test_labels)
unknown_labels = test_unique_labels - set(label_list)
if unknown_labels:
    print(f"Warning: Found unknown labels in test set: {unknown_labels}")
    print("These will be mapped to 'O'")

###################################
# 2. Prepare Test Dataset
###################################
test_dataset = Dataset.from_dict({
    "text": test_sentences,
    "labels": [label.split() for label in test_labels]
})

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Handle unknown labels by mapping them to 'O'
                label_id = label2id.get(label[word_idx], label2id['O'])
                label_ids.append(label_id)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize test dataset
tokenized_test = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_dataset.column_names
)

###################################
# 3. Evaluate on Test Set
###################################
test_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./test-output",
        per_device_eval_batch_size=48,
        fp16=True,
    ),
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

# Run prediction
print("Starting evaluation on test set...")
predictions = test_trainer.predict(tokenized_test)

# Process predictions
pred_labels = np.argmax(predictions.predictions, axis=2)

# Convert predictions to labels
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(pred_labels, predictions.label_ids)
]

# Process predictions and ensure alignment
print("\nProcessing predictions...")
processed_true = []
processed_pred = []

for sentence, true_lab, pred_lab in zip(test_sentences, test_labels, true_predictions):
    true_tokens = true_lab.split()
    
    # Ensure predictions match the number of true labels
    if len(pred_lab) > len(true_tokens):
        pred_lab = pred_lab[:len(true_tokens)]
    elif len(pred_lab) < len(true_tokens):
        pred_lab = pred_lab + ['O'] * (len(true_tokens) - len(pred_lab))
    
    processed_true.extend(true_tokens)
    processed_pred.extend(pred_lab)

# Verify lengths match
assert len(processed_true) == len(processed_pred), \
    f"Mismatch in lengths: true={len(processed_true)}, pred={len(processed_pred)}"

# Print evaluation metrics
print("\nTest Set Evaluation:")
print("-" * 50)
print(classification_report(
    processed_true,
    processed_pred,
    labels=[l for l in label_list if l != 'O'],
    digits=4,
    zero_division=0
))

# Save detailed analysis with aligned predictions
with open('test_predictions_detailed.txt', 'w', encoding='utf-8') as f:
    for i, (sentence, true_lab, pred_lab) in enumerate(zip(test_sentences, test_labels, true_predictions)):
        true_tokens = true_lab.split()
        # Ensure predictions are aligned
        pred_aligned = pred_lab[:len(true_tokens)] if len(pred_lab) > len(true_tokens) else pred_lab + ['O'] * (len(true_tokens) - len(pred_lab))
        
        f.write(f"Example {i+1}:\n")
        f.write(f"Text: {sentence}\n")
        f.write(f"True: {true_lab}\n")
        f.write(f"Pred: {' '.join(pred_aligned)}\n")
        
        # Add token-level comparison
        f.write("\nToken-level comparison:\n")
        f.write("Token\tTrue\tPred\n")
        tokens = sentence.split()
        for token, true_label, pred_label in zip(tokens, true_tokens, pred_aligned):
            if true_label != pred_label:
                f.write(f"{token}\t{true_label}\t{pred_label}\n")
        f.write("-" * 50 + "\n\n")

# Save aligned predictions in CONLL format
with open('test_predictions.conll', 'w', encoding='utf-8') as f:
    for sentence, true_lab, pred_lab in zip(test_sentences, test_labels, true_predictions):
        tokens = sentence.split()
        true_tokens = true_lab.split()
        # Ensure predictions are aligned
        pred_aligned = pred_lab[:len(true_tokens)] if len(pred_lab) > len(true_tokens) else pred_lab + ['O'] * (len(true_tokens) - len(pred_lab))
        
        for token, pred in zip(tokens, pred_aligned):
            f.write(f"{token}\t_\t_\t{pred}\tO\n")
        f.write("\n")

print("\nPredictions have been saved to:")
print("1. test_predictions.conll - CONLL format")
print("2. test_predictions_detailed.txt - Detailed analysis")

Processing file: /kaggle/input/vlsp-2021-test/1_test_vnexpress_phapluat_0006.conll
Processing file: /kaggle/input/vlsp-2021-test/0_test_vnexpress_giaoduc_0003.conll
Processing file: /kaggle/input/vlsp-2021-test/0_test_vnexpress_phapluat_0002.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_dantri_batdongsan_0003.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_zingnews_giaitri_0005.conll
Processing file: /kaggle/input/vlsp-2021-test/1_test_vnexpress_phapluat_0003.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_zingnews_xahoi_0003.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_baomoi_giaoduc_0002.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_baomoi_xahoi_0003.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_dantri_suckhoe_0001.conll
Processing file: /kaggle/input/vlsp-2021-test/2_test_zingnews_xahoi_0005.conll
Processing file: /kaggle/input/vlsp-2021-test/1_test_vnexpress_doisong_0008.conll
Processing file: /kaggle/input/

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting evaluation on test set...



Detailed Classification Report:
                       precision    recall  f1-score   support

            B-ADDRESS       0.00      0.00      0.00        23
           B-DATETIME       0.49      0.55      0.52       625
      B-DATETIME-DATE       0.59      0.47      0.52       581
 B-DATETIME-DATERANGE       0.00      0.00      0.00       142
  B-DATETIME-DURATION       0.84      0.69      0.76       489
       B-DATETIME-SET       0.00      0.00      0.00         4
      B-DATETIME-TIME       0.62      0.80      0.70        54
 B-DATETIME-TIMERANGE       0.67      0.02      0.03       132
              B-EMAIL       0.00      0.00      0.00         2
              B-EVENT       0.62      0.13      0.22       181
          B-EVENT-CUL       0.00      0.00      0.00        16
     B-EVENT-GAMESHOW       0.14      0.05      0.08        57
      B-EVENT-NATURAL       0.00      0.00      0.00         9
        B-EVENT-SPORT       0.53      0.49      0.51       153
                 B-IP

In [28]:
def predict_ner(text):
    # Ensure the model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize input and move to the same device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(device)

    # Get predictions
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Convert predictions to labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    word_ids = inputs.word_ids()

    results = []
    current_word = ""
    current_label = None

    for token, pred_idx, word_idx in zip(tokens, predictions[0].cpu(), word_ids):
        if word_idx is None:
            continue

        if word_idx != current_label:
            if current_word:
                results.append((current_word.strip(), id2label[current_label]))
            current_word = token.replace('▁', '')
            current_label = pred_idx.item()
        else:
            current_word += token.replace('▁', '')

    if current_word:  # Add the last word
        results.append((current_word.strip(), id2label[current_label]))

    return results

# Test prediction
def test_prediction():
    test_text = "Xem Chung kết Sao Mai : Thí sinh hát 'hit' của danh ca Whitney Houston"
    predictions = predict_ner(test_text)
    print("\nTest Prediction:")
    print("Input:", test_text)
    print("\nPredictions:")
    for token, label in predictions:
        print(f"{token}: {label}")

# Run test prediction after training
test_prediction()



Test Prediction:
Input: Xem Chung kết Sao Mai : Thí sinh hát 'hit' của danh ca Whitney Houston

Predictions:
Xem: O
Chung: O
kết: O
Sao: B-ORGANIZATION-SPORTS
Mai: I-EVENT-GAMESHOW
:: O
Thí: O
sinh: O
hát: O
': O
h: O
it: O
': O
của: O
danh: B-PERSONTYPE
ca: I-PERSONTYPE
Whitney: B-PERSON
Houston: I-PERSON
