In [None]:
pip install transformers torch pandas scikit-learn rouge_score

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Trainer,
    TrainingArguments,
    AutoModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import os
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
class DialectDataset(Dataset):
    """Torch dataset that tokenizes sentence pairs for seq2seq fine-tuning.

    Each item is built from one row of ``data``: the ``'target'`` column is
    tokenized as the model input and the ``'input'`` column is tokenized as
    the labels the model must generate.

    NOTE(review): the column names read as if they were swapped relative to
    their roles (model input comes from 'target'). This may be a deliberate
    choice of translation direction -- confirm against the CSV before changing.

    Args:
        data: pandas DataFrame with 'input' and 'target' columns.
        tokenizer: callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask' tensors of shape (1, max_length).
        max_length: length every sequence is padded/truncated to.
    """

    # Label value that the cross-entropy loss skips; padding positions in the
    # labels are set to it so they do not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string to fixed-length 'pt' tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at positional index ``idx``.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels'
            (padding masked with ``IGNORE_INDEX``), plus the raw
            'source_text' and 'target_text' strings used by the evaluation
            code.
        """
        row = self.data.iloc[idx]
        source_text = str(row['target'])
        target_text = str(row['input'])

        source_encoding = self._encode(source_text)
        target_encoding = self._encode(target_text)

        # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
        labels = target_encoding['input_ids'].squeeze(0).clone()
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            # Without this the model is also trained to predict padding,
            # which dominates the loss for short sentences.
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': source_encoding['input_ids'].squeeze(0),
            'attention_mask': source_encoding['attention_mask'].squeeze(0),
            'labels': labels,
            'source_text': source_text,
            'target_text': target_text
        }

In [None]:
class _WhitespaceTokenizer:
    """Tokenizer for ``rouge_scorer.RougeScorer``: lower-case, split on whitespace.

    The package's default tokenizer keeps only ``[a-z0-9]`` characters, which
    breaks accented (e.g. Vietnamese) words apart and makes the scores
    meaningless for non-English text.
    """

    def tokenize(self, text):
        return text.lower().split()


def evaluate_metrics(model, tokenizer, dataset, device, max_length=128):
    """Compute corpus BLEU and mean ROUGE F1 of the model over ``dataset``.

    Relies on the notebook-level ``predict`` function, so the cell defining
    it must have been run before this function is called.

    Args:
        model: seq2seq model passed through to ``predict``.
        tokenizer: tokenizer passed through to ``predict``.
        dataset: iterable of items exposing 'source_text' and 'target_text'.
        device: device passed through to ``predict``.
        max_length: maximum length passed through to ``predict``.

    Returns:
        (bleu_score, rouge_scores): the corpus-level BLEU score and a dict
        with the mean F-measure for 'rouge1', 'rouge2' and 'rougeL'.
    """
    model.eval()
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    predictions = []
    references = []

    for item in dataset:
        pred_text = predict(model, tokenizer, item['source_text'], device, max_length)
        predictions.append(pred_text.split())
        # corpus_bleu expects a list of reference token lists per hypothesis.
        references.append([item['target_text'].split()])

    if not predictions:
        # Nothing to score: report zeros explicitly.
        return 0.0, {key: 0.0 for key in rouge_types}

    smoothie = SmoothingFunction().method4
    bleu_score = corpus_bleu(references, predictions, smoothing_function=smoothie)

    # A custom tokenizer replaces the default one (and its English-only
    # stemmer), so non-ASCII words are scored as whole tokens.
    scorer = rouge_scorer.RougeScorer(rouge_types, tokenizer=_WhitespaceTokenizer())
    totals = {key: 0.0 for key in rouge_types}
    for pred_tokens, ref_group in zip(predictions, references):
        scores = scorer.score(' '.join(ref_group[0]), ' '.join(pred_tokens))
        for key in rouge_types:
            totals[key] += scores[key].fmeasure
    rouge_scores = {key: totals[key] / len(predictions) for key in rouge_types}

    return bleu_score, rouge_scores

In [None]:
# Select the compute device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
# NOTE(review): these two names are not referenced in the visible cells; the
# import is kept in case a later cell relies on it.
from transformers import MT5Tokenizer, MT5ForConditionalGeneration

# Load the tokenizer and the model
MODEL_NAME = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Alternative checkpoint:
# tokenizer = AutoTokenizer.from_pretrained("vinai/bartPho-word")
# model = AutoModelForSeq2SeqLM.from_pretrained("vinai/bartPho-word")

# Load the data
DATA_PATH = '/kaggle/input/bart-test/sentence_for_training2 - sentences (2).csv (1).csv'
raw_data = pd.read_csv(DATA_PATH)
# DialectDataset converts cells with str(), so a missing value would be
# trained on as the literal text 'nan'; drop incomplete pairs up front.
data = raw_data.dropna(subset=['input', 'target']).reset_index(drop=True)
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Build the datasets
train_dataset = DialectDataset(train_data, tokenizer)
val_dataset = DialectDataset(val_data, tokenizer)

In [None]:
# Training configuration: evaluate once per epoch, keep only the best
# checkpoint (lowest eval_loss), and stop early when eval_loss has not
# improved for 5 consecutive evaluations.
training_args = TrainingArguments(
    output_dir='/kaggle/working/finetuned_bart',
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.05,
    logging_dir='/kaggle/working/logs',
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='best',
    save_total_limit=1,
    # Reload the best checkpoint when training ends. Otherwise `model` is
    # left in its last state, which with early stopping is several epochs
    # past the best one, and that is what would be saved and evaluated.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to='none',
    learning_rate=3e-5,
)

# NOTE(review): newer transformers releases name this argument
# `processing_class`; `tokenizer=` is kept to match the installed version --
# confirm before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

# Write the model weights and the tokenizer files into the same directory.
output_dir = '/kaggle/working/finetuned_vit5'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f'Model saved to {output_dir}')

In [None]:
# Prediction helper
def predict(model, tokenizer, text, device, max_length=256):
    """Generate the model's output text for a single input string.

    Args:
        model: seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer used to encode ``text`` and decode the output.
        text: the input sentence.
        device: device the input tensors are moved to; the model is expected
            to already live on it.
        max_length: truncation length for the input and the maximum length
            of the generated sequence.

    Returns:
        The decoded best beam with special tokens removed.
    """
    model.eval()
    # A single sequence needs no padding, so only truncation is requested.
    encoded = tokenizer(text, max_length=max_length, truncation=True, return_tensors='pt')
    # Forward only the tensors generate() consumes: some tokenizers also emit
    # token_type_ids, which generate() rejects as an unused argument.
    generate_inputs = {
        name: tensor.to(device)
        for name, tensor in encoded.items()
        if name in ('input_ids', 'attention_mask')
    }
    outputs = model.generate(**generate_inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Score the fine-tuned model on the validation split and print the metrics.
bleu_score, rouge_scores = evaluate_metrics(model, tokenizer, val_dataset, device)
report_lines = [
    "\nĐánh giá mô hình trên tập validation:",
    f"BLEU Score: {bleu_score:.4f}",
    f"ROUGE-1 F1: {rouge_scores['rouge1']:.4f}",
    f"ROUGE-2 F1: {rouge_scores['rouge2']:.4f}",
    f"ROUGE-L F1: {rouge_scores['rougeL']:.4f}",
]
print("\n".join(report_lines))

# Qualitative check: run the model on hand-picked sentences.
test_texts = [
    "Cần đảm bảo mật độ gieo sạ hợp lý để cây lúa sinh trưởng đồng đều và đạt năng suất cao",
]
print("\nKết quả dự đoán:")
for text in test_texts:
    prediction = predict(model, tokenizer, text, device)
    print(f"Input: {text}", f"Output: {prediction}\n", sep="\n")