In [1]:
!pip install evaluate

import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import evaluate
import os

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

2025-07-31 13:31:30.692104: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753968691.065047      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753968691.168845      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Đọc và xử lý dữ liệu
data = pd.read_csv('/kaggle/input/pretrained-data/preprocessed_results.csv')
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [3]:
# Chia tập train, validation, test
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data['sentiment'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['sentiment'])

In [4]:
# Lưu các tập dữ liệu vào CSV
os.makedirs('/kaggle/working/datasets', exist_ok=True)
train_data.to_csv('/kaggle/working/datasets/train_data.csv', index=False)
val_data.to_csv('/kaggle/working/datasets/val_data.csv', index=False)
test_data.to_csv('/kaggle/working/datasets/test_data.csv', index=False)

In [5]:
# Chuyển thành DatasetDict
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_data[['preprocessed_tokens', 'sentiment']].rename(columns={'preprocessed_tokens': 'text', 'sentiment': 'labels'})),
    'valid': Dataset.from_pandas(val_data[['preprocessed_tokens', 'sentiment']].rename(columns={'preprocessed_tokens': 'text', 'sentiment': 'labels'})),
    'test': Dataset.from_pandas(test_data[['preprocessed_tokens', 'sentiment']].rename(columns={'preprocessed_tokens': 'text', 'sentiment': 'labels'})),
})

In [6]:
# Định nghĩa hàm compute_metrics
def compute_metrics(eval_pred):
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

In [7]:
# Hàm để đánh giá mô hình trước khi fine-tune
def evaluate_pre_finetuned_model(model_name, tokenizer_class, model_class, tokenized_datasets):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=256)
    
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"/kaggle/working/{model_name.split('/')[-1]}_prefinetuned",
            eval_strategy="epoch",
            report_to="none"
        ),
        eval_dataset=tokenized_datasets['test'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    # Đánh giá trên tập test
    test_results = trainer.evaluate()
    print(f"Kết quả đánh giá pre-fine-tuned model ({model_name}) trên tập test: {test_results}")
    
    # Dự đoán để tạo classification report
    y_pred = trainer.predict(tokenized_datasets['test']).predictions
    y_pred = np.argmax(y_pred, axis=-1)
    y_true = tokenized_datasets['test']['labels']
    print(f"\nClassification Report pre-fine-tuned ({model_name}):")
    print(classification_report(y_true, y_pred, digits=3))

In [8]:
# # Hàm để fine-tune mô hình
# def fine_tune_model(model_name, tokenizer_class, model_class, output_dir):
#     # Khởi tạo tokenizer và model
#     tokenizer = tokenizer_class.from_pretrained(model_name)
#     model = model_class.from_pretrained(model_name, num_labels=2)

#     # Tokenize dữ liệu
#     def tokenize_function(examples):
#         return tokenizer(examples['text'], truncation=True)

#     tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
#     tokenized_datasets = tokenized_datasets.remove_columns(['text'])

#     # Thiết lập DataCollator
#     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#     # Thiết lập TrainingArguments
#     training_args = TrainingArguments(
#         output_dir=output_dir,
#         num_train_epochs=3,
#         eval_strategy="epoch",
#         weight_decay=5e-4,
#         optim="adamw_torch",
#         learning_rate=5e-5,
#         save_strategy="no",
#         fp16=True,
#         push_to_hub=False,
#         report_to="none"
#     )

#     # Khởi tạo Trainer
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_datasets['train'],
#         eval_dataset=tokenized_datasets['valid'],
#         data_collator=data_collator,
#         tokenizer=tokenizer,
#         compute_metrics=compute_metrics
#     )

#     # Huấn luyện mô hình
#     trainer.train()

#     # Đánh giá trên tập validation
#     val_results = trainer.evaluate()
#     print(f"Kết quả đánh giá trên tập validation ({model_name}): {val_results}")

#     # Đánh giá trên tập test
#     test_results = trainer.evaluate(tokenized_datasets['test'])
#     print(f"Kết quả đánh giá trên tập test ({model_name}): {test_results}")

#     # Lưu mô hình
#     model.save_pretrained(output_dir)
#     tokenizer.save_pretrained(output_dir)
#     print(f"Mô hình {model_name} đã được lưu tại {output_dir}")

#     # Dự đoán trên tập test để có classification report
#     y_pred = trainer.predict(tokenized_datasets['test']).predictions
#     y_pred = np.argmax(y_pred, axis=-1)
#     y_true = tokenized_datasets['test']['labels']
#     print(f"\nClassification Report ({model_name}):")
#     print(classification_report(y_true, y_pred, digits=3))

# Hàm để fine-tune và đánh giá mô hình
def fine_tune_model(model_name, tokenizer_class, model_class, output_dir, tokenized_datasets):
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=256)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        eval_strategy="epoch",
        weight_decay=5e-4,
        optim="adamw_torch",
        learning_rate=5e-5,
        save_strategy="no",
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['valid'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Huấn luyện mô hình
    trainer.train()

    # Đánh giá trên tập validation
    val_results = trainer.evaluate()
    print(f"Kết quả đánh giá fine-tuned model ({model_name}) trên tập validation: {val_results}")

    # Đánh giá trên tập test
    test_results = trainer.evaluate(tokenized_datasets['test'])
    print(f"Kết quả đánh giá fine-tuned model ({model_name}) trên tập test: {test_results}")

    # Lưu mô hình
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Mô hình fine-tuned {model_name} đã được lưu tại {output_dir}")

    # Dự đoán để tạo classification report
    y_pred = trainer.predict(tokenized_datasets['test']).predictions
    y_pred = np.argmax(y_pred, axis=-1)
    y_true = tokenized_datasets['test']['labels']
    print(f"\nClassification Report fine-tuned ({model_name}):")
    print(classification_report(y_true, y_pred, digits=3))

In [9]:
# Tokenize dữ liệu cho cả hai mô hình
def tokenize_function(examples, tokenizer):
    # return tokenizer(examples['text'], truncation=True)
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512, return_token_type_ids=False)

# Cấu hình mô hình DistilBERT
distilbert_config = {
    'model_name': "distilbert-base-uncased-finetuned-sst-2-english",
    'tokenizer_class': DistilBertTokenizer,
    'model_class': DistilBertForSequenceClassification,
    'output_dir': "/kaggle/working/distilbert_finetuned"
}

# Cấu hình mô hình RoBERTa
twitter_roberta_config = {
    'model_name': "cardiffnlp/twitter-roberta-base-sentiment-latest",
    'tokenizer_class': AutoTokenizer,
    'model_class': AutoModelForSequenceClassification,
    'output_dir': "/kaggle/working/twitter_roberta_finetuned"
}

# # Tokenize dữ liệu cho từng mô hình
# for config in [distilbert_config, roberta_config]:
#     tokenizer = config['tokenizer_class'].from_pretrained(config['model_name'])
#     tokenized_datasets = raw_datasets.map(lambda x: tokenize_function(x, tokenizer), batched=True)
#     tokenized_datasets = tokenized_datasets.remove_columns(['text'])
    
#     print(f"\nĐánh giá pre-fine-tuned model: {config['model_name']}")
#     evaluate_pre_finetuned_model(config['model_name'], config['tokenizer_class'], config['model_class'], tokenized_datasets)
    
#     print(f"\nFine-tuning model: {config['model_name']}")
#     fine_tune_model(config['model_name'], config['tokenizer_class'], config['model_class'], config['output_dir'], tokenized_datasets)

In [10]:
# Xử lý DistilBERT
print("\n=== Xử lý DistilBERT ===")
# Tokenize dữ liệu cho DistilBERT
distilbert_tokenizer = distilbert_config['tokenizer_class'].from_pretrained(distilbert_config['model_name'])
distilbert_tokenized_datasets = raw_datasets.map(lambda x: tokenize_function(x, distilbert_tokenizer), batched=True)
distilbert_tokenized_datasets = distilbert_tokenized_datasets.remove_columns(['text'])


=== Xử lý DistilBERT ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [11]:
# Đánh giá pre-fine-tuned DistilBERT
print("\nĐánh giá pre-fine-tuned model: distilbert-base-uncased-finetuned-sst-2-english")
evaluate_pre_finetuned_model(
    distilbert_config['model_name'],
    distilbert_config['tokenizer_class'],
    distilbert_config['model_class'],
    distilbert_tokenized_datasets
)


Đánh giá pre-fine-tuned model: distilbert-base-uncased-finetuned-sst-2-english


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  trainer = Trainer(


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Kết quả đánh giá pre-fine-tuned model (distilbert-base-uncased-finetuned-sst-2-english) trên tập test: {'eval_loss': 0.7170329689979553, 'eval_accuracy': 0.7956, 'eval_f1': 0.7919230151894299, 'eval_runtime': 64.5374, 'eval_samples_per_second': 116.212, 'eval_steps_per_second': 7.267}





Classification Report pre-fine-tuned (distilbert-base-uncased-finetuned-sst-2-english):
              precision    recall  f1-score   support

           0      0.734     0.929     0.820      3750
           1      0.903     0.663     0.764      3750

    accuracy                          0.796      7500
   macro avg      0.818     0.796     0.792      7500
weighted avg      0.818     0.796     0.792      7500



In [12]:
# Fine-tuning DistilBERT
print("\nFine-tuning model: distilbert-base-uncased-finetuned-sst-2-english")
fine_tune_model(
    distilbert_config['model_name'],
    distilbert_config['tokenizer_class'],
    distilbert_config['model_class'],
    distilbert_config['output_dir'],
    distilbert_tokenized_datasets
)


Fine-tuning model: distilbert-base-uncased-finetuned-sst-2-english


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2723,0.250442,0.905067,0.905063
2,0.3567,0.947509,0.902133,0.902126
3,0.2467,1.401354,0.906267,0.906265




Kết quả đánh giá fine-tuned model (distilbert-base-uncased-finetuned-sst-2-english) trên tập validation: {'eval_loss': 1.4013543128967285, 'eval_accuracy': 0.9062666666666667, 'eval_f1': 0.9062648519542005, 'eval_runtime': 68.5727, 'eval_samples_per_second': 109.373, 'eval_steps_per_second': 6.839, 'epoch': 3.0}




Kết quả đánh giá fine-tuned model (distilbert-base-uncased-finetuned-sst-2-english) trên tập test: {'eval_loss': 1.3239593505859375, 'eval_accuracy': 0.9113333333333333, 'eval_f1': 0.9113332560947476, 'eval_runtime': 68.4343, 'eval_samples_per_second': 109.594, 'eval_steps_per_second': 6.853, 'epoch': 3.0}
Mô hình fine-tuned distilbert-base-uncased-finetuned-sst-2-english đã được lưu tại /kaggle/working/distilbert_finetuned





Classification Report fine-tuned (distilbert-base-uncased-finetuned-sst-2-english):
              precision    recall  f1-score   support

           0      0.912     0.910     0.911      3750
           1      0.911     0.912     0.911      3750

    accuracy                          0.911      7500
   macro avg      0.911     0.911     0.911      7500
weighted avg      0.911     0.911     0.911      7500



In [13]:
# Xử lý Twitter RoBERTa
print("\n=== Xử lý Twitter RoBERTa ===")
# Tokenize dữ liệu cho Twitter RoBERTa
twitter_roberta_tokenizer = twitter_roberta_config['tokenizer_class'].from_pretrained(twitter_roberta_config['model_name'])
twitter_roberta_tokenized_datasets = raw_datasets.map(lambda x: tokenize_function(x, twitter_roberta_tokenizer), batched=True)
twitter_roberta_tokenized_datasets = twitter_roberta_tokenized_datasets.remove_columns(['text'])


=== Xử lý Twitter RoBERTa ===


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [14]:
# Đánh giá pre-fine-tuned Twitter RoBERTa
print("\nĐánh giá pre-fine-tuned model: cardiffnlp/twitter-roberta-base-sentiment-latest")
evaluate_pre_finetuned_model(
    twitter_roberta_config['model_name'],
    twitter_roberta_config['tokenizer_class'],
    twitter_roberta_config['model_class'],
    twitter_roberta_tokenized_datasets
)


Đánh giá pre-fine-tuned model: cardiffnlp/twitter-roberta-base-sentiment-latest


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]



Kết quả đánh giá pre-fine-tuned model (cardiffnlp/twitter-roberta-base-sentiment-latest) trên tập test: {'eval_loss': 0.7688709497451782, 'eval_accuracy': 0.2976, 'eval_f1': 0.26086555692611063, 'eval_runtime': 128.9806, 'eval_samples_per_second': 58.148, 'eval_steps_per_second': 3.636}





Classification Report pre-fine-tuned (cardiffnlp/twitter-roberta-base-sentiment-latest):
              precision    recall  f1-score   support

           0      0.135     0.075     0.096      3750
           1      0.360     0.521     0.426      3750

    accuracy                          0.298      7500
   macro avg      0.247     0.298     0.261      7500
weighted avg      0.247     0.298     0.261      7500



In [15]:
# Fine-tuning Twitter RoBERTa
print("\nFine-tuning model: cardiffnlp/twitter-roberta-base-sentiment-latest")
fine_tune_model(
    twitter_roberta_config['model_name'],
    twitter_roberta_config['tokenizer_class'],
    twitter_roberta_config['model_class'],
    twitter_roberta_config['output_dir'],
    twitter_roberta_tokenized_datasets
)


Fine-tuning model: cardiffnlp/twitter-roberta-base-sentiment-latest


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2887,0.288155,0.901333,0.901302
2,0.2707,0.351296,0.9112,0.911192
3,0.4202,1.190983,0.9164,0.916389




Kết quả đánh giá fine-tuned model (cardiffnlp/twitter-roberta-base-sentiment-latest) trên tập validation: {'eval_loss': 1.1909832954406738, 'eval_accuracy': 0.9164, 'eval_f1': 0.9163892606650366, 'eval_runtime': 126.8, 'eval_samples_per_second': 59.148, 'eval_steps_per_second': 3.699, 'epoch': 3.0}




Kết quả đánh giá fine-tuned model (cardiffnlp/twitter-roberta-base-sentiment-latest) trên tập test: {'eval_loss': 1.086658239364624, 'eval_accuracy': 0.9237333333333333, 'eval_f1': 0.9237321130471421, 'eval_runtime': 126.9619, 'eval_samples_per_second': 59.073, 'eval_steps_per_second': 3.694, 'epoch': 3.0}
Mô hình fine-tuned cardiffnlp/twitter-roberta-base-sentiment-latest đã được lưu tại /kaggle/working/twitter_roberta_finetuned





Classification Report fine-tuned (cardiffnlp/twitter-roberta-base-sentiment-latest):
              precision    recall  f1-score   support

           0      0.920     0.928     0.924      3750
           1      0.927     0.920     0.923      3750

    accuracy                          0.924      7500
   macro avg      0.924     0.924     0.924      7500
weighted avg      0.924     0.924     0.924      7500

