In [1]:
!pip install evaluate



# Import libraries

In [2]:
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import evaluate
import os
import warnings

warnings.filterwarnings("ignore")

2025-08-24 09:06:18.211546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756026378.235103     168 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756026378.242060     168 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preprocessing

In [3]:
# Load preprocessed dataset
data = pd.read_csv('/kaggle/input/pretrained-data/preprocessed_results.csv')

# Encode the string sentiment as integer class ids (positive -> 1, negative -> 0).
sentiment_ids = data['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map yields NaN for every value missing from the mapping; fail loudly
# here instead of letting NaN labels flow into the split and the training loop.
if sentiment_ids.isna().any():
    unexpected_values = data.loc[sentiment_ids.isna(), 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment values (cannot be mapped to a label): {unexpected_values}")

data['sentiment'] = sentiment_ids.astype(int)

In [4]:
# Stratified 70/30 split first: `train_data` keeps 70% of the rows and
# `temp_data` holds the remaining 30% with the same class balance.
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['sentiment'],
)

# Halve the held-out 30% (again stratified) into validation and test sets.
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['sentiment'],
)

In [5]:
# Persist each split as a CSV file (without the pandas index column)
os.makedirs('/kaggle/working/datasets', exist_ok=True)
for csv_path, split_frame in (
    ('/kaggle/working/datasets/train_data.csv', train_data),
    ('/kaggle/working/datasets/val_data.csv', val_data),
    ('/kaggle/working/datasets/test_data.csv', test_data),
):
    split_frame.to_csv(csv_path, index=False)

In [6]:
# Convert to DatasetDict
def _to_hf_dataset(frame):
    """Build a `Dataset` with `text` / `labels` columns from one pandas split.

    Args:
        frame: DataFrame holding the `preprocessed_tokens` and `sentiment` columns.

    Returns:
        A `Dataset` whose columns are `text` and `labels`.
    """
    columns = frame[['preprocessed_tokens', 'sentiment']].rename(
        columns={'preprocessed_tokens': 'text', 'sentiment': 'labels'}
    )
    # preserve_index=False: after train_test_split the frames carry a shuffled,
    # non-range index, which from_pandas would otherwise store as an extra
    # `__index_level_0__` column in every split.
    return Dataset.from_pandas(columns, preserve_index=False)


raw_datasets = DatasetDict({
    'train': _to_hf_dataset(train_data),
    'valid': _to_hf_dataset(val_data),
    'test': _to_hf_dataset(test_data),
})

# Define functions 

In [7]:
# Function to compute metrics
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a `(logits, labels)` pair.

    Args:
        eval_pred: Pair whose first item holds the model logits (class scores on
            the last axis) and whose second item holds the reference labels.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Some models return several outputs as a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # Metrics are cached so that repeated evaluations (one per epoch plus the
    # final validation/test passes) do not reload them every time.
    acc = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

In [8]:
# Function to fine-tune models
def fine_tune_model(model_name, tokenizer_class, model_class, output_dir, tokenized_datasets):
    """Fine-tune a sequence-classification checkpoint and report its metrics.

    Loads the tokenizer and a 2-label model from ``model_name``, trains for three
    epochs on the ``'train'`` split (evaluating on ``'valid'`` after every epoch),
    prints the validation and test metrics, saves the model and tokenizer to
    ``output_dir`` and prints a classification report for the test split.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        tokenizer_class: Class exposing ``from_pretrained`` for the tokenizer.
        model_class: Class exposing ``from_pretrained`` for the model.
        output_dir: Directory used by ``TrainingArguments`` and for the saved
            model and tokenizer.
        tokenized_datasets: Mapping with ``'train'``, ``'valid'`` and ``'test'``
            splits; the test split must expose a ``'labels'`` column.

    Returns:
        None. All results are printed.
    """
    tokenizer = tokenizer_class.from_pretrained(model_name)
    # ignore_mismatched_sizes lets a checkpoint whose classification head has a
    # different number of labels be loaded with a freshly initialised 2-label head.
    model = model_class.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

    # NOTE(review): max_length=256 here differs from the 512 used when the data
    # is tokenized in tokenize_function -- confirm which value is intended.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=256)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        eval_strategy="epoch",
        weight_decay=5e-4,
        optim="adamw_torch",
        learning_rate=5e-5,
        save_strategy="no",  # no intermediate checkpoints; the model is saved once below
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['valid'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on the validation set
    val_results = trainer.evaluate()
    print(f"Evaluation results of the fine-tuned model ({model_name}) on the validation set: {val_results}")

    # Run the test split through the model a single time: the same pass
    # provides both the metrics and the raw predictions needed for the
    # classification report. metric_key_prefix="eval" keeps the "eval_*" metric
    # names used for the validation results.
    test_output = trainer.predict(tokenized_datasets['test'], metric_key_prefix="eval")
    test_results = test_output.metrics
    print(f"Evaluation results of the fine-tuned model ({model_name}) on the test set: {test_results}")

    # Save the fine-tuned model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Fine-tuned model {model_name} has been saved to {output_dir}")

    # Classification report built from the predictions obtained above
    y_pred = np.argmax(test_output.predictions, axis=-1)
    y_true = tokenized_datasets['test']['labels']
    print(f"\nClassification Report (fine-tuned {model_name}):")
    print(classification_report(y_true, y_pred, digits=3))

In [9]:
# Function to tokenize data
def tokenize_function(examples, tokenizer, max_length=512, padding='max_length'):
    """Tokenize the ``'text'`` field of a batch with the given tokenizer.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts to encode.
        tokenizer: Callable tokenizer applied to the texts.
        max_length: Length passed to the tokenizer for truncation/padding
            (defaults to 512).
        padding: Padding strategy passed to the tokenizer (defaults to
            ``'max_length'``, i.e. every sequence is padded to ``max_length``).

    Returns:
        Whatever the tokenizer returns for the batch; token type ids are not
        requested.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_token_type_ids=False,
    )

# DistilBERT settings: checkpoint, tokenizer/model classes and save location
distilbert_config = dict(
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer_class=DistilBertTokenizer,
    model_class=DistilBertForSequenceClassification,
    output_dir="/kaggle/working/distilbert_finetuned",
)

# Twitter RoBERTa settings: same keys, resolved through the Auto* classes
twitter_roberta_config = dict(
    model_name="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer_class=AutoTokenizer,
    model_class=AutoModelForSequenceClassification,
    output_dir="/kaggle/working/twitter_roberta_finetuned",
)

# Fine-tune models

## 1. DistilBERT

In [10]:
print("\nDistilBERT")
# Instantiate the DistilBERT tokenizer described by its config entry
distilbert_tokenizer = distilbert_config['tokenizer_class'].from_pretrained(
    distilbert_config['model_name']
)

# Tokenize every split in batches, then drop the raw text column
distilbert_tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True, fn_kwargs={'tokenizer': distilbert_tokenizer})
    .remove_columns(['text'])
)


DistilBERT


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [11]:
# Fine-tuning DistilBERT: the config keys match fine_tune_model's parameter
# names, so the dict is unpacked directly as keyword arguments.
fine_tune_model(
    tokenized_datasets=distilbert_tokenized_datasets,
    **distilbert_config,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3068,0.312422,0.8936,0.893575
2,0.203,0.398923,0.904133,0.904125
3,0.0993,0.463527,0.908,0.907996


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation results of the fine-tuned model (distilbert-base-uncased-finetuned-sst-2-english) on the validation set: {'eval_loss': 0.46352747082710266, 'eval_accuracy': 0.908, 'eval_f1': 0.9079955772451711, 'eval_runtime': 61.3986, 'eval_samples_per_second': 122.153, 'eval_steps_per_second': 15.277, 'epoch': 3.0}
Evaluation results of the fine-tuned model (distilbert-base-uncased-finetuned-sst-2-english) on the test set: {'eval_loss': 0.4226415455341339, 'eval_accuracy': 0.9141333333333334, 'eval_f1': 0.914133235636037, 'eval_runtime': 62.0033, 'eval_samples_per_second': 120.961, 'eval_steps_per_second': 15.128, 'epoch': 3.0}
Fine-tuned model distilbert-base-uncased-finetuned-sst-2-english has been saved to /kaggle/working/distilbert_finetuned

Classification Report (fine-tuned distilbert-base-uncased-finetuned-sst-2-english):
              precision    recall  f1-score   support

           0      0.915     0.913     0.914      3750
           1      0.913     0.915     0.914      3750

## 2. Twitter RoBERTa

In [12]:
print("\nTwitter RoBERTa")
# Instantiate the Twitter RoBERTa tokenizer described by its config entry
twitter_roberta_tokenizer = twitter_roberta_config['tokenizer_class'].from_pretrained(
    twitter_roberta_config['model_name']
)

# Tokenize every split in batches, then drop the raw text column
twitter_roberta_tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True, fn_kwargs={'tokenizer': twitter_roberta_tokenizer})
    .remove_columns(['text'])
)


Twitter RoBERTa


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [13]:
# Fine-tuning Twitter RoBERTa: the config keys match fine_tune_model's
# parameter names, so the dict is unpacked directly as keyword arguments.
fine_tune_model(
    tokenized_datasets=twitter_roberta_tokenized_datasets,
    **twitter_roberta_config,
)

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3467,0.333933,0.894533,0.894532
2,0.2687,0.330771,0.904933,0.904927
3,0.1838,0.391748,0.909467,0.909415


Evaluation results of the fine-tuned model (cardiffnlp/twitter-roberta-base-sentiment-latest) on the validation set: {'eval_loss': 0.3917475640773773, 'eval_accuracy': 0.9094666666666666, 'eval_f1': 0.9094150678789316, 'eval_runtime': 118.7401, 'eval_samples_per_second': 63.163, 'eval_steps_per_second': 7.9, 'epoch': 3.0}
Evaluation results of the fine-tuned model (cardiffnlp/twitter-roberta-base-sentiment-latest) on the test set: {'eval_loss': 0.3520101010799408, 'eval_accuracy': 0.9194666666666667, 'eval_f1': 0.9194480559455123, 'eval_runtime': 118.6957, 'eval_samples_per_second': 63.187, 'eval_steps_per_second': 7.903, 'epoch': 3.0}
Fine-tuned model cardiffnlp/twitter-roberta-base-sentiment-latest has been saved to /kaggle/working/twitter_roberta_finetuned

Classification Report (fine-tuned cardiffnlp/twitter-roberta-base-sentiment-latest):
              precision    recall  f1-score   support

           0      0.907     0.935     0.921      3750
           1      0.933     0.904  