## Installs and imports

In [1]:
!pip install kagglehub

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import kagglehub
from kagglehub import KaggleDatasetAdapter

## Configuration and helpers

In [2]:
# Hugging Face model id used as the default starting checkpoint of fine_tune()
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"  # BERTimbau
# Single seed shared by set_seed() below and by the train_test_split calls
SEED = 42
set_seed(SEED)

def compute_metrics(pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``; the last
        three are computed for the positive class (``average='binary'``).
    """
    labels = pred.label_ids
    # Some models return a tuple of outputs; the class scores come first.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = logits.argmax(-1)
    # zero_division=0 matches compute_classification_metrics and avoids the
    # undefined-metric warning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one review per access and attaches its label.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of labels, same length as ``texts``; each is cast to ``int``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors='pt')``.
        max_length: Truncation length passed to the tokenizer.
        padding: Padding mode passed to the tokenizer. Defaults to ``False`` so
            samples keep their own length and the batch collator
            (``DataCollatorWithPadding`` in ``fine_tune``) pads each batch only
            to its longest member. Pass ``'max_length'`` to pad every sample to
            ``max_length`` instead.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512, padding=False):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer returns a batch of one; drop that leading dimension.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

## Loading, balancing and joining Datasets

In [5]:
### KAGGLE DATASET

# CSV file to read from the Kaggle dataset
file_path = "utlc_movies.csv"

# `kagglehub.load_dataset` is deprecated; `dataset_load` is its replacement
# and takes the same (adapter, dataset handle, file path) arguments.
df_kaggle_unbalanced = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "fredericods/ptbr-sentiment-analysis-datasets",
    file_path,
)

  df_kaggle_unbalanced = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS,


In [6]:
### HUGGINGFACE DATASET

# Read the dataset's train parquet file straight from the Hugging Face Hub (hf:// URL)
df_HF = pd.read_parquet("hf://datasets/AiresPucrs/sentiment-analysis-pt/data/train-00000-of-00001.parquet")

In [7]:
# Keep only the text and label columns, drop rows with missing values and
# renumber the index from 0. Chaining returns new frames instead of mutating a
# column slice in place, and drop=True removes the need to re-select columns.
df_kaggle_unbalanced = (
    df_kaggle_unbalanced[['review_text', 'polarity']]
    .dropna()
    .reset_index(drop=True)
)

In [8]:
### BALANCING THE KAGGLE DATASET

df_zero = df_kaggle_unbalanced[df_kaggle_unbalanced["polarity"] == 0.0]
df_one  = df_kaggle_unbalanced[df_kaggle_unbalanced["polarity"] == 1.0]

n0 = len(df_zero)  # number of polarity-0 rows
n1 = len(df_one)   # number of polarity-1 rows

# Downsample both classes to the size of the smaller one. Taking min() avoids
# assuming which class is the minority (sampling more rows than exist without
# replacement raises), and the fixed seed makes the subset reproducible.
n_per_class = min(n0, n1)
df_zero_downsampled = df_zero.sample(n=n_per_class, replace=False, random_state=SEED)
df_one_downsampled = df_one.sample(n=n_per_class, replace=False, random_state=SEED)

df_balanced = pd.concat([df_zero_downsampled, df_one_downsampled], axis=0)

# Shuffle the combined DataFrame to mix classes (seeded for reproducibility)
df_balanced = df_balanced.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

In [9]:
# Align the Hugging Face column names with the Kaggle frame before concatenating
df_HF = df_HF.rename(columns={'text': "review_text", "label": "polarity"})

In [10]:
# Cast the polarity column (compared against 0.0 / 1.0 when balancing) to integers
df_balanced = df_balanced.assign(polarity=df_balanced["polarity"].astype(int))

In [11]:
# Stack the balanced Kaggle rows on top of the Hugging Face rows
df = pd.concat([df_balanced, df_HF], axis="index")

In [12]:
# Shuffle the rows of both sources together and renumber the index. The seed
# makes the row order identical on every run.
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

In [13]:
# df_train, df_val = train_test_split(
#     df,
#     test_size=0.1,
#     stratify=df['polarity'],
#     random_state=SEED)

## Fine tune function

In [3]:
def fine_tune(
    df: pd.DataFrame,
    output_dir: str,
    epochs: int = 3,
    batch_size: int = 32,
    learning_rate: float = 3e-5,
    eval_strategy: str = "steps",
    eval_steps: int = 1500,
    save_strategy: str = "steps",
    freeze_base: bool = False,
    ewc_lambda: float = 0.0,
    test_size=0.1,
    model_name: str = MODEL_NAME):
    """Fine-tune a two-label sequence classifier on ``df`` and save it.

    Args:
        df: Frame with a ``review_text`` column (inputs) and a ``polarity``
            column (labels, also used to stratify the split).
        output_dir: Directory for checkpoints and for the final model and tokenizer.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        eval_strategy: Evaluation schedule passed to ``TrainingArguments``.
        eval_steps: Step interval used for both ``eval_steps`` and ``save_steps``.
        save_strategy: Checkpoint schedule passed to ``TrainingArguments``.
        freeze_base: If true, only parameters whose name starts with
            ``classifier`` remain trainable.
        ewc_lambda: When > 0, adds ``ewc_lambda * sum((w - w_start) ** 2)`` over
            the trainable parameters to the loss. Despite the name this is a
            plain L2 pull towards the starting weights, not Fisher-weighted EWC.
        test_size: Validation fraction passed to ``train_test_split``.
        model_name: Model id or local path to start from.

    Returns:
        None. The model and tokenizer are written to ``output_dir`` and a
        confirmation line is printed.
    """
    # Stratified split, seeded so the same rows are held out on every run
    df_train, df_val = train_test_split(
        df, test_size=test_size, stratify=df['polarity'], random_state=SEED
    )

    # Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    # Optional: freeze everything except the classification head
    if freeze_base:
        for name, param in model.named_parameters():
            if not name.startswith('classifier'):
                param.requires_grad = False

    # Datasets & DataCollator
    train_dataset = ReviewDataset(
        df_train['review_text'].tolist(),
        df_train['polarity'].tolist(),
        tokenizer
    )
    val_dataset = ReviewDataset(
        df_val['review_text'].tolist(),
        df_val['polarity'].tolist(),
        tokenizer
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        eval_strategy=eval_strategy,
        eval_steps=eval_steps,
        save_strategy=save_strategy,
        save_steps=eval_steps,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        save_total_limit=2,
        logging_steps=100
    )

    # Optional regularizer pulling the weights back towards their starting values
    compute_ewc = None
    if ewc_lambda > 0.0:
        # Snapshot only trainable parameters: frozen ones never move, so they
        # would contribute zero to the penalty.
        init_state = {
            name: param.clone().detach()
            for name, param in model.named_parameters()
            if param.requires_grad
        }

        def compute_ewc_loss():
            penalty = 0.0
            for name, param in model.named_parameters():
                if name not in init_state:
                    continue
                # Move the snapshot to whatever device the parameter is on
                ref = init_state[name].to(param.device)
                penalty = penalty + ((param - ref) ** 2).sum()
            return ewc_lambda * penalty

        # Bind the hook so EWCTrainer is selected below and the penalty is
        # actually added; without this assignment ewc_lambda has no effect.
        compute_ewc = compute_ewc_loss

    # Custom Trainer that adds the penalty to the model's own loss
    class EWCTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            outputs = model(**inputs)
            loss = outputs.loss
            if compute_ewc is not None:
                loss = loss + compute_ewc()
            return (loss, outputs) if return_outputs else loss

    trainer_cls = EWCTrainer if compute_ewc is not None else Trainer
    trainer = trainer_cls(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `processing_class` replaces the deprecated `tokenizer` argument
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train & save
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model fine-tuned and saved to {output_dir}")

## Inference helpers

In [3]:
def load_model_and_tokenizer(model_path: str):
    """Load a saved sequence classifier and its tokenizer from ``model_path``.

    Returns:
        ``(model, tokenizer)``; the model has been switched to evaluation mode.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    text_tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Switch to evaluation mode before handing the model back
    classifier.eval()
    return classifier, text_tokenizer


def predict_sentiment(texts: list[str], model, tokenizer, device: str = 'cpu',
                      batch_size: int = 32) -> list[int]:
    """Predict a class index for each text.

    Texts are tokenized and run through the model ``batch_size`` at a time, so
    memory use no longer grows with the total number of texts.

    Args:
        texts: Texts to classify. A single string is treated as a one-item list.
        model: Classifier called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``logits``. It is moved to ``device``; it is not
            switched to eval mode here (``load_model_and_tokenizer`` does that).
        tokenizer: Tokenizer matching ``model``.
        device: Device the model and inputs are moved to.
        batch_size: Number of texts per forward pass.

    Returns:
        One predicted class index per input text, in input order; ``[]`` for
        empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    model.to(device)
    predictions: list[int] = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors='pt'
        )
        with torch.no_grad():
            outputs = model(
                input_ids=enc['input_ids'].to(device),
                attention_mask=enc['attention_mask'].to(device)
            )
        predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
    return predictions

## Domain-specific Fine Tune dataset loading

In [5]:
# CSV of comments hosted on GitHub; pandas reads it directly from the URL
comments_link = 'https://raw.githubusercontent.com/joaocarvoli/nlp-symbolic-solution/refs/heads/main/data/all_comments.csv'
df_movies = pd.read_csv(comments_link)

In [6]:
# Binary label: 1 when numeric_rating >= 2.5, otherwise 0.
# (The threshold was changed from 3.0 to 2.5 to see how the results would turn out.)
# Only the comment text and the new label are kept.
df_movies = df_movies.assign(
    rating_label=(df_movies['numeric_rating'] >= 2.5).astype(int)
)[["comment", "rating_label"]]

In [7]:
# Use the column names fine_tune() expects: review_text / polarity
df_movies = df_movies.rename(columns={"comment": "review_text", "rating_label": "polarity"})

In [8]:
# Seeded, stratified hold-out of a 2704/7704 fraction of df_movies
# (presumably 2704 of 7704 comments — TODO confirm the row count).
# NOTE(review): the domain-adaptation call passes df_movies with the same
# test_size, and fine_tune() performs this same seeded, stratified split
# internally, so df_movies_test is expected to contain the same rows that run
# uses as its validation set — confirm that this reuse is intended.
df_movies_FT, df_movies_test = train_test_split(
    df_movies, test_size=2704/7704, stratify=df_movies['polarity'], random_state=SEED
)

## Fine Tune and domain-specific Fine Tune

In [None]:
# 1) Initial fine-tune
# Starts from fine_tune()'s default model_name (MODEL_NAME) and trains on the
# combined dataframe `df`; every argument not listed keeps its default.
fine_tune(
    df,
    output_dir='./bertimbau_finetuned',
    epochs=5,
    learning_rate=3e-5
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = trainer_cls(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1500,0.2832,0.301549,0.873566,0.840123,0.919018,0.877801
3000,0.2852,0.258185,0.890783,0.894968,0.882545,0.888713
4500,0.273,0.256933,0.895088,0.897414,0.889345,0.893361
6000,0.2512,0.256642,0.896976,0.876215,0.921715,0.89839
7500,0.2367,0.243494,0.900169,0.886157,0.91559,0.900633
9000,0.237,0.251065,0.891728,0.851363,0.946049,0.896212
10500,0.1853,0.273207,0.90053,0.878865,0.92638,0.901997
12000,0.1852,0.262046,0.902252,0.887838,0.918175,0.902752
13500,0.1915,0.258647,0.902224,0.90032,0.901989,0.901154
15000,0.203,0.252052,0.903002,0.894467,0.911206,0.902759


In [22]:
# 2) Domain adaptation on new data
# Continues from a checkpoint of the first run instead of the base model, with
# a smaller batch size and learning rate, evaluating and saving once per epoch.
# NOTE(review): the start point is a hard-coded intermediate checkpoint
# directory; the first run uses save_total_limit=2, so confirm that
# checkpoint-18000 exists before running this cell.
fine_tune(
    df_movies,
    model_name='./bertimbau_finetuned/checkpoint-18000',
    output_dir='./bertimbau_adapted_5',
    eval_strategy='epoch',
    save_strategy='epoch',
    epochs=5,
    learning_rate=1e-5,
    batch_size = 8,
    #eval_steps = 500,
    freeze_base=True,       # optional: freeze base BERT layers
    ewc_lambda=0.1,         # optional: small EWC regularization
    test_size=2704/7704      # same fraction as the df_movies_FT / df_movies_test split
)

  trainer = trainer_cls(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7565,0.738955,0.762204,0.856439,0.734903,0.79103
2,0.6049,0.534155,0.764053,0.849108,0.747585,0.795119
3,0.5324,0.50008,0.768491,0.835724,0.774155,0.803762
4,0.5591,0.493999,0.77071,0.82868,0.788647,0.808168
5,0.5219,0.493096,0.77145,0.828066,0.791063,0.809141


Model fine-tuned and saved to ./bertimbau_adapted_5


### Inference test

In [9]:
# Smoke test: load a saved model and classify two short Portuguese sentences.
# NOTE(review): the adaptation cell in this notebook writes to
# './bertimbau_adapted_5'; no cell shown here produces './bertimbau_adapted_3'
# — confirm which directory is intended.
model, tokenizer = load_model_and_tokenizer('./bertimbau_adapted_3')
sample_texts = ["Ótimo filme!", "Não gostei do roteiro."]
print(predict_sentiment(sample_texts, model, tokenizer))

[1, 0]


## Final evaluation of the model

### With only the first fine tune

In [12]:
# Convert the whole columns at once. Unlike looking rows up by label 0..n-1,
# this does not depend on df_movies having a default integer index.
texts = df_movies['review_text'].tolist()
labels = df_movies['polarity'].tolist()

In [22]:
# Predict over every df_movies comment with the current `model` / `tokenizer`.
# NOTE(review): the only cell shown that binds `model` is the inference test,
# which loads './bertimbau_adapted_3', while this section is titled "With only
# the first fine tune" — confirm which checkpoint should be loaded here.
results = predict_sentiment(texts, model, tokenizer)

In [10]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score)


def compute_classification_metrics(labels, predictions, probabilities=None):
    """Score predictions against true labels.

    Args:
        labels: True class labels.
        predictions: Predicted class labels.
        probabilities: Optional scores passed to ``roc_auc_score``.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc_score``; ``auc_score`` is ``None`` when no probabilities are given.
    """
    scores = {'accuracy': accuracy_score(labels, predictions)}

    # These three share the same call shape; undefined ratios are reported as 0.
    for key, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    ):
        scores[key] = scorer(labels, predictions, zero_division=0)

    # AUC needs scores rather than hard predictions, so it is optional.
    scores['auc_score'] = (
        None if probabilities is None else roc_auc_score(labels, probabilities)
    )
    return scores

In [24]:
# Score the predictions over all of df_movies; no probabilities are passed, so auc_score is None
metrics = compute_classification_metrics(labels, results)

### With the domain specific fine tune

In [12]:
# Renumber the held-out rows 0..n-1, discarding the index inherited from df_movies
df_movies_test = df_movies_test.reset_index(drop=True)

In [13]:
# Convert the whole columns at once. This works whatever index the held-out
# frame carries, unlike looking rows up by label 0..n-1.
texts_DS = df_movies_test['review_text'].tolist()
labels_DS = df_movies_test['polarity'].tolist()

In [33]:
# Predictions of `model` (not `model_adap`) on the held-out split; overwrites the earlier `results`
results = predict_sentiment(texts_DS, model, tokenizer)

In [14]:
# Load the model used as the domain-adapted one in this section.
# NOTE(review): the adaptation cell in this notebook saves to
# './bertimbau_adapted_5' — confirm that './bertimbau_adapted_3' is the intended directory.
model_adap, tokenizer_adap = load_model_and_tokenizer('./bertimbau_adapted_3')

In [15]:
# Predictions of the adapted model on the held-out split
results_DS_FT = predict_sentiment(texts_DS, model_adap, tokenizer_adap)

In [16]:
# Held-out metrics for the adapted model (auc_score is None: no probabilities are passed)
metrics_DS_FT = compute_classification_metrics(labels_DS, results_DS_FT)

In [37]:
# Held-out metrics for `model`, for comparison; overwrites the `metrics` computed on all of df_movies
metrics = compute_classification_metrics(labels_DS, results)

In [17]:
# Display the held-out metrics of the adapted model (`model_adap`)
metrics_DS_FT

{'accuracy': 0.7548076923076923,
 'precision': 0.8584615384615385,
 'recall': 0.763129102844639,
 'f1_score': 0.8079930495221547,
 'auc_score': None}

In [39]:
# Display the held-out metrics of `model`, to compare against metrics_DS_FT
metrics

{'accuracy': 0.7267011834319527,
 'precision': 0.8783877692842251,
 'recall': 0.6914660831509847,
 'f1_score': 0.7737985919804101,
 'auc_score': None}