In [1]:
!pip install transformers
!sudo apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/adalbertojunior/distilbert-portuguese-cased




git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Git LFS initialized.
Cloning into 'distilbert-portuguese-cased'...
remote: Enumerating objects: 21, done.[K
remote: Total 21 (delta 0), reused 0 (delta 0), pack-reused 21 (from 1)[K
Unpacking objects: 100% (21/21), 100.55 KiB | 6.28 MiB/s, done.
Filtering content: 100% (2/2), 506.62 MiB | 72.97 MiB/s, done.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset
from sklearn import metrics
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

In [3]:
df = pd.read_csv("../data/hatebr_and_rationales.csv")

In [4]:
data = pd.DataFrame()
data['normalized_text'] = df['normalized_text']
data['labels'] = df['label final'].astype(int)

In [5]:
## split to train and val
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1


train_df, test_df = train_test_split(data, test_size=TEST_SIZE + VAL_SIZE, random_state=0)
test_df, val_df = train_test_split(test_df, test_size=VAL_SIZE/(TEST_SIZE + VAL_SIZE), random_state=0)

In [6]:
tokenizer = AutoTokenizer.from_pretrained('adalbertojunior/distilbert-portuguese-cased')

# Convert datasets to tokenized format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

def tokenize_data(examples):
    return tokenizer(examples["normalized_text"], truncation=True,  padding='max_length', 
        max_length=128)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_val = val_dataset.map(tokenize_data, batched=True)

tokenizer_config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("adalbertojunior/distilbert-portuguese-cased")

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at adalbertojunior/distilbert-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/working/results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    logging_strategy="epoch",
    save_total_limit=3,  # Keep only the last 3 checkpoints
    logging_dir='/working/logs',
    report_to=[]
)

# Define Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained('/working/distilbertimbau/model')



Epoch,Training Loss,Validation Loss
1,0.3496,0.278983
2,0.1807,0.405355
3,0.0906,0.4867
4,0.0431,0.558505
5,0.0218,0.551405


In [9]:
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to probabilities
    logits = torch.tensor(logits)
    probs = logits.softmax(dim=-1)
    # Get predictions from probabilities
    predictions = probs.argmax(axis=-1)
    
    f1 = metrics.f1_score(labels, predictions, zero_division = 0, average='macro')       
    recall = metrics.recall_score(labels, predictions, zero_division = 0, average='macro')
    precision = metrics.precision_score(labels, predictions, zero_division = 0, average='macro')
    acc = metrics.accuracy_score(labels, predictions)
    
    probs = probs[:, 1].numpy()  # Get probabilities for the positive class
    auc = metrics.roc_auc_score(labels, probs)
    
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1, "auc": auc}

# Define Trainer with evaluation dataset
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Evaluation results:
eval_loss: 0.6295282244682312
eval_model_preparation_time: 0.0033
eval_accuracy: 0.9
eval_precision: 0.9002467105263158
eval_recall: 0.8988821138211383
eval_f1: 0.8994451011294983
eval_auc: 0.964180927091529
eval_runtime: 1.8834
eval_samples_per_second: 371.672
eval_steps_per_second: 46.724


In [10]:
import os
import shutil

# Directory where checkpoints are saved
checkpoint_dir = "/working/results"

# Find the latest checkpoint directory (usually named something like 'checkpoint-xxxx')
latest_checkpoint = max([os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir)], key=os.path.getmtime)

# Rename the checkpoint directory to a .ckpt file
checkpoint_name = "../models/distilbertimbau/model.ckpt"
shutil.make_archive(checkpoint_name, 'zip', latest_checkpoint)

'/kaggle/working/model.ckpt.zip'

In [11]:
# import zipfile

# # Unzip the .ckpt file
# with zipfile.ZipFile('/kaggle/working/model.ckpt.zip', 'r') as zip_ref:
#     zip_ref.extractall('/kaggle/working/extracted_checkpoint')

# # Load the model from the extracted directory
# from transformers import DistilBertForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/extracted_checkpoint')