In [None]:
! pip install transformers datasets torch scikit-learn

In [1]:
import torch

# Verifica se há uma GPU disponível
if torch.cuda.is_available():
    print(f"GPU disponível: {torch.cuda.get_device_name(0)}")
else:
    print("GPU não disponível, usando CPU.")

GPU disponível: NVIDIA GeForce RTX 4060 Laptop GPU


In [105]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm  # Para barra de progresso
from collections import Counter
import json

In [3]:
# 1. Carregar o dataset
# Aqui estamos usando o dataset de reviews do IMDB como exemplo.
dataset = load_dataset("imdb")



## brief exploratory analysis of the dataset

In [4]:
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [5]:
dataset["train"]["text"][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [6]:

labels = dataset["train"]["label"]
label_counts = Counter(labels)
print("Distribuição de classes:", label_counts)

Distribuição de classes: Counter({0: 12500, 1: 12500})


In [7]:
text_lengths = [len(text.split()) for text in dataset["train"]["text"]]
print(f"Comprimento médio: {sum(text_lengths) / len(text_lengths):.2f} palavras")

Comprimento médio: 233.79 palavras


In [8]:
max_length = max(len(text.split()) for text in dataset["train"]["text"])
print(f"Comprimento máximo: {max_length} palavras")

Comprimento máximo: 2470 palavras


In [9]:
empty_texts = [text for text in dataset["train"]["text"] if len(text.strip()) == 0]
print(f"Textos vazios: {len(empty_texts)}")

Textos vazios: 0


In [13]:
invalid_texts = [text for text in dataset["train"]["text"] if not any(char.isalpha() for char in text)]
print(f"Textos inválidos: {len(invalid_texts)}")

Textos inválidos: 0


## Instantiating the tokenizer

In [10]:
# 2. Carregar o tokenizer do BERT
# O tokenizer converte texto em tokens compatíveis com o modelo BERT.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [11]:
# Função para tokenizar os textos
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Aplicar a tokenização no dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [12]:
# 3. Preparar o dataset para o treinamento
# O Hugging Face exige colunas específicas: 'input_ids' e 'attention_mask'.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])  # Remove o texto original
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")  # Renomeia a coluna alvo
tokenized_datasets.set_format("torch")  # Converte para tensores



In [13]:
# Dividir em treino e validação
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # Exemplo reduzido
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))    # Exemplo reduzido



## Instantiating the model

In [14]:
# 4. Carregar o modelo pré-treinado
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# 5. Definir métricas para avaliação
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [16]:
# 6. Configurar o treinamento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10
)





In [None]:
! pip install accelerate>=0.26.0


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)



In [18]:
# 7. Treinar o modelo
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3276,0.35921,0.878,0.883365,0.833935,0.939024
2,0.251,0.379327,0.904,0.899582,0.926724,0.873984


TrainOutput(global_step=500, training_loss=0.2963378931283951, metrics={'train_runtime': 217.1611, 'train_samples_per_second': 18.42, 'train_steps_per_second': 2.302, 'total_flos': 1052444221440000.0, 'train_loss': 0.2963378931283951, 'epoch': 2.0})

In [19]:
# 8. Avaliar o modelo
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 0.3793271481990814, 'eval_accuracy': 0.904, 'eval_f1': 0.899581589958159, 'eval_precision': 0.9267241379310345, 'eval_recall': 0.8739837398373984, 'eval_runtime': 7.0195, 'eval_samples_per_second': 71.231, 'eval_steps_per_second': 8.975, 'epoch': 2.0}


In [24]:
# Certificar-se de que o modelo está na GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# 9. Fazer previsões com novos textos
test_texts = ["This movie was amazing!", "I hated this movie."]
test_encodings = tokenizer(test_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
output = model(**test_encodings)
predictions = output.logits.argmax(dim=1)
print("Predictions:", predictions)

In [26]:
print(f"Modelo no dispositivo: {next(model.parameters()).device}")


Modelo no dispositivo: cuda:0


In [29]:
# Preparar entradas
test_texts = ["This movie was amazing!", "I hated this movie.", "I don't think the movie is really good"]
inputs = tokenizer(test_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Mover os inputs para o mesmo dispositivo do modelo
inputs = {key: value.to(device) for key, value in inputs.items()}

In [30]:
# Fazer a inferência
outputs = model(**inputs)

# Obter as previsões
predictions = outputs.logits.argmax(dim=1)
print(predictions)

tensor([1, 0, 0], device='cuda:0')


## Question answering

In [31]:
from datasets import load_dataset

# Carregar dataset
squad = load_dataset("squad_v2")

# Visualizar exemplos
print(squad["train"][0])

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_data(examples):
    inputs = tokenizer(
        examples["question"], examples["context"], truncation=True, padding=True, max_length=512
    )
    return inputs

tokenized_squad = squad.map(preprocess_data, batched=True)

# Configurar o treinamento
training_args = TrainingArguments(
    output_dir="./results", evaluation_strategy="epoch", learning_rate=2e-5, num_train_epochs=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
)

trainer.train()

In [35]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0


In [36]:
question = "O que é BERT?"
context = "BERT é um modelo de linguagem desenvolvido pela Google, usado para tarefas de NLP."

result = qa_pipeline(question=question, context=context)
print(result)

{'score': 0.14506877958774567, 'start': 10, 'end': 29, 'answer': 'modelo de linguagem'}


### até aqui funcionando

## Bertimbau com question answering

In [None]:
! pip list

### traduzindo dataset

In [49]:
dataset = load_dataset("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [97]:
subset = dataset["train"].select(range(10))


In [98]:
print(subset[0])

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [95]:
dataset["train"]

for example in dataset["train"][0:10]:
    print(example)

id
title
context
question
answers


In [None]:
! pip install deep_translator

In [56]:
len(dataset["validation"])

10570

In [99]:
train_subset = dataset["train"].shuffle(seed=42).select(range(int(0.3 * len(dataset["train"]))))


In [None]:
import json
import time
from tqdm import tqdm
from deep_translator import GoogleTranslator
import numpy as np

# Definir a função ensure_serializable antes de usá-la
def ensure_serializable(obj):
    """
    Converte qualquer valor não serializável para tipos compatíveis com JSON.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()  # Converter ndarray para lista
    elif isinstance(obj, dict):
        return {key: ensure_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [ensure_serializable(value) for value in obj]
    else:
        return obj  # Retornar o valor original se já for serializável

# Função para traduzir texto com pausa
def translate_text_with_pause(text, source="en", target="pt"):
    time.sleep(1)  # Espera de 1 segundo entre as requisições
    return GoogleTranslator(source=source, target=target).translate(text)

# Traduzir o dataset com salvamento incremental
def translate_dataset(dataset, output_file="translated_dataset.json"):
    translated_examples = []

    # Carregar traduções parciais, se o arquivo já existir
    try:
        with open(output_file, "r") as file:
            content = file.read().strip()
            if content:
                translated_examples = json.loads(content)
                print(f"Carregadas {len(translated_examples)} traduções existentes.")
            else:
                print("Arquivo vazio encontrado. Começando do zero.")
    except FileNotFoundError:
        print("Arquivo de saída não encontrado. Criando um novo.")
        with open(output_file, "w", encoding="utf-8") as file:
            json.dump([], file)

    # Verificar progresso
    already_translated = len(translated_examples)
    dataset = dataset[already_translated:]  # Ignorar exemplos já traduzidos
    print(f"Traduzindo {len(dataset)} exemplos restantes.")

    # Barra de progresso
    for i, example in enumerate(tqdm(dataset, desc="Traduzindo", unit="exemplo")):
        try:
            # Traduzir contexto e pergunta
            context_translated = translate_text_with_pause(example["context"])
            question_translated = translate_text_with_pause(example["question"])

            # Traduzir todas as respostas
            answers_translated = {
                "text": [translate_text_with_pause(ans) for ans in example["answers"]["text"]],
                "answer_start": [int(start) for start in example["answers"]["answer_start"]],  # Converter para int
            }

            # Adicionar exemplo traduzido
            translated_example = {
                "context": context_translated,
                "question": question_translated,
                "answers": answers_translated,
            }

            # Garantir que todos os valores são serializáveis
            translated_example = ensure_serializable(translated_example)
            translated_examples.append(translated_example)

            # Salvar progresso no arquivo JSON
            with open(output_file, "w", encoding="utf-8") as file:
                json.dump(translated_examples, file, ensure_ascii=False, indent=4)

        except Exception as e:
            print(f"Erro ao traduzir exemplo {i}: {e}")
            continue  # Pula para o próximo exemplo

    print(f"Tradução concluída. Total traduzido: {len(translated_examples)} exemplos.")
    return translated_examples

# Exemplo de uso
# Criar subconjunto de treino
train_subset = dataset["train"].shuffle(seed=42).select(range(int(0.3 * len(dataset["train"]))))
train_subset = train_subset.to_pandas().to_dict(orient="records")
print("Total de entradas do dataset: ", len(train_subset))

# Traduzir o conjunto de treino com barra de progresso
translated_train = translate_dataset(train_subset, "translated_train.json")


val_subset = dataset["validation"].shuffle(seed=42).select(range(int(0.3 * len(dataset["validation"]))))
val_subset = val_subset.to_pandas().to_dict(orient="records")
# Traduzir o conjunto de validação
translated_validation = translate_dataset(val_subset)



In [None]:
from concurrent.futures import ThreadPoolExecutor
from deep_translator import GoogleTranslator
import json
from tqdm import tqdm
import numpy as np
from datasets import load_dataset

# Função para garantir que os valores são serializáveis
def ensure_serializable(obj):
    """
    Converte valores incompatíveis com JSON, como ndarrays, para tipos serializáveis.
    """
    if isinstance(obj, np.ndarray):  # Converte ndarray para lista
        return obj.tolist()
    elif isinstance(obj, dict):  # Converte dicionários recursivamente
        return {key: ensure_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):  # Converte listas recursivamente
        return [ensure_serializable(value) for value in obj]
    else:
        return obj  # Retorna o valor original se já for serializável

# Função para traduzir texto
def translate_text(text, source="en", target="pt"):
    try:
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception as e:
        print(f"Erro ao traduzir texto: {text}\n{e}")
        return text  # Retorna o texto original em caso de falha

# Função para tradução paralela de um conjunto de textos
def parallel_translate(texts, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        translations = list(tqdm(executor.map(translate_text, texts), total=len(texts), desc="Traduzindo textos"))
    return translations

# Função para traduzir o dataset
def translate_dataset(dataset, output_file="translated_dataset.json", max_workers=4):
    translated_examples = []

    # Carregar traduções parciais de forma robusta
    try:
        with open(output_file, "r", encoding="utf-8") as file:
            content = file.read().strip()  # Remover espaços em branco
            if content:  # Verifica se o arquivo não está vazio
                translated_examples = json.loads(content)
                print(f"Carregadas {len(translated_examples)} traduções existentes.")
            else:
                print("Arquivo vazio encontrado. Começando do zero.")
                translated_examples = []
    except (FileNotFoundError, json.JSONDecodeError):
        print("Arquivo inexistente ou corrompido. Começando do zero.")
        translated_examples = []

    # Verificar progresso
    already_translated = len(translated_examples)
    dataset = dataset[already_translated:]  # Ignorar exemplos já traduzidos
    print(f"Traduzindo {len(dataset)} exemplos restantes.")

    # Preparar os campos para tradução paralela
    contexts = [example["context"] for example in dataset]
    questions = [example["question"] for example in dataset]
    answers = [example["answers"]["text"] for example in dataset]

    # Traduzir paralelamente os campos
    print("Traduzindo contextos...")
    translated_contexts = parallel_translate(contexts, max_workers=max_workers)

    print("Traduzindo perguntas...")
    translated_questions = parallel_translate(questions, max_workers=max_workers)

    print("Traduzindo respostas...")
    translated_answers = [parallel_translate(answer, max_workers=max_workers) for answer in answers]

    # Reconstruir os exemplos traduzidos
    for i in range(len(dataset)):
        translated_example = {
            "context": translated_contexts[i],
            "question": translated_questions[i],
            "answers": {
                "text": translated_answers[i],
                "answer_start": ensure_serializable(dataset[i]["answers"]["answer_start"]),  # Serializável
            },
        }

        # Garantir que todo o exemplo é serializável
        translated_example = ensure_serializable(translated_example)
        translated_examples.append(translated_example)

        # Salvamento incremental
        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(translated_examples, file, ensure_ascii=False, indent=4)

    print(f"Tradução concluída. Total traduzido: {len(translated_examples)} exemplos.")
    return translated_examples

# Carregar o SQuAD
dataset = load_dataset("squad")

# Selecionar um subconjunto (opcional)
train_subset = dataset["train"].shuffle(seed=42).select(range(500))  # Traduzir 500 exemplos
train_subset = train_subset.to_pandas().to_dict(orient="records")

# Traduzir o conjunto de treino com paralelismo
translated_train = translate_dataset(train_subset, output_file="translated_train.json", max_workers=8)


In [None]:
from concurrent.futures import ThreadPoolExecutor
from deep_translator import GoogleTranslator
import json
from tqdm import tqdm
import numpy as np
import os
from datasets import load_dataset

# Função para garantir que os valores são serializáveis
def ensure_serializable(obj):
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: ensure_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [ensure_serializable(value) for value in obj]
    else:
        return obj

# Salvamento seguro
def safe_save_json(data, output_file):
    temp_file = output_file + ".tmp"
    with open(temp_file, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    os.replace(temp_file, output_file)

# Função para traduzir texto
def translate_text(text, source="en", target="pt"):
    try:
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception as e:
        print(f"Erro ao traduzir texto: {text}\n{e}")
        return text

# Tradução paralela
def parallel_translate(texts, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        translations = list(tqdm(executor.map(translate_text, texts), total=len(texts), desc="Traduzindo textos"))
    return translations

# Função principal de tradução
def translate_dataset_in_batches(dataset, output_file="translated_dataset.json", max_workers=4, batch_size=500, max_samples=5000):
    # Carregar traduções parciais
    try:
        with open(output_file, "r", encoding="utf-8") as file:
            content = file.read().strip()  # Verifica se o arquivo está vazio
            if content:
                translated_examples = json.loads(content)
                print(f"Carregadas {len(translated_examples)} traduções existentes.")
            else:
                print("Arquivo vazio encontrado. Começando do zero.")
                translated_examples = []
    except (FileNotFoundError, json.JSONDecodeError):
        print("Arquivo inexistente ou corrompido. Começando do zero.")
        translated_examples = []

    # Determinar quantos exemplos já foram traduzidos
    already_translated = len(translated_examples)
    if already_translated >= max_samples:
        print(f"Já foram traduzidos {already_translated} exemplos. Nenhuma tradução adicional necessária.")
        return translated_examples

    # Traduzir em lotes
    for start in range(already_translated, min(max_samples, len(dataset)), batch_size):
        end = min(start + batch_size, max_samples)
        print(f"Traduzindo exemplos {start} a {end}...")

        # Preparar lote atual
        batch = dataset[start:end]
        contexts = [example["context"] for example in batch]
        questions = [example["question"] for example in batch]
        answers = [example["answers"]["text"] for example in batch]

        # Traduzir os campos
        translated_contexts = parallel_translate(contexts, max_workers=max_workers)
        translated_questions = parallel_translate(questions, max_workers=max_workers)
        translated_answers = [parallel_translate(answer, max_workers=max_workers) for answer in answers]

        # Reconstruir exemplos traduzidos
        for i in range(len(batch)):
            translated_example = {
                "context": translated_contexts[i],
                "question": translated_questions[i],
                "answers": {
                    "text": translated_answers[i],
                    "answer_start": ensure_serializable(batch[i]["answers"]["answer_start"]),
                },
            }
            translated_example = ensure_serializable(translated_example)
            translated_examples.append(translated_example)

        # Salvamento incremental
        safe_save_json(translated_examples, output_file)

        print(f"Tradução de {end} exemplos concluída. Total traduzido até agora: {len(translated_examples)} exemplos.")

        # Verifica se o limite foi atingido
        if len(translated_examples) >= max_samples:
            break

    print(f"Tradução concluída. Total traduzido: {len(translated_examples)} exemplos.")
    return translated_examples

# Carregar o SQuAD
dataset = load_dataset("squad")["train"].to_pandas().to_dict(orient="records")  # Carregar como lista de dicionários

# Traduzir em lotes até atingir 5.000 exemplos
translated_train = translate_dataset_in_batches(
    dataset,
    output_file="translated_train.json",
    max_workers=8,
    batch_size=500,
    max_samples=5000
)


In [121]:
! ls

'=0.26.0'	    huggingread.txt	     LLM
 api_key.txt	    hugging.txt		     results
 bert.ipynb	    langchain_test.ipynb     translated_train.json
 first_test.ipynb   langchain_to_git.ipynb


In [141]:
import json

# Carregar o dataset traduzido
with open("translated_train.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Exemplo de estrutura de um dado
print(data[0])

{'context': 'O Pew Forum on Religion & Public Life classifica o Egito como o quinto pior país do mundo em liberdade religiosa. A Comissão dos Estados Unidos sobre Liberdade Religiosa Internacional, uma agência independente bipartidária do governo dos EUA, colocou o Egito em sua lista de observação de países que exigem monitoramento rigoroso devido à natureza e extensão das violações da liberdade religiosa praticadas ou toleradas pelo governo. De acordo com uma pesquisa Pew Global Attitudes de 2010, 84% dos egípcios entrevistados apoiaram a pena de morte para aqueles que abandonam o islamismo; 77% apoiaram chicotadas e cortes de mãos por roubo e furto; e 82% apoiam o apedrejamento de uma pessoa que comete adultério.', 'question': 'Qual a porcentagem de egípcios entrevistados que apoiam a pena de morte para aqueles que abandonam o islamismo?', 'answers': {'text': ['84%'], 'answer_start': [468]}}


In [142]:
from datasets import Dataset

# Converter para o formato do datasets
dataset = Dataset.from_dict({
    "context": [example["context"] for example in data],
    "question": [example["question"] for example in data],
    "answers": [example["answers"] for example in data],
})

# Dividir em treino e validação
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

print(f"Treino: {len(train_dataset)} exemplos, Validação: {len(val_dataset)} exemplos")


Treino: 4500 exemplos, Validação: 500 exemplos


### Tokenizador funcionando

In [143]:
from transformers import AutoTokenizer

# Carregar o tokenizador do BERTimbau
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def preprocess_function(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Ajustar os rótulos (start_positions, end_positions)
    offset_mapping = tokenized.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Verificar se o índice i é válido para answers
        if i >= len(answers):
            print(f"Exemplo com inconsistente: {examples}")
            start_positions.append(0)
            end_positions.append(0)
            continue

        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Resposta
        answer = answers[i]
        if not answer["text"] or not answer["answer_start"]:
            print(f"Resposta inválida em exemplo: {examples}")
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Identificar a posição dos tokens correspondentes
        token_start_index, token_end_index = 0, 0
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                token_start_index = idx
            if start < end_char <= end:
                token_end_index = idx

        # Caso a resposta esteja fora do contexto
        if start_char < offsets[0][0] or end_char > offsets[-1][1]:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_positions.append(token_start_index)
            end_positions.append(token_end_index)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized            


# Aplicar a tokenização
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub data rate exceeded.
The Jupyter

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



### Tokenizador não funcionando

In [None]:
def preprocess_function(example):
    # Tokenizar um único exemplo
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation="longest_first",  # Truncamento equilibrado
        max_length=512,             # Aumentar o limite de tokens
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Ajustar os rótulos (start_positions, end_positions)
    offset_mapping = tokenized.pop("offset_mapping")
    answer = example["answers"]

    # Validar e filtrar offsets inválidos
    valid_offsets = []
    for mapping in offset_mapping:
        # Garantir que mapping é uma tupla/lista com exatamente dois elementos
        if isinstance(mapping, (tuple, list)) and len(mapping) == 2:
            valid_offsets.append(mapping)
        else:
            valid_offsets.append((0, 0))  # Substituir valores inválidos por (0, 0)

    # Substituir o offset_mapping pelos valores validados
    offset_mapping = valid_offsets

    # Obter os índices de início e fim da resposta no contexto
    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])

    # Verificar se a resposta está dentro dos limites do contexto truncado
    first_token_start, _ = offset_mapping[0]
    _, last_token_end = offset_mapping[-1]

    if start_char < first_token_start or end_char > last_token_end:
        print(f"Ignorando exemplo: Resposta fora do contexto truncado.\nExemplo: {example}\n")
        return {}

    # Identificar os índices dos tokens correspondentes
    token_start_index = 0
    token_end_index = 0
    for idx, mapping in enumerate(offset_mapping):
        # Ignorar offsets inválidos como (0, 0)
        if mapping == (0, 0):
            continue

        start, end = mapping
        if start <= start_char < end:
            token_start_index = idx
        if start < end_char <= end:
            token_end_index = idx

    # Adicionar as posições calculadas
    tokenized["start_positions"] = token_start_index
    tokenized["end_positions"] = token_end_index

    return tokenized


# Aplicar tokenização
tokenized_train = train_dataset.map(preprocess_function, batched=False)
tokenized_val = val_dataset.map(preprocess_function, batched=False)


In [144]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Carregar o modelo pré-treinado
model = AutoModelForQuestionAnswering.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Configurar o treinamento
training_args = TrainingArguments(
    output_dir="./bertimbau-finetuned-qa",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,  # Use mixed precision se sua GPU suportar
)

# Criar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Treinar o modelo
trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.4722,0.000299
1000,0.0005,0.000161
1500,0.0003,0.000126


TrainOutput(global_step=1746, training_loss=0.1358893238955347, metrics={'train_runtime': 268.9523, 'train_samples_per_second': 51.868, 'train_steps_per_second': 6.492, 'total_flos': 2733817317350400.0, 'train_loss': 0.1358893238955347, 'epoch': 3.0})

In [145]:
model.save_pretrained("./bertimbau-finetuned-qa")
tokenizer.save_pretrained("./bertimbau-finetuned-qa")

('./bertimbau-finetuned-qa/tokenizer_config.json',
 './bertimbau-finetuned-qa/special_tokens_map.json',
 './bertimbau-finetuned-qa/vocab.txt',
 './bertimbau-finetuned-qa/added_tokens.json',
 './bertimbau-finetuned-qa/tokenizer.json')

In [146]:
# from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# # Caminho para o modelo treinado
# model_path = "caminho/para/seu/modelo"

# # Carregar o modelo e o tokenizador
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForQuestionAnswering.from_pretrained(model_path)

# Criar o pipeline para Question Answering
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [147]:
# Definir contexto e pergunta
context = """
Inicialmente, as autoridades não conseguiram entrar em contato com a Reserva Natural Nacional de Wolong, lar de cerca de 280 pandas gigantes.
No entanto, o Ministério das Relações Exteriores disse mais tarde que um grupo de 31 turistas britânicos que visitavam a Reserva de Pandas de Wolong
na área atingida pelo terremoto retornaram sãos e salvos para Chengdu. No entanto, o bem-estar de um número ainda maior de pandas nas reservas
vizinhas de pandas permaneceu desconhecido. Cinco guardas de segurança da reserva foram mortos pelo terremoto.
"""
question = "Quantos seguranças morreram na reserva?"

# Fazer a pergunta
result = qa_pipeline({"context": context, "question": question})

# Exibir a resposta
print(f"Resposta: {result['answer']}")
print(f"Pontuação: {result['score']}")

Resposta: .
No entanto
Pontuação: 6.368967311008722e-13




In [149]:
context = """
O panda desaparecido foi encontrado morto sob os escombros de um recinto.
Mao Mao, de nove anos, mãe de cinco filhos no centro de criação, foi descoberta na segunda-feira, seu corpo esmagado por uma parede em seu recinto.
"""
question = "Quantos anos tinha o panda Mao Mao?"

result = qa_pipeline({"context": context, "question": question})
print(f"Resposta: {result['answer']}")
print(f"Pontuação: {result['score']}")


Resposta: .
Pontuação: 6.054051032190755e-13


In [150]:
context = """
Cinco guardas de segurança da reserva foram mortos pelo terremoto.
"""
question = "Quantos seguranças morreram na reserva?"

result = qa_pipeline({"context": context, "question": question})
print(f"Resposta: {result['answer']}")
print(f"Pontuação: {result['score']}")

Resposta: .
Pontuação: 1.2422687661536869e-12


In [152]:
train_dataset[1]

{'context': 'Em 2015, Beyoncé assinou uma carta aberta para a qual a Campanha ONE estava coletando assinaturas; a carta foi endereçada a Angela Merkel e Nkosazana Dlamini-Zuma, pedindo que elas se concentrassem nas mulheres enquanto atuassem como chefes do G7 na Alemanha e da UA na África do Sul, respectivamente, que começarão a definir as prioridades no financiamento do desenvolvimento antes de uma cúpula principal da ONU em setembro de 2015, que estabelecerá novas metas de desenvolvimento para a geração.',
 'question': 'O que precisava ser definido no desenvolvimento do financiamento?',
 'answers': {'answer_start': [313], 'text': ['prioridades']}}

# Exemplo em português não funcionou bem. Traduzir mais exemplos.