# Aplicando BERT para Detecção de Bots do Twitter

# 1. Carregando os Dados do Google Drive


In [4]:
from google.colab import drive
import pandas as pd

# Monta o Google Drive para carregar o arquivo
drive.mount('/content/drive')

# Carrega o arquivo CSV
df = pd.read_csv('/content/drive/MyDrive/Atividades M11 DADOS/bot_detection_data.csv')

# Verifica as primeiras linhas do dataset
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


# 2. Preparando os Dados para o BERT

In [5]:
from transformers import BertTokenizer

# Carrega o tokenizer do BERT pré-treinado
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Seleciona os textos (coluna 'Tweet') e rótulos (coluna 'Bot Label')
texts = df['Tweet'].values
labels = df['Bot Label'].values

# Tokeniza os textos com padding e truncamento
encoded_inputs = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Obtém os tokens e as máscaras de atenção
input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']

# Converte os rótulos para tensores
import torch
labels = torch.tensor(labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# 3. Dividindo os Dados em Conjuntos de Treinamento e Teste

In [6]:
from sklearn.model_selection import train_test_split

# Divide os dados em 80% treino e 20% teste
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, test_masks, _, _ = train_test_split(attention_masks, labels, test_size=0.2, random_state=42)


# 4. Criando DataLoaders

In [7]:
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# Cria datasets para treino e teste
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# DataLoader para pegar os dados em lotes
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=32)


# 5. Carregando o Modelo BERT

In [8]:
from transformers import BertForSequenceClassification

# Carrega o modelo BERT com uma camada de classificação para 2 classes
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Tarefa binária (bot ou não bot)
    output_attentions=False,
    output_hidden_states=False
)

# Move o modelo para a GPU, se disponível
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# 6. Configurando o Otimizador e a Função de Perda

In [9]:
from transformers import AdamW

# Configura o otimizador AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)




# 7. Treinando o Modelo

In [None]:
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

# Função de perda
loss_fn = CrossEntropyLoss()

# Número de épocas
epochs = 2

# Otimizador e scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Configuração de precision mista para acelerar
scaler = torch.cuda.amp.GradScaler()

# Congelando as primeiras camadas do BERT para acelerar
for param in model.bert.encoder.layer[:8].parameters():
    param.requires_grad = False

# Loop de treinamento
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Zera os gradientes
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss

        # Acumula a perda
        total_loss += loss.item()

        # Backpropagation com precision mista
        scaler.scale(loss).backward()

        # Atualiza os parâmetros
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_dataloader)}")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
 57%|█████▋    | 707/1250 [1:01:17<49:10,  5.43s/it]

# 8. Avaliando o Modelo

In [None]:
model.eval()  # Coloca o modelo no modo de avaliação
correct = 0
total = 0

for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    correct += (predictions == b_labels).sum().item()
    total += b_labels.size(0)

accuracy = correct / total
print(f"Acurácia no conjunto de teste: {accuracy * 100:.2f}%")
