<a href="https://colab.research.google.com/github/kgreed4/no_hate_transformer/blob/kgreed/bert_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

def train_model(model, train_loader, criterion, optimizer, num_epochs=2):
    train_losses = []
    train_accuracies = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        for inputs, attention_masks, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=attention_masks)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs.logits, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions

        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

    return model, train_losses, train_accuracies

# Load data
# Assuming df is your DataFrame with 'tweet' and 'class' columns

df = pd.read_csv('cleaned_data_sw.csv')

# Drop every column that isn't tweet or class
df = df.drop(df.columns.difference(['tweet', 'class']), axis=1)

# Convert labels to numeric
df['class'] = df['class'].astype(int)

print(df.iloc[16])

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize input texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors='tf')
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, return_tensors='tf')

print(len(train_encodings['input_ids']))
print(len(train_encodings['attention_mask']))
print(len(train_labels))

# Check data types of train_encodings
input_ids_type = type(train_encodings['input_ids'][0])
attention_mask_type = type(train_encodings['attention_mask'][0])
labels_type = type(train_labels[0])

print("Data type of input_ids:", input_ids_type)
print("Data type of attention_mask:", attention_mask_type)
print("Data type of labels:", labels_type)


# Create PyTorch datasets
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Fine-tune BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
model, train_losses, train_accuracies = train_model(model, train_loader, criterion, optimizer, num_epochs=2)


class                     0
tweet    bitch plz whatever
Name: 16, dtype: object


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


19826
19826
19826


NameError: name 'tf' is not defined