This file contains a prelim BERT fine tuning implementation with the Adam algo for optimization (basically fancy gradient descent)

In [33]:
from sklearn.model_selection import train_test_split
import numpy as np
import requests
import json 

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

We load an example csv with "label" and "text" columns. Then it fine-tunes BERT on that data.

In [34]:
df_data = pd.read_csv("test_text.csv")

0 corresponds to 0 (neutral). 1 corresponds to -1 (bad). 2 corresponds to 1 (good)

In [37]:
#All key inputs up here
num_labels = 3  # Number of labels (right now it's neutral 0, bad 1, good 2)
MAX_LENGTH = 128
batch_size = 10 # Number for minibatch training here
num_epochs = 3 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocessing the data
# Tokenize the text data
tokenized_texts = []
labels = []
for i, row in df_data.iterrows():
    tokenized_text = tokenizer.encode(row['text'], add_special_tokens=True, max_length=512, truncation=True)
    tokenized_texts.append(tokenized_text)
    labels.append(row['label'])

# Define the label mapping
label_map = {0: 0, -1: 1, 1: 2}

# Change labels to be consistent with label mapping above
labels = [label_map[label] for label in labels]

# Step 2: Create dataloader

input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
labels = torch.tensor(labels)

# Create dataloader

data = TensorDataset(input_ids, labels)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Step 3: Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device

# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=0.001)

# Step 5: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Step 6: Define a prediction function
def predict(text):
    # Tokenize the input text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    # Apparently turning off grad saves memory and computation
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
        
        # Calculate the probabilities using softmax
        probabilities = torch.softmax(logits, dim=-1).squeeze(0)
        
        # Get the predicted label
        predicted_label = torch.argmax(probabilities).item()
        
        # Return the predicted label and its probability
        return predicted_label, probabilities[predicted_label].item()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average Training Loss: 0.9035760164260864
Epoch 2/3, Average Training Loss: 2.1034080386161804
Epoch 3/3, Average Training Loss: 1.3249844312667847


In [38]:
# Example usage:
text = "terrible awful unexpected gurdns" #used fake word gurdns in -1 labeled training data to make sure training is actually working
predicted_label, probability = predict(text)
print(f"Predicted Label: {predicted_label}, Probability: {probability}")

Predicted Label: 1, Probability: 0.6080817580223083
