This file contains a prelim BERT fine tuning implementation with the Adam algo for optimization (basically fancy gradient descent)

In [26]:
from sklearn.model_selection import train_test_split
import numpy as np
import requests
import json 

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

We load an example csv with "label" and "text" columns. Then it fine-tunes BERT on that data.

In [27]:
#df_data = pd.read_csv("test_text.csv")

In [28]:
df_data = pd.read_json("prelim-data.json")

X = df_data.drop(columns=['label'])
y = df_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

df_train = X_train.join(y_train)

X_test

Unnamed: 0,text
15,Eric Sheridan UBS Analyst Thanks for taking th...
9,Eric Sheridan Goldman Sachs Thank you very muc...
0,Brian Nowak Morgan Stanley Great thanks for ta...
8,Doug Anmuth JP Morgan Thanks for taking the qu...


0 corresponds to 0 (neutral). 1 corresponds to -1 (bad). 2 corresponds to 1 (good)

In [29]:
#All key inputs up here
num_labels = 3  # Number of labels (right now it's neutral 0, bad 1, good 2)
MAX_LENGTH = 128
batch_size = 7  # Number for minibatch training here
num_epochs = 30 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocessing the data
# Tokenize the text data
tokenized_texts = []
labels = []
for i, row in df_train.iterrows():
    tokenized_text = tokenizer.encode(row['text'], add_special_tokens=True, max_length=512, truncation=True)
    tokenized_texts.append(tokenized_text)
    labels.append(row['label'])

# Define the label mapping
label_map = {0: 0, -1: 1, 1: 2}

# Change labels to be consistent with label mapping above
labels = [label_map[label] for label in labels]

# Step 2: Create dataloader
input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
labels = torch.tensor(labels)

# Create dataloader
data = TensorDataset(input_ids, labels)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Step 3: Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device

# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=0.00003)

# Step 5: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Step 6: Define a prediction function
def predict(text):
    # Tokenize the input text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length = 512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    # Apparently turning off grad saves memory and computation
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
        
        # Calculate the probabilities using softmax
        probabilities = torch.softmax(logits, dim=-1).squeeze(0)
        
        # Get the predicted label
        predicted_label = torch.argmax(probabilities).item()
        
        # Return the predicted label and probabilities
        return probabilities, predicted_label

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/30, Average Training Loss: 1.1795010964075725
Epoch 2/30, Average Training Loss: 1.0549249450365703
Epoch 3/30, Average Training Loss: 0.9830617904663086
Epoch 4/30, Average Training Loss: 0.9564996163050333
Epoch 5/30, Average Training Loss: 0.8210522532463074
Epoch 6/30, Average Training Loss: 0.6882428328196207
Epoch 7/30, Average Training Loss: 0.600182056427002
Epoch 8/30, Average Training Loss: 0.44618914524714154
Epoch 9/30, Average Training Loss: 0.35177040100097656
Epoch 10/30, Average Training Loss: 0.27607104182243347
Epoch 11/30, Average Training Loss: 0.21334673464298248
Epoch 12/30, Average Training Loss: 0.16591536502043405
Epoch 13/30, Average Training Loss: 0.14265652745962143
Epoch 14/30, Average Training Loss: 0.10577250520388286
Epoch 15/30, Average Training Loss: 0.0890015537540118
Epoch 16/30, Average Training Loss: 0.06503093490997951
Epoch 17/30, Average Training Loss: 0.05720130602518717
Epoch 18/30, Average Training Loss: 0.04246301328142484
Epoch 19/3

In [30]:
# Assuming model is your trained model
# Initialize a list to store the predictions
predictions_label0 = []
predictions_label1 = []
predictions_label2 = []
predictions_predict = []
# Iterate over each observation in X_test

def getBERTScores(df):
    predictions_label0 = []
    predictions_label1 = []
    predictions_label2 = []
    predictions_predict = []

    for index, row in df.iterrows():
        predictions = predict(row["text"])
        
        # Append the prediction to the list
        #print(predictions)
        #print(predictions[0])
        predictions_label0.append(predictions[0][0])
        predictions_label1.append(predictions[0][1])
        predictions_label2.append(predictions[0][2])
        predictions_predict.append(predictions[1])

    # Add the predictions as a new feature to X_test
    rev_label_map = {0: 0, 1: -1, 2: 1}
    predictions_predict = [rev_label_map[label] for label in predictions_predict]

    df['neutral'] = predictions_label0
    df['bad'] = predictions_label1
    df['good'] = predictions_label2
    df["predict"] = predictions_predict


for index, row in X_test.iterrows():
    predictions = predict(row["text"])
    
    # Append the prediction to the list
    #print(predictions)
    #print(predictions[0])
    predictions_label0.append(predictions[0][0])
    predictions_label1.append(predictions[0][1])
    predictions_label2.append(predictions[0][2])
    predictions_predict.append(predictions[1])

# Add the predictions as a new feature to X_test
rev_label_map = {0: 0, 1: -1, 2: 1}
predictions_predict = [rev_label_map[label] for label in predictions_predict]

X_test['neutral'] = predictions_label0
X_test['bad'] = predictions_label1
X_test['good'] = predictions_label2
X_test["predict"] = predictions_predict


# Now X_test contains the original features along with the predicted labels as a new feature


In [31]:
print(X_test)
print(y_test)

(X_test["predict"] == y_test).mean()

                                                 text         neutral  \
15  Eric Sheridan UBS Analyst Thanks for taking th...  tensor(0.0759)   
9   Eric Sheridan Goldman Sachs Thank you very muc...  tensor(0.0499)   
0   Brian Nowak Morgan Stanley Great thanks for ta...  tensor(0.0840)   
8   Doug Anmuth JP Morgan Thanks for taking the qu...  tensor(0.0382)   

               bad            good  predict  
15  tensor(0.5981)  tensor(0.3260)       -1  
9   tensor(0.2932)  tensor(0.6569)        1  
0   tensor(0.7306)  tensor(0.1854)       -1  
8   tensor(0.1091)  tensor(0.8527)        1  
15    1
9     1
0    -1
8    -1
Name: label, dtype: int64


0.5

In [32]:
getBERTScores(X_train)

(X_train["predict"] == y_train).mean()

0.7368421052631579

In [33]:
getBERTScores(X_train)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=100)
LR.fit(X_train[["neutral", "good", "bad"]], y_train)
print(LR.score(X_train[["neutral", "good", "bad"]], y_train))
getBERTScores(X_test)
print(LR.score(X_test[["neutral", "good", "bad"]], y_test))

0.7894736842105263
0.5


In [34]:
most_frequent_items = y_train.value_counts()
most_frequent = most_frequent_items.head(1)
print(most_frequent/len(y_train))
most_frequent_items = y_test.value_counts()
most_frequent = most_frequent_items.head(1)
print(most_frequent/len(y_test))


1    0.526316
Name: label, dtype: float64
1    0.5
Name: label, dtype: float64
