This file contains a prelim BERT fine tuning implementation with the Adam algo for optimization (basically fancy gradient descent)

In [112]:
from sklearn.model_selection import train_test_split
import numpy as np
import requests
import json 

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

We load an example csv with "label" and "text" columns. Then it fine-tunes BERT on that data.

In [113]:
#df_data = pd.read_csv("test_text.csv")

In [142]:
df_data = pd.read_json("prelim-data.json")

X = df_data.drop(columns=['label'])
y = df_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

df_train = X_train.join(y_train)



0 corresponds to 0 (neutral). 1 corresponds to -1 (bad). 2 corresponds to 1 (good)

In [143]:
#All key inputs up here
num_labels = 3  # Number of labels (right now it's neutral 0, bad 1, good 2)
MAX_LENGTH = 128
batch_size = 5  # Number for minibatch training here
num_epochs = 40 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocessing the data
# Tokenize the text data
tokenized_texts = []
labels = []
for i, row in df_train.iterrows():
    tokenized_text = tokenizer.encode(row['text'], add_special_tokens=True, max_length=512, truncation=True)
    tokenized_texts.append(tokenized_text)
    labels.append(row['label'])

# Define the label mapping
label_map = {0: 0, -1: 1, 1: 2}

# Change labels to be consistent with label mapping above
labels = [label_map[label] for label in labels]

# Step 2: Create dataloader
input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
labels = torch.tensor(labels)

# Create dataloader
data = TensorDataset(input_ids, labels)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Step 3: Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device

# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=0.00003)

# Step 5: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Step 6: Define a prediction function
def predict(text):
    # Tokenize the input text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length = 512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    # Apparently turning off grad saves memory and computation
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
        
        # Calculate the probabilities using softmax
        probabilities = torch.softmax(logits, dim=-1).squeeze(0)
        
        # Get the predicted label
        predicted_label = torch.argmax(probabilities).item()
        
        # Return the predicted label and probabilities
        return probabilities, predicted_label

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/40, Average Training Loss: 1.0919163227081299
Epoch 2/40, Average Training Loss: 1.0466260612010956
Epoch 3/40, Average Training Loss: 0.9444463849067688
Epoch 4/40, Average Training Loss: 0.9714152216911316
Epoch 5/40, Average Training Loss: 0.936492383480072
Epoch 6/40, Average Training Loss: 0.9065832197666168
Epoch 7/40, Average Training Loss: 0.8538501858711243
Epoch 8/40, Average Training Loss: 0.8783121705055237
Epoch 9/40, Average Training Loss: 0.8767186403274536
Epoch 10/40, Average Training Loss: 0.8046156167984009
Epoch 11/40, Average Training Loss: 0.6115048825740814
Epoch 12/40, Average Training Loss: 0.5217722654342651
Epoch 13/40, Average Training Loss: 0.42679503560066223
Epoch 14/40, Average Training Loss: 0.4116516262292862
Epoch 15/40, Average Training Loss: 0.3496115654706955
Epoch 16/40, Average Training Loss: 0.2995358556509018
Epoch 17/40, Average Training Loss: 0.25498688966035843
Epoch 18/40, Average Training Loss: 0.2298404909670353
Epoch 19/40, Avera

In [153]:
# Assuming model is your trained model
# Initialize a list to store the predictions
predictions_label0 = []
predictions_label1 = []
predictions_label2 = []
predictions_predict = []
# Iterate over each observation in X_test

def getBERTScores(df):
    predictions_label0 = []
    predictions_label1 = []
    predictions_label2 = []
    predictions_predict = []

    for index, row in df.iterrows():
        predictions = predict(row["text"])
        
        # Append the prediction to the list
        #print(predictions)
        #print(predictions[0])
        predictions_label0.append(predictions[0][0])
        predictions_label1.append(predictions[0][1])
        predictions_label2.append(predictions[0][2])
        predictions_predict.append(predictions[1])

    # Add the predictions as a new feature to X_test
    rev_label_map = {0: 0, 1: -1, 2: 1}
    predictions_predict = [rev_label_map[label] for label in predictions_predict]

    df['neutral'] = predictions_label0
    df['bad'] = predictions_label1
    df['good'] = predictions_label2
    df["predict"] = predictions_predict


for index, row in X_test.iterrows():
    predictions = predict(row["text"])
    
    # Append the prediction to the list
    #print(predictions)
    #print(predictions[0])
    predictions_label0.append(predictions[0][0])
    predictions_label1.append(predictions[0][1])
    predictions_label2.append(predictions[0][2])
    predictions_predict.append(predictions[1])

# Add the predictions as a new feature to X_test
rev_label_map = {0: 0, 1: -1, 2: 1}
predictions_predict = [rev_label_map[label] for label in predictions_predict]

X_test['neutral'] = predictions_label0
X_test['bad'] = predictions_label1
X_test['good'] = predictions_label2
X_test["predict"] = predictions_predict


# Now X_test contains the original features along with the predicted labels as a new feature


In [150]:
print(X_test)
print(y_test)

(X_test["predict"] == y_test).mean()

                                                 text         neutral  \
10  Brian Nowak Morgan Stanley Thanks for taking m...  tensor(0.0160)   
9   Eric Sheridan Goldman Sachs Thank you very muc...  tensor(0.0134)   
0   Brian Nowak Morgan Stanley Great thanks for ta...  tensor(0.0203)   

               bad            good  predict  
10  tensor(0.8972)  tensor(0.0868)       -1  
9   tensor(0.9387)  tensor(0.0479)       -1  
0   tensor(0.9457)  tensor(0.0340)       -1  
10    1
9     1
0    -1
Name: label, dtype: int64


0.3333333333333333

In [165]:
print(X_train)
print(y_train)

(X_train["predict"] == y_train).mean()

                                                 text         neutral  \
8   Doug Anmuth JP Morgan Thanks for taking the qu...  tensor(0.0107)   
5   Eric Sheridan Goldman Sachs Thank you very muc...  tensor(0.0210)   
2   Brian Nowak Morgan Stanley Thanks for taking m...  tensor(0.0248)   
1   Brian Nowak Morgan Stanley Great Thanks for ta...  tensor(0.0606)   
11  Brian Nowak Morgan Stanley Thanks for taking m...  tensor(0.0169)   
4   Brian Nowak Morgan Stanley Thanks for taking m...  tensor(0.0132)   
7   Brian Nowak Morgan Stanley Thanks for taking m...  tensor(0.0137)   
3   Brian Nowak Morgan Stanley Great Thanks for ta...  tensor(0.0159)   
6   Doug Anmuth JP Morgan Great Thank you Hopefull...  tensor(0.0185)   

               bad            good  predict  
8   tensor(0.9441)  tensor(0.0452)       -1  
5   tensor(0.8388)  tensor(0.1402)       -1  
2   tensor(0.5846)  tensor(0.3907)       -1  
1   tensor(0.7574)  tensor(0.1820)       -1  
11  tensor(0.4634)  tensor(0.5197)     

0.5555555555555556

In [161]:
getBERTScores(X_train)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=100)
print(X_train["neutral"])
LR.fit(X_train[["neutral", "good", "bad"]], y_train)
print(LR.score(X_train[["neutral", "good", "bad"]], y_train))
getBERTScores(X_test)
print(LR.score(X_test[["neutral", "good", "bad"]], y_test))

8     tensor(0.0107)
5     tensor(0.0210)
2     tensor(0.0248)
1     tensor(0.0606)
11    tensor(0.0169)
4     tensor(0.0132)
7     tensor(0.0137)
3     tensor(0.0159)
6     tensor(0.0185)
Name: neutral, dtype: object
0.7777777777777778
0.3333333333333333
