In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
import requests
import json
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel
from joblib import dump, load
from sklearn.model_selection import train_test_split

We load a csv with "label" and "text" columns. Then it fine-tunes BERT on that data.

In [16]:
df_data = pd.read_csv("GOOG_transcripts.csv")

In [17]:
df_data.columns = ["text", "label"]
df_data = df_data.dropna()
X = df_data.drop(columns=['label'])
y = df_data['label']

# Split the data into train, dev, and test sets (80-10-10 split)
X_train, X_devtest, y_train, y_devtest = train_test_split(X, y, test_size=0.2)

X_dev, X_test, y_dev, y_test = train_test_split(X_devtest, y_devtest, test_size=0.5)

df_train = X_train.join(y_train)
df_dev = X_dev.join(y_dev)
df_test = X_test.join(y_test)

(363, 363)

In [50]:
#Saving the created sub-datasets
df_train.to_csv('GOOG_final_train_set.csv', index=False)
df_dev.to_csv('GOOG_final_dev_set.csv', index=False)
df_test.to_csv('GOOG_final_test_set.csv', index=False)

0 corresponds to 0 (neutral). 1 corresponds to -1 (bad). 2 corresponds to 1 (good)

In [18]:
#All key inputs up here
num_labels = 3  # Number of labels (neutral 0, bad 1, good 2)
MAX_LENGTH = 128
batch_size = 5  # Number for minibatch training here
num_epochs = 20 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [19]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device

def data_preprocess(df):
    # Preprocessing the data
    # Tokenize the text data
    tokenized_texts = []
    labels = []
    for i, row in df.iterrows():
        tokenized_text = tokenizer.encode(row['text'], add_special_tokens=True, max_length=512, truncation=True)
        tokenized_texts.append(tokenized_text)
        labels.append(row['label'])

    # Define the label mapping
    label_map = {0: 0, -1: 1, 1: 2}

    # Change labels to be consistent with label mapping above
    labels = [label_map[label] for label in labels]

    input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
    labels = torch.tensor(labels)
    # Output data that's ready to be put into a dataloader
    data = TensorDataset(input_ids, labels)
    return data

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Training loop
loss_vec = []
def train_model(lr, batch_size, num_epochs, data):
    dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=lr) # Define the optimizer
    avg_train_loss = None
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch

            optimizer.zero_grad()
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")
    loss_vec.append(avg_train_loss)

# Prediction function
def predict(text):
    # Tokenize text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
    
    # Calculate the probabilities using softmax
    probabilities = torch.softmax(logits, dim=-1).squeeze(0)
    
    # Get the predicted label
    predicted_label = torch.argmax(probabilities).item()
    
    # Return the predicted label and probabilities
    return probabilities, predicted_label

In [21]:
#Below is our dev loop: finding the best hyperparameters to use for the training loop
data = data_preprocess(df_dev)
lr_vec = []
batch_size_vec = []
for i in range(1,6):
    print(f"Learning Rate: {i*1e-5}")
    for j in range (1,4):
        print(f"Batch Size: {j*16}")
        lr_vec.append(i*1e-5)
        batch_size_vec.append(j*16)
        train_model(lr=i*1e-5, batch_size=j*16, num_epochs=3, data=data)
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Learning Rate: 1e-05
Batch Size: 16




Epoch 1/3, Average Training Loss: 1.0973673877508745
Epoch 2/3, Average Training Loss: 1.0841116671976836
Epoch 3/3, Average Training Loss: 1.0827311510625093


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 32
Epoch 1/3, Average Training Loss: 1.0848345359166462
Epoch 2/3, Average Training Loss: 1.068823218345642
Epoch 3/3, Average Training Loss: 1.052112028002739


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 48
Epoch 1/3, Average Training Loss: 1.0955205708742142
Epoch 2/3, Average Training Loss: 1.0713668167591095
Epoch 3/3, Average Training Loss: 1.0669596642255783


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 2e-05
Batch Size: 16
Epoch 1/3, Average Training Loss: 1.0927387527797534
Epoch 2/3, Average Training Loss: 1.080745132073112
Epoch 3/3, Average Training Loss: 1.0476503735003264


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 32
Epoch 1/3, Average Training Loss: 1.1033319632212322
Epoch 2/3, Average Training Loss: 1.0801902115345001
Epoch 3/3, Average Training Loss: 1.089866171280543


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 48
Epoch 1/3, Average Training Loss: 1.1004212945699692
Epoch 2/3, Average Training Loss: 1.0750447809696198
Epoch 3/3, Average Training Loss: 1.0720806121826172


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 3.0000000000000004e-05
Batch Size: 16
Epoch 1/3, Average Training Loss: 1.1241115798120913
Epoch 2/3, Average Training Loss: 1.0773768450902856
Epoch 3/3, Average Training Loss: 1.0681819760281106


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 32
Epoch 1/3, Average Training Loss: 1.0841793020566304
Epoch 2/3, Average Training Loss: 1.0966764986515045
Epoch 3/3, Average Training Loss: 1.0867029031117756


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 48
Epoch 1/3, Average Training Loss: 1.0827585011720657
Epoch 2/3, Average Training Loss: 1.0672740638256073
Epoch 3/3, Average Training Loss: 1.0567666888237


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 4e-05
Batch Size: 16
Epoch 1/3, Average Training Loss: 1.1033117745233618
Epoch 2/3, Average Training Loss: 1.0838865845099739
Epoch 3/3, Average Training Loss: 1.082493932350822


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 32
Epoch 1/3, Average Training Loss: 1.098680466413498
Epoch 2/3, Average Training Loss: 1.1126365264256795
Epoch 3/3, Average Training Loss: 1.0829573074976604


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 48
Epoch 1/3, Average Training Loss: 1.1407432854175568
Epoch 2/3, Average Training Loss: 1.0871445536613464
Epoch 3/3, Average Training Loss: 1.0827119946479797


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 5e-05
Batch Size: 16
Epoch 1/3, Average Training Loss: 1.0859093769736912
Epoch 2/3, Average Training Loss: 1.0805319625398386
Epoch 3/3, Average Training Loss: 1.05078619459401


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 32
Epoch 1/3, Average Training Loss: 1.119448572397232
Epoch 2/3, Average Training Loss: 1.097564309835434
Epoch 3/3, Average Training Loss: 1.078416183590889


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size: 48
Epoch 1/3, Average Training Loss: 1.1220953315496445
Epoch 2/3, Average Training Loss: 1.0897968709468842
Epoch 3/3, Average Training Loss: 1.0725366324186325


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# We find the minimum final epoch loss. Then we set best_lr and best_bsize to the learning rate and batch sizes that correspond to that minimum loss
best_lr = lr_vec[loss_vec.index(min(loss_vec))]
best_bsize = batch_size_vec[loss_vec.index(min(loss_vec))]

In [24]:
# Training the model
print(f"Lowest loss, best learning rate, batch size {min(loss_vec), best_lr, best_bsize}")
data = data_preprocess(df_train)
train_model(lr=best_lr, batch_size=best_bsize, num_epochs=10, data=data)

Lowest loss, best learning rate, batch size (1.0476503735003264, 2e-05, 16)
Epoch 1/10, Average Training Loss: 1.0755329941000258
Epoch 2/10, Average Training Loss: 1.0543964125297882
Epoch 3/10, Average Training Loss: 1.0258540717455058
Epoch 4/10, Average Training Loss: 0.9773261514338818
Epoch 5/10, Average Training Loss: 0.8816695282092462
Epoch 6/10, Average Training Loss: 0.7084242532868962
Epoch 7/10, Average Training Loss: 0.5140244630830628
Epoch 8/10, Average Training Loss: 0.36389172916392704
Epoch 9/10, Average Training Loss: 0.23798746844896904
Epoch 10/10, Average Training Loss: 0.21808725665067577


In [25]:
# Optional code to save training model
dump(model, 'bert_train.joblib')

['bert_train.joblib']

In [26]:
#Commented out: code to load an already trained model
#model = load('bert_train.joblib')

In [27]:
# Initialize lists to store the predictions
predictions_label0 = []
predictions_label1 = []
predictions_label2 = []
predictions_predict = []
# Iterate over each observation in input df
def getBERTScores(df):
    predictions_label0 = []
    predictions_label1 = []
    predictions_label2 = []
    predictions_predict = []

    for _, row in df.iterrows():
        predictions = predict(row["text"])
        
        # Append the prediction to the list
        predictions_label0.append(predictions[0][0])
        predictions_label1.append(predictions[0][1])
        predictions_label2.append(predictions[0][2])
        predictions_predict.append(predictions[1])

    # Add the predictions as a new feature to X_test
    rev_label_map = {0: 0, 1: -1, 2: 1}
    predictions_predict = [rev_label_map[label] for label in predictions_predict]

    df['neutral'] = predictions_label0
    df['bad'] = predictions_label1
    df['good'] = predictions_label2
    df["predict"] = predictions_predict

getBERTScores(X_test)
# Now X_test contains the original features along with the predicted labels as a new feature


In [41]:
#Find our overall accuracy
(X_test["predict"] == y_test).mean()

0.4793388429752066

In [29]:
getBERTScores(X_train)
getBERTScores(X_test)

0.7144827586206897

In [47]:
print(f"Training Accuracy: {(X_train['predict'] == y_train).mean()}")
print(f"Testing Accuracy: {(X_test['predict'] == y_test).mean()}")

Training Accuracy: 0.7144827586206897
Testing Accuracy: 0.4793388429752066


In [42]:
most_frequent_items = y_train.value_counts()
most_frequent = most_frequent_items.head(1)
print(f"Benchmark for Train Set {most_frequent/len(y_train)}")
most_frequent_items = y_test.value_counts()
most_frequent = most_frequent_items.head(1)
print(f"Benchmark for Test Set {most_frequent/len(y_test)}")

Benchmark for Train Set 0    0.433448
Name: label, dtype: float64
Benchmark for Test Set 1    0.399449
Name: label, dtype: float64


In [48]:
print("Proportion of test data labeled 0:", y_test[y_test == 0].shape[0] / y_test.shape[0])
print("Proportion of test data labeled 1:", y_test[y_test == 1].shape[0] / y_test.shape[0])
print("Proportion of test data labeled -1:", y_test[y_test == -1].shape[0] / y_test.shape[0])
print(f"Number of test observations: {y_test.shape[0]}")

print("Proportion of training data labeled 0:", y_train[y_train == 0].shape[0] / y_train.shape[0])
print("Proportion of training data labeled 1:", y_train[y_train == 1].shape[0] / y_train.shape[0])
print("Proportion of training data labeled -1:", y_train[y_train == -1].shape[0] / y_train.shape[0])
print(f"Number of training observations: {y_train.shape[0]}")

Proportion of test data labeled 0: 0.39118457300275483
Proportion of test data labeled 1: 0.39944903581267216
Proportion of test data labeled -1: 0.209366391184573
Number of test observations: 363
Proportion of training data labeled 0: 0.43344827586206897
Proportion of training data labeled 1: 0.3510344827586207
Proportion of training data labeled -1: 0.21551724137931033
Number of training observations: 2900


In [49]:
train_predict = X_train['predict']
test_predict = X_test['predict']

print("Proportion of test data labeled 0:", test_predict[test_predict == 0].shape[0] / test_predict.shape[0])
print("Proportion of test data labeled 1:", test_predict[test_predict == 1].shape[0] / test_predict.shape[0])
print("Proportion of test data labeled -1:", test_predict[test_predict == -1].shape[0] / test_predict.shape[0])
print(f"Number of test observations: {test_predict.shape[0]}")

print("Proportion of training data labeled 0:", train_predict[train_predict == 0].shape[0] / train_predict.shape[0])
print("Proportion of training data labeled 1:", train_predict[train_predict == 1].shape[0] / train_predict.shape[0])
print("Proportion of training data labeled -1:", train_predict[train_predict == -1].shape[0] / train_predict.shape[0])
print(f"Number of training observations: {train_predict.shape[0]}")

Proportion of test data labeled 0: 0.8484848484848485
Proportion of test data labeled 1: 0.11570247933884298
Proportion of test data labeled -1: 0.03581267217630854
Number of test observations: 363
Proportion of training data labeled 0: 0.6913793103448276
Proportion of training data labeled 1: 0.21551724137931033
Proportion of training data labeled -1: 0.09310344827586207
Number of training observations: 2900


In [76]:
df_AAPL = pd.read_csv("AAPL_transcripts.csv")
df_AAPL.columns = ["text", "label"]
df_AAPL = df_AAPL.dropna()

#df_AAPL_sample = df_AAPL #df_AAPL.sample(frac=0.3)
#df_AAPL_sample.shape[0]

In [77]:
getBERTScores(df_AAPL)

In [78]:
AAPL_predict = df_AAPL['predict']

print("Proportion of AAPL data labeled 0:", AAPL_predict[AAPL_predict == 0].shape[0] / AAPL_predict.shape[0])
print("Proportion of AAPL data labeled 1:", AAPL_predict[AAPL_predict == 1].shape[0] / AAPL_predict.shape[0])
print("Proportion of AAPL data labeled -1:", AAPL_predict[AAPL_predict == -1].shape[0] / AAPL_predict.shape[0])
print(f"Number of AAPL observations: {AAPL_predict.shape[0]}")

print(f"AAPL Accuracy: {(df_AAPL['predict'] == df_AAPL['label']).mean()}")

most_frequent_items = df_AAPL['label'].value_counts()
most_frequent = most_frequent_items.head(1)
print(f"Benchmark for Test Set {most_frequent/len(df_AAPL['label'])}")

Proportion of AAPL data labeled 0: 0.8379863706309079
Proportion of AAPL data labeled 1: 0.12398329303143547
Proportion of AAPL data labeled -1: 0.03803033633765663
Number of AAPL observations: 4549
AAPL Accuracy: 0.43635963948120465
Benchmark for Test Set 0    0.43636
Name: label, dtype: float64


In [83]:
df_MSFT = pd.read_csv("MSFT_transcripts.csv")
df_MSFT.columns = ["text", "label"]
df_MSFT = df_MSFT.dropna()

#df_AAPL_sample = df_AAPL #df_AAPL.sample(frac=0.3)
#df_AAPL_sample.shape[0]
getBERTScores(df_MSFT)

MSFT_predict = df_MSFT['predict']

In [84]:
print("Proportion of MSFT data with predicted label 0:", MSFT_predict[MSFT_predict == 0].shape[0] / MSFT_predict.shape[0])
print("Proportion of MSFT data with predicted label 1:", MSFT_predict[MSFT_predict == 1].shape[0] / MSFT_predict.shape[0])
print("Proportion of MSFT data with predicted label -1:", MSFT_predict[MSFT_predict == -1].shape[0] / MSFT_predict.shape[0])
print(f"Number of MSFT observations: {MSFT_predict.shape[0]}")

print(f"MSFT Accuracy: {(df_MSFT['predict'] == df_MSFT['label']).mean()}")

most_frequent_items = df_MSFT['label'].value_counts()
most_frequent = most_frequent_items.head(1)
print(f"Benchmark for Test Set {most_frequent/len(df_MSFT['label'])}")

print("Proportion of MSFT data actually labeled 0:", df_MSFT[df_MSFT["label"] == 0].shape[0] / df_MSFT.shape[0])
print("Proportion of MSFT data actually labeled 1:", df_MSFT[df_MSFT["label"] == 1].shape[0] / df_MSFT.shape[0])
print("Proportion of MSFT data actually labeled -1:", df_MSFT[df_MSFT["label"] == -1].shape[0] / df_MSFT.shape[0])
print(df_MSFT.shape[0])

Proportion of MSFT data with predicted label 0: 0.8280318091451292
Proportion of MSFT data with predicted label 1: 0.13286944996686548
Proportion of MSFT data with predicted label -1: 0.0390987408880053
Number of MSFT observations: 3018
MSFT Accuracy: 0.4188204108681246
Benchmark for Test Set 0    0.429755
Name: label, dtype: float64
Proportion of MSFT data actually labeled 0: 0.42975480450629555
Proportion of MSFT data actually labeled 1: 0.40059642147117297
Proportion of MSFT data actually labeled -1: 0.16964877402253148
3018
