In [12]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
import json 
import torch

In [13]:
%load_ext autoreload
%autoreload 2
from functions import get_tickers, get_stock_dict, get_companies_by_50
QUARTERS = ["Q1", "Q2", "Q3", "Q4"]
YEARS = [str(2005 + i) for i in range(18)]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
tickers = get_tickers()
dict_of_df = get_stock_dict()
t_50, t_100, t_150, t_200, t_250 = get_companies_by_50()

In [30]:
final_texts = []
final_labels = []
bad_starts = ["Operator:", "TRANSCRIPT", "Operator", "Related:", "Executives"]

#Iterates through the first 300 companies
for i, tick in enumerate(tickers[:300]):
    #print(i)
    #Check to select the correct company transcript files ("t_50" variable)
    if i < 50:
        #Get company price and transcript date
        company = t_50[tick]
        company_price_df = dict_of_df[tick]
        #Transform Date column so that the time of day is not included for price data (makes look up easier)
        company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
        #Iterate through years
        for year in YEARS:
            #Check to see if a earnings call has been reported for the given year
            if len(company[year].keys()) != 0:
                #Iterate through all the quarters that have a released earnings call
                for quarter in company[year].keys():
                    #Grabs date and transcript of the earnings call
                    date = company[year][quarter]["date"].split()
                    transcript = company[year][quarter]["transcript"]

                    #Checks to see when the call is released
                    #If the call is in the middle of the day we want the close price of the previous day
                    #If call is after hours we use the close price of that day
                    time = date[1].split(":")
                    date_0 = 0
                    if int(time[0] + time[1]) < 1600:
                        date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                        #print("first", company_price_df["Date"], date[0])
                    else:
                        date_0 = company_price_df["Date"] == date[0]
                        #print("second", sum(date_0))
                    if sum(date_0) !=0:
                        #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                        date_20 = date_0.shift(20, fill_value = False)
                        date_60 = date_0.shift(60, fill_value = False)
                        data_on_dates = company_price_df[date_0 + date_20 + date_60]
                        close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                        #Calculates the one month and three month price change
                        one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                        three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                        #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                        label = 0
                        if one_month_change > 2 and three_month_change > 4:
                            label = 1
                        elif one_month_change < -2 and three_month_change < -4:
                            label = -1

                        sentences = transcript.split("\n")
                        for sentence in sentences:
                            check = sentence.split()
                            check1 = sentence.split(":")
                            if len(check) !=0 and len(check) < 513:
                                if check[0] not in bad_starts and len(check1) == 2:
                                    #print(sentence.split(":"))
                                    final_texts.append(sentence.split(":")[1])
                                    final_labels.append(label)
    """elif i < 100 and i >= 50:
        #Get company price and transcript date
        company = t_100[tick]
        company_price_df = dict_of_df[tick]
        #Transform Date column so that the time of day is not included for price data (makes look up easier)
        company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
        #Iterate through years
        for year in YEARS:
            #Check to see if a earnings call has been reported for the given year
            if len(company[year].keys()) != 0:
                #Iterate through all the quarters that have a released earnings call
                for quarter in company[year].keys():
                    #Grabs date and transcript of the earnings call
                    date = company[year][quarter]["date"].split()
                    transcript = company[year][quarter]["transcript"]

                    #Checks to see when the call is released
                    #If the call is in the middle of the day we want the close price of the previous day
                    #If call is after hours we use the close price of that day
                    time = date[1].split(":")
                    date_0 = 0
                    if int(time[0] + time[1]) < 1600:
                        date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                        #print("first", company_price_df["Date"], date[0])
                    else:
                        date_0 = company_price_df["Date"] == date[0]
                        #print("second", sum(date_0))
                    if sum(date_0) !=0:
                        #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                        date_20 = date_0.shift(20, fill_value = False)
                        date_60 = date_0.shift(60, fill_value = False)
                        data_on_dates = company_price_df[date_0 + date_20 + date_60]
                        close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                        #Calculates the one month and three month price change
                        one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                        three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                        #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                        label = 0
                        if one_month_change > 5 and three_month_change > 10:
                            label = 1
                        elif one_month_change < -5 and three_month_change < -10:
                            label = -1

                        sentences = transcript.split("\n")
                        for sentence in sentences:
                            check = sentence.split()
                            check1 = sentence.split(":")
                            if len(check) !=0 and len(check) < 513:
                                if check[0] not in bad_starts and len(check1) == 2:
                                    #print(sentence.split(":"))
                                    final_texts.append(sentence.split(":")[1])
                                    final_labels.append(label)
    elif i < 150 and i >= 100:
        #Get company price and transcript date
        company = t_150[tick]
        company_price_df = dict_of_df[tick]
        #Transform Date column so that the time of day is not included for price data (makes look up easier)
        company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
        #Iterate through years
        for year in YEARS:
            #Check to see if a earnings call has been reported for the given year
            if len(company[year].keys()) != 0:
                #Iterate through all the quarters that have a released earnings call
                for quarter in company[year].keys():
                    #Grabs date and transcript of the earnings call
                    date = company[year][quarter]["date"].split()
                    transcript = company[year][quarter]["transcript"]

                    #Checks to see when the call is released
                    #If the call is in the middle of the day we want the close price of the previous day
                    #If call is after hours we use the close price of that day
                    time = date[1].split(":")
                    date_0 = 0
                    if int(time[0] + time[1]) < 1600:
                        date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                        #print("first", company_price_df["Date"], date[0])
                    else:
                        date_0 = company_price_df["Date"] == date[0]
                        #print("second", sum(date_0))
                    if sum(date_0) !=0:
                        #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                        date_20 = date_0.shift(20, fill_value = False)
                        date_60 = date_0.shift(60, fill_value = False)
                        data_on_dates = company_price_df[date_0 + date_20 + date_60]
                        close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                        #Calculates the one month and three month price change
                        one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                        three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                        #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                        label = 0
                        if one_month_change > 5 and three_month_change > 10:
                            label = 1
                        elif one_month_change < -5 and three_month_change < -10:
                            label = -1

                        sentences = transcript.split("\n")
                        for sentence in sentences:
                            check = sentence.split()
                            check1 = sentence.split(":")
                            if len(check) !=0 and len(check) < 513:
                                if check[0] not in bad_starts and len(check1) == 2:
                                    #print(sentence.split(":"))
                                    final_texts.append(sentence.split(":")[1])
                                    final_labels.append(label)
    elif i < 200 and i >= 150:
        #Get company price and transcript date
        company = t_200[tick]
        company_price_df = dict_of_df[tick]
        #Transform Date column so that the time of day is not included for price data (makes look up easier)
        company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
        #Iterate through years
        for year in YEARS:
            #Check to see if a earnings call has been reported for the given year
            if len(company[year].keys()) != 0:
                #Iterate through all the quarters that have a released earnings call
                for quarter in company[year].keys():
                    #Grabs date and transcript of the earnings call
                    date = company[year][quarter]["date"].split()
                    transcript = company[year][quarter]["transcript"]

                    #Checks to see when the call is released
                    #If the call is in the middle of the day we want the close price of the previous day
                    #If call is after hours we use the close price of that day
                    time = date[1].split(":")
                    date_0 = 0
                    if int(time[0] + time[1]) < 1600:
                        date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                        #print("first", company_price_df["Date"], date[0])
                    else:
                        date_0 = company_price_df["Date"] == date[0]
                        #print("second", sum(date_0))
                    if sum(date_0) !=0:
                        #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                        date_20 = date_0.shift(20, fill_value = False)
                        date_60 = date_0.shift(60, fill_value = False)
                        data_on_dates = company_price_df[date_0 + date_20 + date_60]
                        close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                        #Calculates the one month and three month price change
                        one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                        three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                        #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                        label = 0
                        if one_month_change > 5 and three_month_change > 10:
                            label = 1
                        elif one_month_change < -5 and three_month_change < -10:
                            label = -1

                        sentences = transcript.split("\n")
                        for sentence in sentences:
                            check = sentence.split()
                            check1 = sentence.split(":")
                            if len(check) !=0 and len(check) < 513:
                                if check[0] not in bad_starts and len(check1) == 2:
                                    #print(sentence.split(":"))
                                    final_texts.append(sentence.split(":")[1])
                                    final_labels.append(label)
    elif i < 250 and i >= 200:
        #Get company price and transcript date
        company = t_250[tick]
        company_price_df = dict_of_df[tick]
        #Transform Date column so that the time of day is not included for price data (makes look up easier)
        company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
        #Iterate through years
        for year in YEARS:
            #Check to see if a earnings call has been reported for the given year
            if len(company[year].keys()) != 0:
                #Iterate through all the quarters that have a released earnings call
                for quarter in company[year].keys():
                    #Grabs date and transcript of the earnings call
                    date = company[year][quarter]["date"].split()
                    transcript = company[year][quarter]["transcript"]

                    #Checks to see when the call is released
                    #If the call is in the middle of the day we want the close price of the previous day
                    #If call is after hours we use the close price of that day
                    time = date[1].split(":")
                    date_0 = 0
                    if int(time[0] + time[1]) < 1600:
                        date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                        #print("first", company_price_df["Date"], date[0])
                    else:
                        date_0 = company_price_df["Date"] == date[0]
                        #print("second", sum(date_0))
                    if sum(date_0) !=0:
                        #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                        date_20 = date_0.shift(20, fill_value = False)
                        date_60 = date_0.shift(60, fill_value = False)
                        data_on_dates = company_price_df[date_0 + date_20 + date_60]
                        close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                        #Calculates the one month and three month price change
                        one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                        three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                        #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                        label = 0
                        if one_month_change > 5 and three_month_change > 10:
                            label = 1
                        elif one_month_change < -5 and three_month_change < -10:
                            label = -1
                        

                        sentences = transcript.split("\n")
                        for sentence in sentences:
                            check = sentence.split()
                            check1 = sentence.split(":")
                            if len(check) !=0 and len(check) < 513:
                                if check[0] not in bad_starts and len(check1) == 2:
                                    #print(sentence.split(":"))
                                    final_texts.append(sentence.split(":")[1])
                                    final_labels.append(label)
"""

final_df = pd.DataFrame({"Text": final_texts, "Labels": final_labels})

In [31]:
print("Proportion of data labeled 0:", final_df[final_df["Labels"] == 0].shape[0] / final_df.shape[0])
print("Proportion of data labeled 1:", final_df[final_df["Labels"] == 1].shape[0] / final_df.shape[0])
print("Proportion of data labeled -1:", final_df[final_df["Labels"] == -1].shape[0] / final_df.shape[0])
print(final_df.shape[0])

Proportion of data labeled 0: 0.5219018434385226
Proportion of data labeled 1: 0.32811924425960826
Proportion of data labeled -1: 0.14997891230186908
187313


In [24]:
temp = final_df.sample(frac=.5).reset_index(drop=True)
print(temp.shape[0])
print("Proportion of data labeled 0:", temp[temp["Labels"] == 0].shape[0] / temp.shape[0])
print("Proportion of data labeled 1:", temp[temp["Labels"] == 1].shape[0] / temp.shape[0])
print("Proportion of data labeled -1:", temp[temp["Labels"] == -1].shape[0] / temp.shape[0])
X_train = temp["Text"]
y_train = temp["Labels"]

93656
Proportion of data labeled 0: 0.7563637140172547
Proportion of data labeled 1: 0.1696314171008798
Proportion of data labeled -1: 0.07400486888186555


In [21]:
#All key inputs up here
num_labels = 3  # Number of labels (right now it's neutral 0, bad 1, good 2)
MAX_LENGTH = 512
batch_size = 25 # Number for minibatch training here
num_epochs = 100 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Step 0 Done")
# Step 1: Preprocessing the data
# Tokenize the text data
tokenized_texts = []
labels = []
for i, row in temp.iterrows():
    
    tokenized_text = tokenizer.encode(row['Text'], add_special_tokens=True, max_length=512, truncation=True)
    tokenized_texts.append(tokenized_text)
    labels.append(row['Labels'])

print("Step 1 Done")
# Define the label mapping
label_map = {0: 0, -1: 1, 1: 2}

# Change labels to be consistent with label mapping above
labels = [label_map[label] for label in labels]

# Step 2: Create dataloader

input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
labels = torch.tensor(labels)

# Create dataloader
data = TensorDataset(input_ids, labels)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
print("Step 2 Done")
# Step 3: Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device
print("Step 3 Done")
# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=0.001)
print("Step 4 Done")
# Step 5: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    print("prebatch")
    count = 0
    for batch in dataloader:
        print("Batch ", count +1)
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch
        print("Batch: ", batch)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        print("Outputs: ", outputs)
        loss = outputs.loss
    
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        count += 1

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Step 6: Define a prediction function
def predict(text):
    # Tokenize the input text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    # Apparently turning off grad saves memory and computation
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
        
        # Calculate the probabilities using softmax
        probabilities = torch.softmax(logits, dim=-1).squeeze(0)
        
        # Get the predicted label
        predicted_label = torch.argmax(probabilities).item()
        
        # Return the predicted label and its probability
        return predicted_label, probabilities



Step 0 Done
Step 1 Done
Step 2 Done


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 3 Done
Step 4 Done
prebatch
Batch  1
Batch:  (tensor([[  101,  3100,  1012,  ...,     0,     0,     0],
        [  101,  2307,  1012,  ...,     0,     0,     0],
        [  101, 15419,  1011,  ...,     0,     0,     0],
        ...,
        [  101,  3100,  1012,  ...,     0,     0,     0],
        [  101,  3100,  1012,  ...,     0,     0,     0],
        [  101, 27547,  1012,  ...,     0,     0,     0]]), tensor([0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
        0]))
Outputs:  SequenceClassifierOutput(loss=tensor(1.2008, grad_fn=<NllLossBackward0>), logits=tensor([[-0.3802, -0.1359, -0.1424],
        [-0.3429,  0.1051, -0.5226],
        [-0.3032,  0.0394, -0.5149],
        [-0.2540,  0.0265, -0.2626],
        [-0.5119, -0.0565, -0.2213],
        [-0.4503, -0.0849, -0.4769],
        [-0.3489, -0.1429, -0.2031],
        [-0.3221, -0.0482, -0.0891],
        [-0.2076, -0.1168, -0.1472],
        [-0.2262,  0.0478, -0.4269],
        [-0.4417, -0.0802, -0.50

KeyboardInterrupt: 

In [None]:
guesses = []

company = t_100[tickers[50]]
company_price_df = dict_of_df[tickers[50]]
#Transform Date column so that the time of day is not included for price data (makes look up easier)
company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
#Iterate through years
for year in YEARS:
    #Check to see if a earnings call has been reported for the given year
    if len(company[year].keys()) != 0:
        #Iterate through all the quarters that have a released earnings call
        for quarter in company[year].keys():
            dev_texts =[]
            #Grabs date and transcript of the earnings call
            date = company[year][quarter]["date"].split()
            transcript = company[year][quarter]["transcript"]

            #Checks to see when the call is released
            #If the call is in the middle of the day we want the close price of the previous day
            #If call is after hours we use the close price of that day
            time = date[1].split(":")
            date_0 = 0
            if int(time[0] + time[1]) < 1600:
                date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                #print("first", company_price_df["Date"], date[0])
            else:
                date_0 = company_price_df["Date"] == date[0]
                #print("second", sum(date_0))
            if sum(date_0) != 0:
                #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                date_20 = date_0.shift(20, fill_value = False)
                date_60 = date_0.shift(60, fill_value = False)
                data_on_dates = company_price_df[date_0 + date_20 + date_60]
                close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                #Calculates the one month and three month price change
                one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                label = 0
                if one_month_change > 5 and three_month_change > 10:
                    label = 1
                elif one_month_change < -5 and three_month_change < -10:
                    label = -1

                sentences = transcript.split("\n")
                for sentence in sentences:
                    check = sentence.split()
                    check1 = sentence.split(":")
                    if len(check) !=0 and len(check) < 513:
                        if check[0] not in bad_starts and len(check1) == 2:
                            #print(sentence.split(":"))
                            dev_texts.append(sentence.split(":")[1])
                
                first_method = []
                second_method = []
                for text in dev_texts:
                    predicted_label, probability = predict(text)
                    first_method.append(predicted_label)
                    second_method.append(probability)
                
                first_method_pred = torch.argmax(torch.Tensor([first_method.count(0), first_method.count(1), first_method.count(2)])).item()
                second_method_pred = torch.argmax(sum(second_method_pred)).item()

                guesses.append((second_method_pred, label))
                print("Method 1 Predition: ", first_method_pred)
                print("Method 2 Predition: ", second_method_pred)
                print("Actual Labe: ", (label + 3)%3)

print("All Guesses: ", guesses)

In [27]:
r = [(1,2), (2,3), (3,4)]
r[0][0]

1