In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
import json 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2
from functions import get_tickers, get_stock_dict, get_companies_by_50
QUARTERS = ["Q1", "Q2", "Q3", "Q4"]
YEARS = [str(2005 + i) for i in range(18)]


In [4]:
tickers = get_tickers()
dict_of_df = get_stock_dict()
t_50, t_100, t_150, t_200, t_250 = get_companies_by_50()

In [6]:

print(t_250.keys())
print(tickers[200:250])

print(t_100.keys())
print(tickers[100:150])


dict_keys(['DLR', 'PAYX', 'ROST', 'D', 'LULU', 'TEL', 'MRNA', 'AMP', 'IQV', 'BK', 'LEN', 'CCI', 'GEV', 'JCI', 'A', 'NUE', 'KMI', 'PRU', 'CMI', 'DOW', 'AME', 'LHX', 'ODFL', 'KR', 'GIS', 'HSY', 'FAST', 'IDXX', 'CNC', 'YUM', 'HUM', 'EXC', 'FIS', 'CTVA', 'IR', 'CHTR', 'PWR', 'MSCI', 'PCG', 'OTIS', 'CSGP', 'MLM', 'SYY', 'KVUE', 'ACGL', 'GEHC', 'RCL', 'FANG', 'PEG', 'NDAQ'])
['DLR', 'PAYX', 'ROST', 'D', 'LULU', 'TEL', 'MRNA', 'AMP', 'IQV', 'BK', 'LEN', 'CCI', 'GEV', 'JCI', 'A', 'NUE', 'KMI', 'PRU', 'CMI', 'DOW', 'AME', 'LHX', 'ODFL', 'KR', 'GIS', 'HSY', 'FAST', 'IDXX', 'CNC', 'YUM', 'HUM', 'EXC', 'FIS', 'CTVA', 'IR', 'CHTR', 'PWR', 'MSCI', 'PCG', 'OTIS', 'CSGP', 'MLM', 'SYY', 'KVUE', 'ACGL', 'GEHC', 'RCL', 'FANG', 'PEG', 'NDAQ']
dict_keys(['PFE', 'CMCSA', 'IBM', 'PM', 'MS', 'AMGN', 'COP', 'UNP', 'BX', 'UBER', 'NEE', 'NOW', 'GS', 'NKE', 'RTX', 'SCHW', 'ISRG', 'SPGI', 'LOW', 'INTC', 'HON', 'ETN', 'SYK', 'UPS', 'PGR', 'ELV', 'MU', 'T', 'C', 'BKNG', 'LRCX', 'BLK', 'LMT', 'DE', 'MDT', 'TJX', 'BSX

In [3]:
final_texts = []
final_labels = []
bad_starts = ["Operator:", "TRANSCRIPT", "Operator", "Related:", "Executives"]

#Iterates through the first 300 companies
for i, tick in enumerate(tickers[:300]):
    #print(i)
    #Check to select the correct company transcript files ("t_50" variable)
    if i < 50:
        #Get company price and transcript date
        company = t_50[tick]
    elif i < 100 and i >= 50:
        company = t_100[tick]
    elif i < 150 and i >= 100:
        company = t_150[tick]
    elif i < 200 and i >= 150:
        company = t_200[tick]
    elif i < 250 and i >= 200:
        company = t_250[tick]

    company_price_df = dict_of_df[tick]
    #Transform Date column so that the time of day is not included for price data (makes look up easier)
    company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
    #Iterate through years
    for year in YEARS:
        #Check to see if a earnings call has been reported for the given year
        if len(company[year].keys()) != 0:
            #Iterate through all the quarters that have a released earnings call
            for quarter in company[year].keys():
                #Grabs date and transcript of the earnings call
                date = company[year][quarter]["date"].split()
                transcript = company[year][quarter]["transcript"]

                #Checks to see when the call is released
                #If the call is in the middle of the day we want the close price of the previous day
                #If call is after hours we use the close price of that day
                time = date[1].split(":")
                date_0 = 0
                if int(time[0] + time[1]) < 1600:
                    date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                    #print("first", company_price_df["Date"], date[0])
                else:
                    date_0 = company_price_df["Date"] == date[0]
                    #print("second", sum(date_0))
                if sum(date_0) !=0:
                    #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                    date_20 = date_0.shift(20, fill_value = False)
                    date_60 = date_0.shift(60, fill_value = False)
                    data_on_dates = company_price_df[date_0 + date_20 + date_60]
                    close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                    #Calculates the one month and three month price change
                    one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                    three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                    #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                    label = 0
                    if one_month_change > 5 and three_month_change > 10:
                        label = 1
                    elif one_month_change < -5 and three_month_change < -10:
                        label = -1

                    sentences = transcript.split("\n")
                    for sentence in sentences:
                        check = sentence.split()
                        check1 = sentence.split(":")
                        if len(check) !=0 and len(check) < 513:
                            if check[0] not in bad_starts and len(check1) == 2:
                                #print(sentence.split(":"))
                                final_texts.append(sentence.split(":")[1])
                                final_labels.append(label)


final_df = pd.DataFrame({"Text": final_texts, "Labels": final_labels})

NameError: name 'tickers' is not defined

In [None]:
final_texts = []
final_labels = []
bad_starts = ["Operator:", "TRANSCRIPT", "Operator", "Related:", "Executives"]

#Iterates through the first 300 companies
for i, tick in enumerate(tickers[:300]):
    #print(i)
    #Check to select the correct company transcript files ("t_50" variable)
    if i < 50:
        #Get company price and transcript date
        company = t_50[tick]
    elif i < 100 and i >= 50:
        company = t_100[tick]
    elif i < 150 and i >= 100:
        company = t_150[tick]
    elif i < 200 and i >= 150:
        company = t_200[tick]
    elif i < 250 and i >= 200:
        company = t_250[tick]

    company_price_df = dict_of_df[tick]
    #Transform Date column so that the time of day is not included for price data (makes look up easier)
    company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
    #Iterate through years
    for year in YEARS:
        #Check to see if a earnings call has been reported for the given year
        if len(company[year].keys()) != 0:
            #Iterate through all the quarters that have a released earnings call
            for quarter in company[year].keys():
                #Grabs date and transcript of the earnings call
                date = company[year][quarter]["date"].split()
                transcript = company[year][quarter]["transcript"]

                #Checks to see when the call is released
                #If the call is in the middle of the day we want the close price of the previous day
                #If call is after hours we use the close price of that day
                time = date[1].split(":")
                date_0 = 0
                if int(time[0] + time[1]) < 1600:
                    date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                    #print("first", company_price_df["Date"], date[0])
                else:
                    date_0 = company_price_df["Date"] == date[0]
                    #print("second", sum(date_0))
                if sum(date_0) !=0:
                    #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                    date_20 = date_0.shift(20, fill_value = False)
                    date_60 = date_0.shift(60, fill_value = False)
                    data_on_dates = company_price_df[date_0 + date_20 + date_60]
                    close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                    #Calculates the one month and three month price change
                    one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                    three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                    #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                    label = 0
                    if one_month_change > 5 and three_month_change > 10:
                        label = 1
                    elif one_month_change < -5 and three_month_change < -10:
                        label = -1

                    sentences = transcript.split("\n")
                    for sentence in sentences:
                        check = sentence.split()
                        check1 = sentence.split(":")
                        if len(check) !=0 and len(check) < 513:
                            if check[0] not in bad_starts and len(check1) == 2:
                                #print(sentence.split(":"))
                                final_texts.append(sentence.split(":")[1])
                                final_labels.append(label)


final_df = pd.DataFrame({"Text": final_texts, "Labels": final_labels})

In [11]:
final_texts = []
final_labels = []
bad_starts = ["Operator:", "TRANSCRIPT", "Operator", "Related:", "Executives"]

tick = "GOOG"
company = t_50[tick]

company_price_df = dict_of_df[tick]
#Transform Date column so that the time of day is not included for price data (makes look up easier)
company_price_df["Date"] = company_price_df["Date"].map(lambda x: x.split()[0])
#Iterate through years
for year in YEARS:
    #Check to see if a earnings call has been reported for the given year
    if len(company[year].keys()) != 0:
        #Iterate through all the quarters that have a released earnings call
        for quarter in company[year].keys():
            #Grabs date and transcript of the earnings call
            date = company[year][quarter]["date"].split()
            transcript = company[year][quarter]["transcript"]

            #Checks to see when the call is released
            #If the call is in the middle of the day we want the close price of the previous day
            #If call is after hours we use the close price of that day
            time = date[1].split(":")
            date_0 = 0
            if int(time[0] + time[1]) < 1600:
                date_0 = (company_price_df["Date"] == date[0]).shift(-1, fill_value = False)
                #print("first", company_price_df["Date"], date[0])
            else:
                date_0 = company_price_df["Date"] == date[0]
                #print("second", sum(date_0))
            if sum(date_0) !=0:
                #Grabs the Close price prior to earnings call, 20 days after, and 60 days after
                date_20 = date_0.shift(20, fill_value = False)
                date_60 = date_0.shift(60, fill_value = False)
                data_on_dates = company_price_df[date_0 + date_20 + date_60]
                close_price_0, close_price_20, close_price_60 = data_on_dates["Close"]

                #Calculates the one month and three month price change
                one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
                three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

                #Creates a label for whether the stock has gone up, down, or stayed the same after the call release
                label = 0
                if one_month_change > 5 and three_month_change > 10:
                    label = 1
                elif one_month_change < -5 and three_month_change < -10:
                    label = -1

                sentences = transcript.split("\n")
                for sentence in sentences:
                    check = sentence.split()
                    check1 = sentence.split(":")
                    if len(check) !=0 and len(check) < 513:
                        if check[0] not in bad_starts and len(check1) == 2:
                            #print(sentence.split(":"))
                            final_texts.append(sentence.split(":")[1])
                            final_labels.append(label)


final_df = pd.DataFrame({"Text": final_texts, "Labels": final_labels})

In [12]:
print("Proportion of data labeled 0:", final_df[final_df["Labels"] == 0].shape[0] / final_df.shape[0])
print("Proportion of data labeled 1:", final_df[final_df["Labels"] == 1].shape[0] / final_df.shape[0])
print("Proportion of data labeled -1:", final_df[final_df["Labels"] == -1].shape[0] / final_df.shape[0])
print(final_df.shape[0])

Proportion of data labeled 0: 0.7724765581908439
Proportion of data labeled 1: 0.1701599558742416
Proportion of data labeled -1: 0.057363485934914506
3626


In [13]:
list(map(lambda x: x.split(":"), t_50[tickers[0]]["2020"]["Q3"]["transcript"].split("\n")))

[['Operator',
  ' Greetings, and welcome to the Microsoft Fiscal Year 2020 Third Quarter Earnings Conference Call. [Operator Instructions]. As a reminder, this conference is being recorded. I would now like to turn the conference over to your host, Mr. Mike Spencer, General Manager of Investor Relations for Microsoft. Thank you. You may begin.'],
 ['Michael Spencer',
  " Good afternoon, and thank you for joining us today. On the call with me are Satya Nadella, Chief Executive Officer; Amy Hood, Chief Financial Officer; Frank Brod, Chief Accounting Officer; and Keith Dolliver, Deputy General Counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today's call and provides a reconciliation of differences between GAAP and non-GAAP financial measures. Unless otherwise specified, we will refer to non-GAAP metrics on the call. The non-GAAP financial measures pro

In [14]:
min(list(map(lambda x: len(x.split()), t_50[tickers[4]]["2020"]["Q3"]["transcript"].split("\n"))))

4

In [16]:
print(final_df)

                                                   Text  Labels
0      Well thanks very much, Kim. I’d like to thank...       0
1      Thank you very much, two quick questions. Fir...       0
2      This is George, as you’re correctly starting ...       0
3                                  Correct, and Sergey?       0
4      Yes, this is Sergey. We have increasingly off...       0
...                                                 ...     ...
3621                                   Thank you, Ruth.       0
3622   Good afternoon, Ruth, can you give us a sense...       0
3623   So I think as you know well, we don't tend to...       0
3624                                         Thank you.       0
3625   Thanks, everyone, for joining us today. We lo...       0

[3626 rows x 2 columns]


In [15]:
temp = final_df.sample(frac=1).reset_index(drop=True)
X_train = temp["Text"]
y_train = temp["Labels"]
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [392]:

temp = tokenizer(t_50[tickers[6]]["2020"]["Q3"]["transcript"].split("\n")[1])
temp

{'input_ids': [101, 15555, 10554, 1024, 4067, 2017, 1012, 2204, 5027, 1998, 6160, 2000, 9130, 1521, 1055, 2353, 4284, 12609, 16565, 3034, 2655, 1012, 5241, 2033, 2651, 2000, 6848, 2256, 3463, 2024, 2928, 16950, 9102, 4059, 1010, 5766, 1025, 2016, 23320, 5472, 4059, 1010, 2522, 2080, 1025, 1998, 4913, 2057, 28989, 1010, 12935, 2080, 1012, 2077, 2057, 2131, 2318, 1010, 1045, 2052, 2066, 2000, 2202, 2023, 4495, 2000, 10825, 2017, 2008, 2256, 12629, 2651, 2097, 2421, 2830, 1513, 2559, 8635, 1012, 5025, 3463, 2089, 11234, 3430, 2135, 2013, 2216, 23133, 2011, 2122, 2830, 1513, 2559, 8635, 1012, 5876, 2008, 2071, 3426, 2122, 3463, 2000, 11234, 3430, 2135, 2024, 2275, 5743, 1999, 2651, 1521, 1055, 2811, 2713, 1010, 1998, 1999, 2256, 12174, 3189, 2006, 2433, 2184, 1513, 1053, 6406, 2007, 1996, 10819, 1012, 2151, 2830, 1513, 2559, 8635, 2008, 2057, 2191, 2006, 2023, 2655, 2024, 2241, 2006, 17568, 2004, 1997, 2651, 1998, 2057, 16617, 2053, 14987, 2000, 10651, 2122, 8635, 2004, 1037, 2765, 1997, 2

In [17]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [None]:
#All key inputs up here
num_labels = 3  # Number of labels (right now it's neutral 0, bad 1, good 2)
MAX_LENGTH = 128
batch_size = 10 # Number for minibatch training here
num_epochs = 3 # Number of training epochs

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the device we want to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocessing the data
# Tokenize the text data
tokenized_texts = []
labels = []
for i, row in df_data.iterrows():
    tokenized_text = tokenizer.encode(row['text'], add_special_tokens=True, max_length=512, truncation=True)
    tokenized_texts.append(tokenized_text)
    labels.append(row['label'])

# Define the label mapping
label_map = {0: 0, -1: 1, 1: 2}

# Change labels to be consistent with label mapping above
labels = [label_map[label] for label in labels]

# Step 2: Create dataloader

input_ids = torch.tensor([tokenized_text[:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokenized_text[:MAX_LENGTH])) for tokenized_text in tokenized_texts])
labels = torch.tensor(labels)

# Create dataloader

data = TensorDataset(input_ids, labels)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Step 3: Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)  # Move the model to the right device

# Step 4: Define the optimizer
optimizer = AdamW(model.parameters(), lr=0.001)

# Step 5: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Step 6: Define a prediction function
def predict(text):
    # Tokenize the input text
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    
    # Convert tokenized input to tensor and move it to the device
    input_ids = torch.tensor(tokenized_text).unsqueeze(0).to(device)
    
    # Set the model to eval mode
    model.eval()
    
    # Apparently turning off grad saves memory and computation
    with torch.no_grad():
        # Give model the inputs
        outputs = model(input_ids)
        
        # Get the logits from the model's output
        logits = outputs.logits
        
        # Calculate the probabilities using softmax
        probabilities = torch.softmax(logits, dim=-1).squeeze(0)
        
        # Get the predicted label
        predicted_label = torch.argmax(probabilities).item()
        
        # Return the predicted label and its probability
        return predicted_label, probabilities[predicted_label].item()