In [303]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
import json 

  from .autonotebook import tqdm as notebook_tqdm


In [304]:
%load_ext autoreload
%autoreload 2
from functions import get_tickers, get_stock_dict, get_companies_by_50
QUARTERS = ["Q1", "Q2", "Q3", "Q4"]
YEARS = [str(2005 + i) for i in range(18)]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [301]:
# Load the inputs used by the cells below. All three helpers come from the
# local `functions` module; their return shapes are not visible here.
# `tickers` is sliced and iterated like a sequence of ticker symbols.
tickers = get_tickers()
# Indexed by ticker below; presumably maps each ticker to its price DataFrame
# (with "Date" and "Close" columns) -- TODO confirm against get_stock_dict().
dict_of_df = get_stock_dict()
# Six transcript collections, each indexed as [ticker][year][quarter] below.
# Presumably each covers a consecutive block of 50 tickers -- TODO confirm.
t_50, t_100, t_150, t_200, t_250, t_300 = get_companies_by_50()

In [310]:
# Pretrained checkpoint shared by the tokenizer and the classifier
BERT_CHECKPOINT = 'bert-base-uncased'

# Tokenizer that matches the pretrained BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# BERT with a sequence-classification head.
# Three output classes: stock went down / stayed flat / went up after the call.
model = BertForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    num_labels=3,
)

# TODO: define the TrainingArguments here

# TODO: define the Trainer here

# TODO: fine-tune BERT on our data here
#trainer.train()

# TODO: evaluate the fine-tuned model here
#trainer.evaluate()

# Build labelled training examples (transcript -> price-move class) for the
# first 300 companies.
#
# Changes relative to the previous version of this cell:
#   * the six copy-pasted branches are replaced by one loop plus helpers; the
#     old branch conditions (`i < 150 and i >= 200`, `i < 200 and i >= 250`)
#     could never be true, and companies 200-249 were read from t_300
#   * `date = 0` used to overwrite the parsed call date, so `date[0]` raised
#   * calls without a usable price window are skipped instead of crashing on
#     tuple-unpacking
#   * `model.train(tokenizer(transcript), label)` did not train anything:
#     torch's `Module.train(mode)` only switches the module into training mode.
#     Examples are now collected so they can be fed to a `Trainer`.

COMPANIES_PER_CHUNK = 50          # tickers covered by each t_* transcript collection
AFTER_HOURS_CUTOFF = (16, 30)     # (hour, minute); earlier calls use the previous row's close
ONE_MONTH_ROWS = 20               # price rows between the reference close and the "one month" close
THREE_MONTH_ROWS = 60             # price rows between the reference close and the "three month" close
ONE_MONTH_THRESHOLD_PCT = 5       # |one-month change| must exceed this (percent)
THREE_MONTH_THRESHOLD_PCT = 10    # |three-month change| must exceed this (percent)

# Class ids for the 3-class head (num_labels=3). They must be 0..2 because the
# classification loss expects class indices in [0, num_labels - 1]; the old
# -1 / 0 / 1 encoding is therefore mapped to 0 / 1 / 2.
LABEL_DOWN, LABEL_FLAT, LABEL_UP = 0, 1, 2


def _closes_around_call(dates, closes, call_date, call_time):
    """Return the closes at the reference row, +20 rows and +60 rows.

    Parameters
    ----------
    dates : array of date strings, one per price row (time of day removed).
    closes : array of close prices aligned with ``dates``.
    call_date : date part of the earnings-call timestamp.
    call_time : time part of the earnings-call timestamp, "HH:MM[:SS]".

    Returns
    -------
    (close_0, close_20, close_60), or None when the call date is not matched
    by exactly one price row, or the rows needed fall outside the data.
    """
    hits = np.flatnonzero(dates == call_date)
    if len(hits) != 1:
        return None
    base = int(hits[0])

    # A call before the cutoff uses the close of the previous price row;
    # a later call uses the close of the call date itself.
    hour, minute = (int(part) for part in call_time.split(":")[:2])
    if (hour, minute) < AFTER_HOURS_CUTOFF:
        base -= 1

    if base < 0 or base + THREE_MONTH_ROWS >= len(closes):
        return None
    return closes[base], closes[base + ONE_MONTH_ROWS], closes[base + THREE_MONTH_ROWS]


def _label_for(one_month_change, three_month_change):
    """Map the two percentage changes to LABEL_UP / LABEL_DOWN / LABEL_FLAT."""
    if one_month_change > ONE_MONTH_THRESHOLD_PCT and three_month_change > THREE_MONTH_THRESHOLD_PCT:
        return LABEL_UP
    if one_month_change < -ONE_MONTH_THRESHOLD_PCT and three_month_change < -THREE_MONTH_THRESHOLD_PCT:
        return LABEL_DOWN
    return LABEL_FLAT


# Transcript collections in ticker order: chunk k holds tickers[50*k : 50*(k+1)].
transcript_chunks = (t_50, t_100, t_150, t_200, t_250, t_300)

training_records = []
skipped_calls = 0

for i, tick in enumerate(tickers[:COMPANIES_PER_CHUNK * len(transcript_chunks)]):
    company = transcript_chunks[i // COMPANIES_PER_CHUNK][tick]
    company_price_df = dict_of_df[tick]

    # NOTE(review): the old code read company_price_df[tick]["Date"], but every
    # other access treats the frame as having flat "Date"/"Close" columns, so
    # that layout is assumed here -- confirm against get_stock_dict().
    # The date-only strings are kept in a local array; the shared frame in
    # dict_of_df is no longer modified.
    price_dates = company_price_df["Date"].map(lambda stamp: stamp.split()[0]).to_numpy()
    price_closes = company_price_df["Close"].to_numpy()

    for year in YEARS:
        # An empty mapping for a year simply yields no iterations.
        for quarter, call in company[year].items():
            call_date, call_time = call["date"].split()[:2]
            transcript = call["transcript"]

            window = _closes_around_call(price_dates, price_closes, call_date, call_time)
            if window is None or any(pd.isna(value) for value in window) or window[0] == 0:
                skipped_calls += 1
                continue
            close_price_0, close_price_20, close_price_60 = window

            # One-month and three-month price change, in percent.
            one_month_change = ((close_price_20 - close_price_0) / close_price_0) * 100
            three_month_change = ((close_price_60 - close_price_0) / close_price_0) * 100

            training_records.append({
                "ticker": tick,
                "year": year,
                "quarter": quarter,
                "transcript": transcript,
                "label": _label_for(one_month_change, three_month_change),
            })

# One row per labelled earnings call; tokenize and pass to a Trainer from here.
training_examples = pd.DataFrame(
    training_records,
    columns=["ticker", "year", "quarter", "transcript", "label"],
)
print(f"Collected {len(training_examples)} labelled calls; skipped {skipped_calls} without a usable price window")



KeyboardInterrupt: 

In [353]:
# Collect the time-of-day part of every earnings-call timestamp for the
# first 50 tickers (transcripts come from t_50).
test = []
for tick in tickers[:50]:
    calls_by_year = t_50[tick]
    for year in YEARS:
        # A year with no calls contributes nothing.
        test.extend(
            call["date"].split()[1]
            for call in calls_by_year[year].values()
        )


In [366]:
# Spot-check one collected call time: split "HH:MM:SS" into its parts and
# test whether HHMM falls before 1630.
n = test[321].split(":")
hhmm = int(n[0] + n[1])
print(hhmm < 1630)
n

False


['20', '23', '37']