# Stock prediction from earnings call transcript
Modify the 2nd cell with the company name and the URL of the latest earnings call from seekingalpha.com, then run all cells sequentially. You might also need to add the latest earnings surprise to the XX_eps.csv file under data/ if it is not up to date.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import urllib.request
from bs4 import BeautifulSoup
import urllib.request
import re
import string
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.interpolate import interp1d
import pandas_datareader as pdr
import Load_MasterDictionary as LM
import pickle
import os
# File name of the Loughran-McDonald master dictionary CSV handed to the loader below.
MASTER_DICTIONARY_FILE = r'LoughranMcDonald_MasterDictionary_2014.csv'
# Word lookup used by feature_extraction: tokens are tested with `in` and each
# entry exposes sentiment flags (positive, negative, ...) and a syllable count.
# NOTE(review): the second argument (True) is presumably a "print progress"
# flag -- confirm against Load_MasterDictionary.
lm_dictionary = LM.load_masterdictionary(MASTER_DICTIONARY_FILE, True)


 ...Loading Master Dictionary 85000
Master Dictionary loaded from file: 
  LoughranMcDonald_MasterDictionary_2014.csv
  85,131 words loaded in master_dictionary.



In [2]:

# URL of the earnings-call transcript page to analyse.
url = "https://seekingalpha.com/article/4237346-amazon-com-inc-amzn-q4-2018-results-earnings-call-transcript"
# Must be a key of ticker_list in post_call_stock_prediction; it is also the
# prefix of the '<company_name>_eps.csv' file read by load_eps.
company_name = 'Amazon'

In [3]:
# Browser-like User-Agent sent with every request made by download_url.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'


def download_url(url, timeout=30):
    """Fetch ``url`` and return the response body decoded to text.

    Args:
        url: Address to request (any scheme ``urllib.request`` supports).
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the notebook.

    Returns:
        The response body as ``str``. The charset declared in the response's
        Content-Type header is used when present, otherwise UTF-8.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    import urllib.request

    headers = {
        "User-Agent": user_agent,
        # The HTTP request header is spelled "Referer" (one "r"); the
        # previous "referrer" key is not a header servers recognise.
        "Referer": 'https://google.com',
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
    }
    req = urllib.request.Request(
        url,
        data=None,
        headers=headers
    )

    # The context manager closes the underlying connection even if reading
    # or decoding raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset)
    

In [4]:
def get_one_earnings_call(url):
    """Download one transcript page and pull out its key fields.

    The page is fetched with ``download_url`` and parsed with BeautifulSoup's
    ``html.parser``.

    Returns:
        A ``(time_str, company_name, title, full_text)`` tuple where
        ``title`` is the text of the first ``<h1>``, ``company_name`` is the
        first whitespace-separated word of that title, ``time_str`` is the
        ``content`` attribute of the ``<time>`` tag inside the
        ``div.a-info.clearfix`` element, and ``full_text`` is the text of
        every ``<p>`` joined by newlines.
    """
    page = BeautifulSoup(download_url(url), 'html.parser')

    # Title comes from the first <h1> on the page.
    headings = page.find_all("h1")
    title = headings[0].get_text()

    # The first word of the title is taken as the company name.
    company_name = title.split()[0]

    # Timestamp string lives on the <time> tag of the article info block.
    info_block = page.find("div", {"class": "a-info clearfix"})
    time_str = info_block.find("time")['content']

    # Whole transcript: one line per paragraph.
    paragraphs = page.find_all("p")
    full_text = "\n".join(paragraph.text for paragraph in paragraphs)

    return time_str, company_name, title, full_text
    

In [5]:
def parse_one_call_transcript(doc):
    """Split a transcript into prepared remarks and Q&A and score their tone.

    The upper-cased transcript is cut at the first Q&A heading. Headings are
    tried from most to least specific: a "QUESTION AND ANSWER" /
    "QUESTION & ANSWER" line, then "QUESTION ... AND ANSWER ... OPERATOR",
    then a bare "Q&A". Everything before the heading is the introduction and
    everything after it is the Q&A, even if the heading text occurs again
    later in the transcript.

    Args:
        doc: Full transcript text.

    Returns:
        ``(tone_intro, tone_qna, tone_ab)`` where each tone is the positive
        minus the negative word percentage reported by ``feature_extraction``
        and ``tone_ab`` ("abnormal tone") is ``tone_intro - tone_qna``.

    Raises:
        ValueError: If no Q&A heading can be located in ``doc``.
    """
    doc = doc.upper()
    divider_patterns = (
        'QUESTION.{0,2}AND.{0,2}ANSWER.{0,10}\n|QUESTION.{0,2}&.{0,2}ANSWER.{0,10}\n',
        'QUESTION.*AND.ANSWER.{0,10}OPERATOR',
        'Q&A.{0,8}',
    )
    divider = None
    for pattern in divider_patterns:
        divider = re.search(pattern, doc)
        if divider is not None:
            break
    if divider is None:
        raise ValueError(
            'Could not find a question-and-answer section heading in the transcript')

    # Slice around the first heading only, so a repeated heading later in
    # the call does not truncate the Q&A section.
    intro = doc[:divider.start()]
    qna = doc[divider.end():]

    # Compute linguistic characteristics using the Loughran and McDonald dictionary.
    odata_intro = feature_extraction(intro)
    odata_qna = feature_extraction(qna)
    # tone = positive sentiment - negative sentiment
    tone_intro = odata_intro[3] - odata_intro[4]
    tone_qna = odata_qna[3] - odata_qna[4]
    # Abnormal tone is the difference between introduction tone and Q&A tone.
    tone_ab = tone_intro - tone_qna

    return tone_intro, tone_qna, tone_ab

In [6]:
def feature_extraction(doc):
    """Extract Loughran-McDonald dictionary features from ``doc``.

    Only tokens that are not purely digits, are longer than one character
    and appear in ``lm_dictionary`` count as words.

    Args:
        doc: Text to analyse (upper-cased internally).

    Returns:
        A 17-element list:
          [0], [1]  unused (always 0)
          [2]       number of dictionary words
          [3]-[10]  percentage of words flagged positive, negative,
                    uncertainty, litigious, weak modal, moderate modal,
                    strong modal and constraining
          [11]      count of A-Z characters
          [12]      count of digit characters
          [13]      count of numbers
          [14]      average syllables per word
          [15]      average word length
          [16]      number of unique words
        When the text holds no dictionary words, the percentages and the
        averages are 0.0 instead of raising ZeroDivisionError.
    """
    vocabulary = set()
    _odata = [0] * 17
    total_syllables = 0
    word_length = 0

    doc = doc.upper()
    tokens = re.findall(r'\w+', doc)  # Note that \w+ splits hyphenated words
    for token in tokens:
        if not token.isdigit() and len(token) > 1 and token in lm_dictionary:
            entry = lm_dictionary[token]
            _odata[2] += 1  # word count
            word_length += len(token)
            vocabulary.add(token)
            if entry.positive:
                _odata[3] += 1
            if entry.negative:
                _odata[4] += 1
            if entry.uncertainty:
                _odata[5] += 1
            if entry.litigious:
                _odata[6] += 1
            if entry.weak_modal:
                _odata[7] += 1
            if entry.moderate_modal:
                _odata[8] += 1
            if entry.strong_modal:
                _odata[9] += 1
            if entry.constraining:
                _odata[10] += 1
            total_syllables += entry.syllables

    _odata[11] = len(re.findall('[A-Z]', doc))
    _odata[12] = len(re.findall('[0-9]', doc))
    # Drop punctuation within numbers for the number count. The lookbehind
    # requires a digit on the left as well, so only separators that sit
    # between two digits (1,000 / 3.5) are removed.
    doc = re.sub(r'(?<=[0-9])(\.|,)(?=[0-9])', '', doc)
    doc = doc.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
    _odata[13] = len(re.findall(r'\b[-+\(]?[$€£]?[-+(]?\d+\)?\b', doc))
    _odata[16] = len(vocabulary)  # number of unique words

    word_count = _odata[2]
    if word_count:
        _odata[14] = total_syllables / word_count
        _odata[15] = word_length / word_count
        # Convert counts to %
        for i in range(3, 10 + 1):
            _odata[i] = (_odata[i] / word_count) * 100
    else:
        # No dictionary words at all: report zeros rather than dividing by 0.
        _odata[14] = 0.0
        _odata[15] = 0.0
        for i in range(3, 10 + 1):
            _odata[i] = 0.0

    return _odata

In [7]:
def compute_stock_prop(call_time, ticker, time_window):
    """Measure how price and volatility moved around an earnings call.

    Args:
        call_time: Timestamp of the earnings call.
        ticker: Ticker symbol passed to ``pdr.get_data_yahoo``.
        time_window: Sequence whose element 0 is the number of rows to keep
            before the call and element 1 the number of rows to keep after.

    Returns:
        ``None`` when fewer than 6 rows of quotes come back; otherwise
        ``(price_change, vol_change)``, each the relative change
        ``(after - before) / before``. Price uses the mean 'Adj Close' of
        each side; volatility uses the standard deviation of 'Adj Close'
        multiplied by the square root of the number of rows on that side.
    """
    rows_before = time_window[0]
    rows_after = time_window[1]

    # Query a few extra calendar days on each side to account for weekends.
    query_start = call_time - timedelta(days=rows_before + 5)
    query_end = call_time + timedelta(days=rows_after + 5)
    quotes = pdr.get_data_yahoo(ticker, start=query_start, end=query_end).reset_index()
    if len(quotes) < 6:
        return None
    quotes['Date'] = pd.to_datetime(quotes['Date'])

    # Rows dated on or before the call count as "before", later rows as "after".
    pre_call = quotes[quotes['Date'] <= call_time]['Adj Close']
    post_call = quotes[quotes['Date'] > call_time]['Adj Close']
    pre_call = pre_call.iloc[-rows_before:]
    post_call = post_call.iloc[0:rows_after]

    # Relative change in mean price.
    mean_before = pre_call.mean()
    price_change = (post_call.mean() - mean_before) / mean_before

    # Relative change in volatility.
    vol_before = pre_call.std() * np.sqrt(len(pre_call))
    vol_after = post_call.std() * np.sqrt(len(post_call))
    vol_change = (vol_after - vol_before) / vol_before

    return price_change, vol_change

In [8]:
def load_eps(company_name):
    """Load the earnings-surprise history for ``company_name``.

    Reads the tab-separated, header-less file
    ``<parent of current working directory>/data/<company_name>_eps.csv``,
    whose two columns are a date and an earnings surprise that may carry a
    trailing '%'.

    Args:
        company_name: File-name prefix of the ``_eps.csv`` file.

    Returns:
        A DataFrame with a datetime ``time`` column and a float ``eps``
        column.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    # data/ sits next to the working directory. Taking the parent directory
    # works for any directory name, unlike chopping a fixed three characters
    # off the end of the path.
    data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    eps_path = os.path.join(data_dir, company_name + '_eps.csv')

    eps_data = pd.read_csv(eps_path, sep='\t', names=['time', 'eps'])
    eps_data['time'] = pd.to_datetime(eps_data['time'])
    # Go through str first so a column that was parsed as numeric (no '%'
    # signs in the file) is handled as well.
    eps_data['eps'] = (
        eps_data['eps'].astype(str).str.replace('%', '', regex=False).astype(float)
    )
    return eps_data

In [9]:
def post_call_stock_prediction(url, company_name):
    """Compare the actual and model-predicted stock reaction to a call.

    Scrapes the transcript at ``url``, derives its tone, measures the actual
    price/volatility change around the call, joins the company's earnings
    surprise and feeds ``['eps', 'tone_ab']`` to the pickled model stored as
    ``stock_prediction_model.best`` in the current working directory. The
    merged data row and the actual vs. predicted changes are printed.

    Args:
        url: URL of the latest earnings-call transcript.
        company_name: One of the keys of the ticker table below; also the
            prefix of the ``_eps.csv`` file read by ``load_eps``.

    Returns:
        None. Results are printed.

    Raises:
        KeyError: If ``company_name`` has no known ticker.
        ValueError: If stock quotes around the call are unavailable, or if
            the earnings-surprise file has no row for the call date.
    """
    # Resolve the ticker first so an unknown company fails before any download.
    ticker_list = {"Apple": "AAPL", "Amazon": "AMZN", "Twitter": "TWTR", "Microsoft": "MSFT", "IBM": "IBM",
                   "Facebook": "FB", "Ebay": "EBAY", "Google": "GOOG", "Oracle": "ORCL", "Intel": "INTC"}
    try:
        ticker = ticker_list[company_name]
    except KeyError:
        raise KeyError('Unknown company name ' + repr(company_name) + '; expected one of: '
                       + ', '.join(sorted(ticker_list))) from None

    # Scrape the earnings call transcript and put it in a dataframe.
    time_str, company, title, full_text = get_one_earnings_call(url)
    latest_call = pd.DataFrame(columns=['time', 'company', 'title', 'full_text', 'url'])
    latest_call.loc[0] = [time_str, company, title, full_text, url]

    # Compute the date of the earnings call: shift back half a day, then
    # keep only the calendar date.
    # NOTE(review): the half-day shift presumably maps timestamps that fall
    # just after midnight back onto the trading day of the call -- confirm.
    time_earnings = pd.to_datetime(latest_call['time']) - timedelta(days=0.5)
    time_earnings = pd.to_datetime(time_earnings.apply(datetime.date))

    # Extract tone from the transcript: (intro, Q&A, abnormal) per row.
    tones = latest_call['full_text'].apply(parse_one_call_transcript)
    earnings_stock = pd.DataFrame(time_earnings, columns=['time'])
    earnings_stock['company'] = ticker
    earnings_stock['tone_intro'] = [tone[0] for tone in tones]
    earnings_stock['tone_qna'] = [tone[1] for tone in tones]
    earnings_stock['tone_ab'] = [tone[2] for tone in tones]

    # Get the stock data from Yahoo finance and compute the change around
    # the call.
    time_window = [10, 10]  # ten days before and after call
    stock_change = earnings_stock['time'].apply(compute_stock_prop, args=(ticker, time_window))
    price_change = []
    vola_change = []
    for change in stock_change:
        # compute_stock_prop returns None when too few quotes are available.
        if not isinstance(change, tuple):
            raise ValueError('Not enough stock data for ' + ticker
                             + ' around the earnings call to compute the price change')
        price_change.append(change[0])
        vola_change.append(change[1])
    earnings_stock['price_change'] = price_change
    earnings_stock['vola_change'] = vola_change

    # Load earnings surprise data and merge with the earnings call.
    eps_latest = load_eps(company_name)
    data_latest = eps_latest.merge(earnings_stock, on='time')
    if data_latest.empty:
        raise ValueError('No earnings surprise entry matches the call date in '
                         + company_name + '_eps.csv; add the latest earnings surprise '
                         'to that file and run again')
    print(data_latest)
    X_latest = data_latest[['eps', 'tone_ab']]
    y_latest = data_latest[['price_change', 'vola_change']]

    # Load model.
    # NOTE: unpickling executes arbitrary code; only load a model file that
    # comes from a trusted source.
    p = os.path.join(os.getcwd(), "stock_prediction_model.best")
    with open(p, 'rb') as f2:
        best_model = pickle.load(f2)

    # Compute predicted price and volatility change, as percentages rounded
    # to two decimals.
    y_pred_latest = best_model.predict(X_latest)
    real_price_change = round(y_latest['price_change'].iloc[0] * 10000) / 100
    predicted_price_change = round(y_pred_latest[0, 0] * 10000) / 100
    real_vola_change = round(y_latest['vola_change'].iloc[0] * 10000) / 100
    predicted_vola_change = round(y_pred_latest[0, 1] * 10000) / 100
    print(company_name + ' stock price change:' + str(real_price_change) + '%')
    print('predicted price change: ' + str(predicted_price_change) + '%')
    print(company_name + ' stock volatility change:' + str(real_vola_change) + '%')
    print('predicted volatility change: ' + str(predicted_vola_change) + '%')
    

In [10]:
# Run the full pipeline for the url / company_name set in cell 2; it prints the
# merged data row followed by the actual vs. predicted price and volatility changes.
post_call_stock_prediction(url, company_name)

        time   eps company  tone_intro  tone_qna   tone_ab  price_change  \
0 2019-01-31  8.83    AMZN   -1.550388  1.096391 -2.646779     -0.020434   

   vola_change  
0     -0.49277  
Amazon stock price change:-2.04%
predicted price change: -2.07%
Amazon stock volatility change:-49.28%
predicted volatility change: 21.51%
