In [1]:
#! Python3
# by Jacob Kovach
# Confidential and Proprietary

import numpy as np, pandas as pd, urllib.request, spacy, pickle, re
from string import punctuation
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from nltk import tokenize
from nltk.corpus import stopwords
from pandas_datareader import DataReader
from tqdm import tqdm

# Number of transcripts to load for training the model, and the list that collects their links
file_no = 1500
call_links = []

# Read call links from the index file, stopping once `file_no` links have been collected
with open('/Users/jkovach/Downloads/earnings-call-transcripts/_index.txt', 'r') as index:
    urls = index.readlines()
    for url in urls:
        if len(call_links) >= file_no:
            break
        call_links.append(url)
        # Progress report after every 500 links
        if len(call_links) % 500 == 0:
            print("Files Read: {}".format(len(call_links)))

# Create dataset.
# The slices below are fixed character offsets into each link: [47:57] is read as a
# 'YYYY/MM/DD' date and everything from offset 58 up to the 5-character suffix is the
# transcript filename -- NOTE(review): assumes every link shares one prefix length; confirm.
call_df = pd.DataFrame()
call_df['date_string'] = np.asarray([link[47:57] for link in call_links])
# errors='coerce' turns any slice that is not a valid date into NaT instead of raising
call_df['datetime'] = np.asarray(pd.to_datetime(call_df['date_string'], errors='coerce', format='%Y/%m/%d'))
# Strip the line break first so a final index line without a trailing newline is sliced
# the same way as every other line
call_df['filename'] = np.asarray([link.rstrip('\n')[58:-5] for link in call_links])
# `columns=` keyword: pandas 2.0 no longer accepts the axis as a positional argument
call_df = call_df.drop(columns='date_string')
call_df.head()

Files Read: 500
Files Read: 1000
Files Read: 1500


Unnamed: 0,datetime,filename
0,2017-10-11,barracuda-networks-q2-2018-earnings-conference...
1,2017-10-12,delta-air-lines-q3-2017-earnings-conference-ca...
2,2017-10-12,jp-morgan-chase-co-q3-2017-earnings-conference...
3,2017-10-13,citigroup-q3-2017-earnings-conference-call-tra...
4,2017-10-16,bank-of-america-corporation-q3-2017-earnings-c...


In [2]:
# Shared resources for the functions that extract raw text and additional information

# The 'en' shortcut link only resolves on spaCy 2.x; spaCy 3 rejects it with an OSError,
# so fall back to the explicit model package name when the shortcut cannot be loaded.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Directory holding the transcript text files
root = '/Users/jkovach/Downloads/earnings-call-transcripts/'

# Regex objects
# A colon, one or more capital letters, then a closing parenthesis, e.g. ':ABC)'
ticker_re = re.compile(r'\:[A-Z]+\)')
# group(1) is the literal 'Duration: ' label, group(2) the run of digits that follows it
dur_re = re.compile(r'(Duration: )(\d+)')
# Punctuation characters plus the ten digits, with the hyphen left out
chars = (punctuation + '0123456789').replace('-', '')

# spaCy entity labels whose surface text is removed from a transcript before lemmatisation
_ENTITY_LABELS_TO_STRIP = ("MONEY", "PERCENT", "GPE", "PERSON", "TIME", "CARDINAL", "DATE")


def _extract_body(raw_text):
    """Return (body, head_check): the text after the second occurrence of the first 'Contents:' entry."""
    try:
        listed = raw_text.split('Contents:', 1)[1].split('\n')
        first_heading = [sec for sec in listed if len(sec) > 2][0]
        return raw_text.split(first_heading, 2)[2], 1
    except IndexError:
        # No 'Contents:' marker, no entry longer than two characters under it,
        # or that entry does not occur a second time in the text
        return "", np.nan


def _drop_repeated_paragraphs(content):
    """Keep only the lines that occur exactly once, so titles that recur drop out."""
    paras = content.split('\n')
    counts = {}
    for para in paras:
        counts[para] = counts.get(para, 0) + 1
    return '\n'.join([para for para in paras if counts[para] == 1])


def _strip_entities(content):
    """Remove whole-word occurrences of every entity tagged with a label in _ENTITY_LABELS_TO_STRIP."""
    tokens_to_remove = set([entity.text for entity in nlp(content).ents
                            if entity.label_ in _ENTITY_LABELS_TO_STRIP])

    # Longest first (ties alphabetical) so the result does not depend on set iteration
    # order and a short entity cannot break up a longer one that contains it
    for token in sorted(tokens_to_remove, key=lambda t: (-len(t), t)):
        if not token:
            continue
        # Only guard an edge with a word-boundary check when that edge is a word character;
        # this keeps an entity such as 'one' from being cut out of the middle of 'money'
        left = r'(?<!\w)' if re.match(r'\w', token) else ''
        right = r'(?!\w)' if re.search(r'\w\Z', token) else ''
        content = re.sub(left + re.escape(token) + right, '', content)
    return content


def text_cleaner(f):
    """Read one transcript and reduce it to a string of lower-cased, alphabetic lemmas.

    Parameters
    ----------
    f : str
        Transcript filename without extension; the file read is ``root + f + '.txt'``.

    Returns
    -------
    tuple
        ``(raw_text, content, head_check, foot_check)`` where ``raw_text`` is the file text
        after the line-break / URL / '--' substitutions, ``content`` is the cleaned lemma
        string, ``head_check`` is 1 when the section after the 'Contents:' listing was
        located (NaN otherwise) and ``foot_check`` is 1 when a 'Duration: <digits>' marker
        was found in ``raw_text`` (NaN otherwise).
    """
    filepath = root + f + '.txt'
    # Read everything up front so the file is closed before the slow NLP work starts
    with open(filepath, 'r') as tf:
        file_text = tf.read()

    # Start a new line wherever a lowercase letter or digit (optionally followed by '.',
    # ':' or '?') runs straight into a capital letter, i.e. where the transcript is
    # missing the whitespace between two sentences
    raw_text = re.sub(r'([a-z0-9][\.:\?]?)([A-Z])', r'\1\n\2', file_text)
    raw_text = re.sub(r'www\.\S+\.com', '', raw_text)
    raw_text = re.sub(r'--', '', raw_text)

    content, head_check = _extract_body(raw_text)

    # Get only paragraphs which occur once, eliminating repeated titles from the corpus
    content = _drop_repeated_paragraphs(content)

    # Remove content after the 'duration' header
    duration_match = dur_re.search(raw_text)
    if duration_match is None:
        foot_check = np.nan
    else:
        content = content.split(duration_match.group(1))[0]
        foot_check = 1

    # Remove the selected entities, then normalise hyphens and apostrophes
    content = _strip_entities(content)
    content = content.replace('-', ' ')
    content = content.replace("'", '')

    # Tokenize to lemmas, dropping punctuation, stop words and single-character tokens
    content = ' '.join([word.lemma_.lower() for word in nlp(content)
                        if not word.is_punct and not word.is_stop and len(word) > 1])
    # Keep purely alphabetic tokens only, which removes anything containing digits
    content = ' '.join([word for word in tokenize.word_tokenize(content) if word.isalpha()])

    return raw_text, content, head_check, foot_check

# Get the duration of the call
def get_duration(raw):
    """Return the digits that follow 'Duration: ' in `raw`, or NaN when there are none.

    The digits are returned as a string, exactly as captured by `dur_re`.
    """
    try:
        match = dur_re.search(raw)
    except TypeError:
        # `raw` is not text (for example a missing value), so there is nothing to search
        return np.nan
    if match is None:
        return np.nan
    return match.group(2)

# Get the ticker symbol
def get_ticker(raw):
    """Return the capital letters of the first ':XXXX)' pattern in the first two sentences.

    Only the first match is used: when both sentences contain the pattern, joining the
    matches before trimming would leave the delimiters inside the result. Returns an
    empty string when neither sentence contains the pattern.
    """
    for sent in tokenize.sent_tokenize(raw)[0:2]:
        match = ticker_re.search(sent)
        if match is not None:
            # Trim the leading ':' and the trailing ')'
            return match.group()[1:-1]
    return ""

# Register `progress_apply` on pandas objects so the cleaning pass shows a progress bar
tqdm.pandas(position=0, leave=True)

# Clean every transcript once; each result is a 4-tuple that is fanned out into four columns
cleaned = call_df['filename'].progress_apply(text_cleaner)
raw_col, content_col, header_col, footer_col = zip(*cleaned)
call_df['raw'] = raw_col
call_df['content'] = content_col
call_df['header_check'] = header_col
call_df['footer_check'] = footer_col

# Both remaining fields are parsed out of the raw transcript text
call_df['duration'] = call_df['raw'].apply(get_duration)
call_df['ticker'] = call_df['raw'].apply(get_ticker)

# To inspect one cleaned transcript: print(call_df.loc[0, 'content'])
call_df.head()


  from pandas import Panel
100%|██████████| 1500/1500 [52:35<00:00,  2.10s/it] 


Unnamed: 0,datetime,filename,raw,content,header_check,footer_check,duration,ticker
0,2017-10-11,barracuda-networks-q2-2018-earnings-conference...,\n\nImage source: The Motley Fool.\n\nBarracud...,good welcome networks second quarter earning c...,1.0,1.0,56,CUDA
1,2017-10-12,delta-air-lines-q3-2017-earnings-conference-ca...,\n\nImage source: The Motley Fool.\n\nDelta Ai...,ladies gentleman stand begin good welcome delt...,1.0,1.0,61,DAL
2,2017-10-12,jp-morgan-chase-co-q3-2017-earnings-conference...,\n\nImage source: The Motley Fool.\n\nJP Morga...,stand begin good lady gentleman welcome jp mor...,1.0,1.0,74,JPM
3,2017-10-13,citigroup-q3-2017-earnings-conference-call-tra...,\n\nImage source: The Motley Fool.\n\nCitigrou...,hello welcome earning review chief executive o...,1.0,1.0,82,C
4,2017-10-16,bank-of-america-corporation-q3-2017-earnings-c...,\n\nImage source: The Motley Fool.\n\nBank of ...,welcome bank america earnings announcement tim...,1.0,1.0,94,BAC


In [3]:
def price_delta(ticker, date):
    """Return the percentage change in closing price around `date` for `ticker`.

    The change is measured from the 'Close' on the row immediately before `date` to the
    'Close' five rows after `date` in the price table returned by the "yahoo" reader.

    Parameters
    ----------
    ticker : str
        Symbol passed to `DataReader`.
    date : datetime-like
        Date of the call; it must appear in the 'Date' column of the returned prices.

    Raises
    ------
    IndexError
        If `date` is not present in the returned price rows.
    KeyError
        If the row before `date` or the row five after it is not available.
    """
    # Fetch a wider window than the two rows strictly need: a 3-day look-back misses the
    # previous session when the call follows a long weekend, and a 10-day look-ahead can
    # hold too few sessions around holidays. The rows used are the same either way.
    start = date - timedelta(days=7)
    end = date + timedelta(days=14)
    stock_df = DataReader(ticker, "yahoo", start, end).reset_index()

    matches = stock_df.index[stock_df['Date'] == date]
    if len(matches) == 0:
        raise IndexError("no price row for {} on {}".format(ticker, date))
    cur_date_index = matches[0]

    before = cur_date_index - 1
    after = cur_date_index + 5
    if before < 0 or after >= len(stock_df):
        raise KeyError("not enough price rows around {} for {}".format(date, ticker))

    base_close = stock_df.loc[before, 'Close']
    later_close = stock_df.loc[after, 'Close']
    return ((later_close - base_close) / base_close) * 100

def change_type(change_ratio, threshold=4):
    """Bucket a percentage price change into one of three movement classes.

    Parameters
    ----------
    change_ratio : float
        Percentage change to classify.
    threshold : float, optional
        Size of the move, in the same units as `change_ratio`, at which a change counts
        as a rise or a fall. Defaults to 4; expected to be non-negative.

    Returns
    -------
    int
        2 when `change_ratio >= threshold`, 0 when `change_ratio <= -threshold`,
        otherwise 1. NaN fails both comparisons and therefore also maps to 1.
    """
    if change_ratio >= threshold:
        return 2
    if change_ratio <= -threshold:
        return 0
    return 1

# Look up the price move for every transcript; rows whose lookup fails get NaN
price_deltas = []
for i, (ticker, call_date) in enumerate(zip(call_df['ticker'], call_df['datetime'])):
    # Report progress before the lookup so a failing row cannot suppress the message
    if i % 100 == 0:
        print("Processing Row: ", i)
    try:
        price_deltas.append(price_delta(ticker, call_date))
    except Exception:
        # Best effort: any failed lookup is recorded as missing. Catching Exception
        # rather than everything lets an interrupt stop this long loop.
        price_deltas.append(np.nan)

# One column assignment instead of growing the frame cell by cell
call_df['price_delta'] = price_deltas

call_df['Movement'] = call_df['price_delta'].apply(change_type)

# Drops every row with a missing value in any column, not only a missing price_delta
# TODO: check for functionality on next run
call_df = call_df.dropna().reset_index(drop=True)

with open('/Users/jkovach/Downloads/earnings-call-transcripts/_call_df_1500', 'wb') as file:
    pickle.dump(call_df, file)
call_df.head()

Processing Row:  100
Processing Row:  200
Processing Row:  300
Processing Row:  400
Processing Row:  500
Processing Row:  600
Processing Row:  700
Processing Row:  800
Processing Row:  900
Processing Row:  1000
Processing Row:  1100
Processing Row:  1200
Processing Row:  1300
Processing Row:  1400


Unnamed: 0,datetime,filename,raw,content,header_check,footer_check,duration,ticker,price_delta,Movement
0,2017-10-12,delta-air-lines-q3-2017-earnings-conference-ca...,\n\nImage source: The Motley Fool.\n\nDelta Ai...,ladies gentleman stand begin good welcome delt...,1.0,1.0,61,DAL,-1.507442,1
1,2017-10-12,jp-morgan-chase-co-q3-2017-earnings-conference...,\n\nImage source: The Motley Fool.\n\nJP Morga...,stand begin good lady gentleman welcome jp mor...,1.0,1.0,74,JPM,1.311446,1
2,2017-10-13,citigroup-q3-2017-earnings-conference-call-tra...,\n\nImage source: The Motley Fool.\n\nCitigrou...,hello welcome earning review chief executive o...,1.0,1.0,82,C,1.602869,1
3,2017-10-16,bank-of-america-corporation-q3-2017-earnings-c...,\n\nImage source: The Motley Fool.\n\nBank of ...,welcome bank america earnings announcement tim...,1.0,1.0,94,BAC,5.149051,2
4,2017-10-16,wells-fargo-q3-2017-earnings-conference-call-t...,\n\nImage source: The Motley Fool.\n\nWells Fa...,good conference operator time like welcome wel...,1.0,1.0,94,WFC,2.272306,1
