In [1]:
#! Python3
# by Jacob Kovach
# Confidential and Proprietary

import numpy as np, pandas as pd, urllib.request, spacy, re, os
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from nltk import tokenize
from pandas_datareader import DataReader

# Sets number of files for training the model, nlp dictionary, and list for the call links
file_no = 1000
call_links = []
ticker_re = re.compile(r'\:[A-Z]+\)')

# Read call links from the file
with open ('/Users/jkovach/Downloads/earnings-call-transcripts/_index.txt', 'r') as index:
    urls = index.readlines()
    for line in urls:
        url = line
        if len(call_links) >= file_no:
            break
        else:
            call_links.append(url)
        if len(call_links) % 100 == 0:
            print("Files Read: {}".format(len(call_links)))

# Create dataset
call_df = pd.DataFrame()
call_df['date_string'] = np.asarray([link[47:57] for link in call_links])
call_df['datetime'] = np.asarray(pd.to_datetime(call_df['date_string'], errors='coerce', format='%Y/%m/%d'))
call_df['filename'] = np.asarray([link[58:-6] for link in call_links])

# Define functions for getting raw text and additional information
root = '/Users/jkovach/Downloads/earnings-call-transcripts/'

def get_content(f):
    filepath = root + f + '.txt'
    with open (filepath, 'r') as tf:
        content = tf.read()
        content = ' '.join(content.split())
        content = re.sub(r'([A-Z]\S+\s){2}--(\s[A-Z]\S+){1,3}\s\S+(\s[A-Z]\S+)', '', content)
        content = re.sub(r'Executive Officer', '', content)
        content = tokenize.sent_tokenize(content)
        return content
    
def get_info(c):
    info = c[0:2]
    return info

def get_tokens(t):
    c = t[4:]
    content_string = ""
    for sent in c:
        content_string += sent
    return content_string

def get_ticker(info):
    ts = ""
    for sent in info:
        try:
            ts+=ticker_re.search(sent).group()
        except:
            ts=ts
    return ts[1:-1]

def price_min1(ticker, date):
    s = date - timedelta(days=3)
    e = date + timedelta(days=10)
    stock_df = DataReader(ticker, "yahoo", s, e).reset_index()
    cur_date_index = stock_df[stock_df['Date'] == date].index[0]
    return stock_df.loc[cur_date_index-1, 'Close']

def price_plus7(ticker, date):
    s = date - timedelta(days=3)
    e = date + timedelta(days=10)
    stock_df = DataReader(ticker, "yahoo", s, e).reset_index()
    cur_date_index = stock_df[stock_df['Date'] == date].index[0]
    return stock_df.loc[cur_date_index+5, 'Close']

def change_type(change_ratio):
    if change_ratio >= 4:
        return 2
    elif change_ratio <= -4:
        return 0
    else:
        return 1

call_df['content_raw'] = call_df['filename'].apply(get_content)
print("Content Fetched")
call_df['info'] = call_df['content_raw'].apply(get_info)
print("Info Fetched")
call_df['transcript'] = call_df['content_raw'].apply(get_tokens)
print("Transcript Fetched")
call_df['ticker'] = call_df['info'].apply(get_ticker)
print("Ticker Fetched")

for i in range(call_df.shape[0]):
    try:
        call_df.loc[i, 'min1'] = price_min1(call_df.loc[i, 'ticker'], call_df.loc[i, 'datetime'])
        call_df.loc[i, 'plus7'] = price_plus7(call_df.loc[i, 'ticker'], call_df.loc[i, 'datetime'])
        if i % 50 == 0:
            print(i)
        
    except:
        call_df.loc[i, 'min1'] = np.nan
        call_df.loc[i, 'plus7'] = np.nan

call_df['delta7'] = ((call_df['plus7'] - call_df['min1'])/call_df['min1']) * 100
print("Delta Fetched")
call_df['Movement'] = call_df['delta7'].apply(change_type)
print("Movement Binned")
call_df = call_df.drop(['date_string', 'filename', 'info', 'content_raw'], 1)

# Check for functionality on next run
call_df = call_df.dropna().reset_index(drop=True)

# Look at shelve to save with df types
call_df.to_csv('/Users/jkovach/Downloads/earnings-call-transcripts/_raw_data.csv')
call_df.head()

Files Read: 100
Files Read: 200
Files Read: 300
Files Read: 400
Files Read: 500
Files Read: 600
Files Read: 700
Files Read: 800
Files Read: 900
Files Read: 1000
Content Fetched
Info Fetched
Transcript Fetched
Ticker Fetched
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
Delta Fetched
Movement Binned


Unnamed: 0,datetime,transcript,ticker,min1,plus7,delta7,Movement
0,2017-10-12,"We're about to begin.Good morning, everyone, a...",DAL,53.07,52.27,-1.507442,1
1,2017-10-12,"We are about to begin.Good morning, ladies and...",JPM,96.839996,98.110001,1.311446,1
2,2017-10-13,"Today's call will be hosted by Susan Kendall, ...",C,72.370003,73.529999,1.602869,1
3,2017-10-16,"At this time, all participants are on a listen...",BAC,25.83,27.16,5.149051,2
4,2017-10-16,My name is Regina and I will be your conferenc...,WFC,53.689999,54.91,2.272306,1
