This module computes the length of each earnings call, measured as its number of words.

In [None]:
import pandas as pd
import time
import multiprocessing
import boto3
import awswrangler as wr
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import warnings
from joblib import Parallel, delayed
import multiprocessing
import io
from nltk import sent_tokenize, word_tokenize

# ---- notebook-wide configuration ----

# silence every warning for the rest of the session
warnings.filterwarnings("ignore")

# S3 locations
bucket = '[your-bucket-name]'
datdir = "s3://{}/data/".format(bucket)

# NLP helpers shared by the text-cleaning routine
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# enable the pandas progress_apply integration used further down
tqdm.pandas()


In [None]:
# basic data cleaning. (use same algorithm as in extract_covconcerns)

def clean_sentence(sentence, remove_stop=False, stem_words=True):
    """Normalize a piece of text with a fixed sequence of regex clean-ups.

    The text is flattened to one line, lower-cased, stripped of a set of
    boilerplate fragments (roman numerals, "table of contents", "item N",
    "page N", leftover HTML entity names, ...), reduced to letters and
    whitespace, and optionally stop-word filtered and stemmed.

    Parameters
    ----------
    sentence : str
        Text to clean.
    remove_stop : bool, default False
        If True, tokenize with ``word_tokenize`` and drop every token found
        in the module-level ``stop_words`` set.
    stem_words : bool, default True
        If True, stem each whitespace-separated token with the module-level
        Porter stemmer ``ps``.

    Returns
    -------
    str
        The cleaned text. When ``stem_words`` is False the result may keep a
        single leading and/or trailing space, because punctuation is replaced
        by spaces and no final strip is applied.
    """
    # remove formatting
    sentence = re.sub(r'\n', ' ', sentence)  # remove line break markers
    sentence = re.sub(r'&#[0-9]+;', ' ', sentence)  # remove character ids
    sentence = sentence.strip().lower()

    # remove false flags: a modal verb immediately followed by one of the
    # listed verbs (e.g. "will see", "may increase")
    sentence = re.sub(r"\b(?:'ll|we'll|will|may|should|shouldn't|can|can't|would|wouldn't|can also|may also|will also|should also) \b(?:increase|decrease|step down|step up|see|say|mention|recall|note|add|talk|like to)", '', sentence)
    # NOTE(review): an earlier revision also ran re.sub('May', '', sentence)
    # here. The text is already lower-cased at this point, so that call could
    # never match and has been dropped. If the capitalized word was meant to
    # be removed, it has to happen before lower() -- confirm intent.

    # additional cleaning: each fragment below is deleted, in this order
    removal_patterns = (
        r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?",  # roman numerals
        r'(mda|md a)',  # short form
        r'form\s\w{0,1}',  # form number
        'table of contents',  # table of contents
        r'(item|i tem)\s{0,1}[0-9]*[a-z]{0,1}',  # header
        '(year|years) ended',
        r'page\s{0,1}[0-9]*',
        'rsquo',
        'amp',
        'rdquo',
        'ldquo',
    )
    for pattern in removal_patterns:
        sentence = re.sub(pattern, '', sentence)

    # remove hanging characters
    sentence = re.sub(r'(?<!\w)\.(?!\w)', ' ', sentence)  # remove hanging .
    sentence = re.sub(' +', ' ', sentence)

    # remove stopwords
    if remove_stop:
        kept_tokens = [w for w in word_tokenize(sentence) if w not in stop_words]
        sentence = ' '.join(kept_tokens)

    # remove stray letters and punctuation
    sentence = re.sub(r'\b[b-z]\b', ' ', sentence)  # single-letter words other than "a"
    sentence = re.sub(r"[^A-Za-z\s]", ' ', sentence)  # keep letters and whitespace only
    sentence = re.sub(' +', ' ', sentence)

    # stem
    if stem_words:
        sentence = ' '.join(ps.stem(token) for token in sentence.split())

    return sentence

In [None]:
# function to get the number of words in a call

def get_call_length(doc_in):
    """Count the tokens in one call transcript.

    Parameters
    ----------
    doc_in : str or None
        A string containing the content of the call.

    Returns
    -------
    dict
        ``{'nwords': n}`` where ``n`` is the number of tokens that
        ``word_tokenize`` produces for ``doc_in``; ``n`` is 0 for an empty
        string or ``None``.
    """
    # Nothing to tokenize for a missing or empty transcript.
    if not doc_in:
        return {'nwords': 0}

    # NOTE(review): tokens are counted on the raw text, exactly as before.
    # The previous version also ran clean_sentence(doc_in, remove_stop=True,
    # stem_words=True) but never used its result, so that work has been
    # dropped. If the count was meant to be taken on the cleaned text,
    # tokenize the cleaned string here instead -- confirm intent.
    return {'nwords': len(word_tokenize(doc_in))}

In [None]:
# Build df_all: one row per call with its word count, for every requested year.
yearstart = int(input('start year: '))
yearend = int(input('end year: '))
yearlist = range(yearstart, yearend + 1)

# A single S3 client serves every download; no need to rebuild it per year.
s3_client = boto3.client('s3')

# Per-year results are collected here and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies everything accumulated so far on each iteration.
yearly_frames = []

# Years are processed from the most recent to the oldest.
for yr in reversed(yearlist):

    # read in earnings calls data
    print(f'loading data from year {yr}')

    # read file (a parquet object despite the .gzip suffix in its key)
    file = f"factset_calls/raw_{yr}.gzip"
    obj = s3_client.get_object(Bucket=bucket, Key=file)
    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))

    # combine mda and qa section
    # NOTE(review): the two sections are concatenated without a separator, so
    # the last token of 'mda' and the first token of 'qa' can fuse -- confirm
    # whether that is acceptable for the word count.
    print('combining mda and qa...')
    raw_text = df.progress_apply(lambda x: x['mda'] + x['qa'], axis=1)
    df.drop(['mda', 'qa'], axis=1, inplace=True)
    df['raw_text'] = raw_text

    # extract call length (one task per transcript, spread over all cores)
    print('extracting call length...')
    query = Parallel(n_jobs=multiprocessing.cpu_count(), batch_size=32)(
        delayed(get_call_length)(text) for text in tqdm(df['raw_text'])
    )
    # Results come back in row order; reuse df's index so the join below
    # lines up row-for-row even when df does not have a default 0..n-1 index.
    query = pd.DataFrame(query, index=df.index)
    df1 = df.join(query)

    # keep this year's result; the raw text is no longer needed
    df1.drop(['raw_text'], axis=1, inplace=True)
    yearly_frames.append(df1)

# pd.concat raises on an empty list (e.g. end year < start year), so fall back
# to an empty frame in that case.
df_all = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()
    

In [None]:
# Write the combined results to S3 as a pipe-delimited text file.
output_prefix = f's3://{bucket}/output/'
savepath = ''.join((output_prefix, 'factset_call_length_postsubmit.txt'))
wr.s3.to_csv(df=df_all, path=savepath, sep='|')