Module to compute the sentiment of call text using the Loughran-McDonald sentiment dictionary

In [None]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import warnings
from nltk import sent_tokenize, word_tokenize

# Shared objects used by the cells below.
ps = PorterStemmer()                           # stemmer applied to both word lists and text
warnings.filterwarnings("ignore")              # silence every warning for the session
stop_words = set(stopwords.words('english'))   # set for fast membership tests
tqdm.pandas()                                  # enables the progress_apply calls used below

# Location of all input and output files.
bucket = '[bucket-name]'
datdir = f"s3://{bucket}/data/"

In [None]:
# read in Loughran Mcdonald sentiment dictionary

dat_path = datdir+"LoughranMcDonald_SentimentWordLists_2018.xlsx"

# read raw data
# Both sheets are requested in a single call so the workbook is fetched and
# parsed once; a list-valued sheet_name returns a dict of DataFrames keyed by
# sheet name.
lm_sheets = pd.read_excel(dat_path, sheet_name=['Positive', 'Negative'], header=None)


def _lm_words(sheet):
    """Return the first column of a sheet as lower-cased words.

    Blank cells are dropped (otherwise str() would turn them into the word
    'nan'), and anything from a '#' to the end of the cell is stripped.
    """
    return sheet[0].dropna().apply(lambda x: re.sub('#.*$', '', str(x).lower()))


lm_pos = _lm_words(lm_sheets['Positive'])
lm_neg = _lm_words(lm_sheets['Negative'])

# stem and remove duplicates (sorted so the lists are identical from run to run)
lm_pos_stem = sorted({ps.stem(w) for w in lm_pos})
lm_neg_stem = sorted({ps.stem(w) for w in lm_neg})

# word lists keyed by polarity, as expected by compute_sentiment
lmdict = {
    'positive': lm_pos_stem,
    'negative': lm_neg_stem,
}

In [None]:
# basic data cleaning. (adapted from the algorithm in extract_covconcerns)

def clean_sentence(sentence, remove_stop=False, stem_words=True):
    """Normalise a raw sentence into lower-case, letters-only text.

    The text is lower-cased, stripped of filing boilerplate (roman numerals,
    "item"/"form"/"page" headers, HTML-entity residue), reduced to ASCII
    letters and whitespace, and optionally stop-word filtered and stemmed.

    Parameters
    ----------
    sentence : str
        Raw text to clean.
    remove_stop : bool, default False
        If True, tokenise with ``word_tokenize`` and drop tokens found in the
        module-level ``stop_words`` set.
    stem_words : bool, default True
        If True, stem every whitespace-separated token with the module-level
        Porter stemmer ``ps`` and re-join with single spaces.

    Returns
    -------
    str
        The cleaned text. When ``stem_words`` is False the result may keep a
        single leading or trailing space, because only runs of spaces are
        collapsed.
    """

    # remove formatting
    sentence = re.sub('\n',' ', sentence) # remove line break markers
    sentence = re.sub('&#[0-9]+;',' ', sentence) # remove character ids
    sentence = sentence.strip()
    sentence = sentence.lower()

    # remove false flags (modal verb followed by a reporting/forecast verb)
    sentence = re.sub(r"\b(?:'ll|we'll|will|may|should|shouldn't|can|can't|would|wouldn't|can also|may also|will also|should also) \b(?:increase|decrease|step down|step up|see|say|mention|recall|note|add|talk|like to)",'', sentence)
    # NOTE(review): the original also ran re.sub('May', '', sentence) here.
    # The text is already lower-cased at this point, so that call could never
    # match and has been dropped. If the month "May" is meant to be stripped,
    # that has to happen before lower-casing -- confirm the intent.

    # additional cleaning
    sentence = re.sub(r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?", '', sentence) # roman numerals
    sentence = re.sub(r'(mda|md a)','', sentence) # short form
    # form number: anchored at a word start so "perform"/"outperform" survive,
    # and the optional id must be a stand-alone character so the first letter
    # of the following word is no longer swallowed.
    sentence = re.sub(r'\bform\s(?:\w\b)?','',sentence)
    sentence = re.sub('table of contents','',sentence) # table of contents
    sentence = re.sub(r'(item|i tem)\s{0,1}[0-9]*[a-z]{0,1}','', sentence) # header
    sentence = re.sub('(year|years) ended','', sentence)
    sentence = re.sub(r'page\s{0,1}[0-9]*','',sentence)
    sentence = re.sub('rsquo','', sentence)
    # "&amp;" residue only: without the word boundaries this also cut "amp"
    # out of ordinary words ("example" -> "exle", "hamper" -> "her").
    sentence = re.sub(r'\bamp\b','', sentence)
    sentence = re.sub('rdquo','',sentence)
    sentence = re.sub('ldquo','',sentence)

    # remove hanging characters
    sentence = re.sub(r'(?<!\w)\.(?!\w)',' ',sentence) # remove hanging .
    sentence = re.sub(' +',' ',sentence)

    # remove stopwords
    if remove_stop:
        word_tokens = word_tokenize(sentence)
        word_filtered = [w for w in word_tokens if w not in stop_words]
        sentence = ' '.join(word_filtered)

    # remove punctuation, digits and stray single letters
    sentence = re.sub(r'\b[b-z]\b',' ', sentence) # remove hanging characters ("a" is kept)
    sentence = re.sub(r"[^A-Za-z\s]",' ',sentence)
    sentence = re.sub(' +',' ',sentence)

    # stem
    if stem_words:
        sentence = ' '.join([ps.stem(x) for x in sentence.split()])

    return sentence

In [None]:
# module to extract sentiment (Loughran-Mcdonald sentiment 2018)

def compute_sentiment(sent_in, senti_dict):
    """Score one text against positive and negative word lists.

    The text is cleaned with ``clean_sentence`` (stop words removed, tokens
    stemmed), tokenised, and every token found in a word list counts as one
    hit. The score is ``(positive hits - negative hits) / total hits``, so it
    lies in [-1, 1]; a text with no hits scores 0.0.

    Parameters
    ----------
    sent_in : str or None
        Text to score. Anything that is not a string (``None``, or the float
        NaN pandas uses for missing values) is treated as missing.
    senti_dict : dict
        Must contain the keys ``"positive"`` and ``"negative"``, each an
        iterable of words. They are compared against stemmed tokens, so the
        words should be stemmed the same way. Sets are used as-is; other
        iterables are converted to sets.

    Returns
    -------
    tuple
        ``(senti_score, pos_words, neg_words)`` where ``senti_score`` is a
        float and the word entries are sorted lists of the distinct matched
        tokens. All three are ``None`` when the text is missing.
    """
    # The cleaner only accepts strings; a float NaN would raise inside re.sub.
    if not isinstance(sent_in, str):
        return None, None, None

    # get list (hash-based membership; skip the copy if sets were passed in)
    positive = senti_dict["positive"]
    negative = senti_dict["negative"]
    if not isinstance(positive, (set, frozenset)):
        positive = set(positive)
    if not isinstance(negative, (set, frozenset)):
        negative = set(negative)

    # tokenize sentence
    cleaned = clean_sentence(sent_in, remove_stop=True, stem_words=True)
    tokens = word_tokenize(cleaned)

    # match sentiment word list; one pass per polarity keeps every occurrence
    pos_hits = [tok for tok in tokens if tok in positive]
    neg_hits = [tok for tok in tokens if tok in negative]

    # count number of sentiment words
    pos_count = len(pos_hits)
    neg_count = len(neg_hits)
    tot_count = pos_count + neg_count

    # compute sentiment score (always a float, 0.0 when nothing matched)
    if tot_count > 0:
        senti_score = (pos_count - neg_count) / tot_count
    else:
        senti_score = 0.0

    # convert type: distinct matched words, sorted for reproducible output
    pos_words = sorted(set(pos_hits))
    neg_words = sorted(set(neg_hits))

    return senti_score, pos_words, neg_words

In [None]:
# MAIN PROGRAM

#### 1. READ DATA
filepath = datdir+'factset_calls_covconcerns_v6.gzip'
df = pd.read_parquet(filepath)


def _score_text_column(texts):
    """Score every entry of a text column and transpose the result.

    Returns a list of three tuples: scores, positive-word lists and
    negative-word lists, each aligned with the rows of ``texts``.
    """
    per_row = texts.progress_apply(lambda x: compute_sentiment(x, lmdict))
    return list(zip(*per_row))


# run sentiment on all covenant sentences
covmention_senti = _score_text_column(df['covenant_text'])

# run sentiment on covconcern sentences
covconcern_senti = _score_text_column(df['covconcern_text'])

# save data: three columns per text column, named <prefix>_senti,
# <prefix>_pos_words and <prefix>_neg_words
for prefix, scored in (('cov', covmention_senti), ('covcon', covconcern_senti)):
    df[prefix + '_senti'] = scored[0]
    df[prefix + '_pos_words'] = scored[1]
    df[prefix + '_neg_words'] = scored[2]

In [None]:
# check quality

# Print a few positively scored rows for a manual read-through. The sample
# size is capped at the number of available rows, because DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement).
positive_rows = df[df.cov_senti>0]
test = positive_rows.sample(n=min(5, len(positive_rows))).reset_index()

# reset_index() gives a 0..n-1 index, so iterate whatever was actually sampled
for i in test.index:

    print(test.loc[i,'cov_senti'])
    print(test.loc[i,'cov_pos_words'])
    print(test.loc[i,'cov_neg_words'])
    print(test.loc[i,'covenant_text'])
    print('\n\n')


In [None]:
# export sentiment data

# Keep only the identifying columns and the two scores; every missing value in
# the selection (including scores for rows without text) is written as 0.
export_cols = ['date','repid','cov_senti','covcon_senti']
df1 = df[export_cols].fillna(0)

# pipe-delimited text file, written next to the input data
savepath = f"{datdir}factset_calls_covconcerns_sentiment_v6.txt"
df1.to_csv(savepath, sep='|')
