# Sentiment Analysis

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

### Natural Language Processing

We begin by parsing the text and pre-processing it to prepare it for Latent Dirichlet Allocation (LDA). This step is meant to remove stopwords and identify nouns and adjectives.

In [53]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# Characters that disqualify a token when they appear at its start or end.
# NOTE(review): the two adjacent quotes ('') close one string literal and open the next, so
# Python concatenates the two pieces and the single-quote character itself is NOT in this
# list — confirm whether it was meant to be included.
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

In [54]:
from sklearn.feature_extraction import text 
# English stop-word collection provided by scikit-learn's text module.
stopwords=text.ENGLISH_STOP_WORDS

The FOMC statements are full of phrases like "growth is expected to continue--given the current data--at a moderate pace". A run of two or more hyphens marks a clause break, so we replace it with a comma followed by a space.

In [None]:
import re
# Matches a run of two or more consecutive hyphens (e.g. "--" or "---").
regex1=re.compile(r"\-{2,}")

We now define a function to find the nouns and adjectives of the text. The function returns a tuple where the first element is a list of lists, where each list includes the nouns from a sentence. The second element is a list of lists, where each list includes the adjectives from a sentence.

In [73]:
def _is_informative_lemma(lemma):
    """Return True when a lemma is worth keeping as a noun or adjective.

    A lemma is rejected when it is empty, a single character, a stop word,
    or when its first or last character is in ``punctuation``.
    """
    if len(lemma) <= 1:
        return False
    if lemma in stopwords:
        return False
    return lemma[0] not in punctuation and lemma[-1] not in punctuation


def modified_get_parts(thetext):
    """Extract per-sentence nouns and adjectives from a block of text.

    Runs of two or more hyphens are first replaced by ", ". The text is then
    parsed with ``pattern.en.parse`` (tokenized, with lemmata), and for every
    sentence the fifth field of each token (the lemma, given ``lemmata=True``)
    is collected when the token's tag is a noun tag (NN, NNS) or an adjective
    tag (JJ, JJR, JJS) and the lemma passes ``_is_informative_lemma``.

    Only sentences that yield at least one noun AND at least one adjective are
    retained, and the first retained sentence is then dropped.

    Parameters
    ----------
    thetext : str
        The raw text to analyse.

    Returns
    -------
    (nouns, descriptives) : tuple of two lists of lists
        ``nouns[k]`` and ``descriptives[k]`` hold the nouns and the adjectives
        of the same retained sentence.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    noun_tags = ('NN', 'NNS')

    thetext = regex1.sub(', ', thetext)

    kept_nouns = []
    kept_descriptives = []
    for sentence in parse(thetext, tokenize=True, lemmata=True).split():
        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            tag, lemma = token[1], token[4]
            if tag in adjective_tags:
                bucket = sentence_descriptives
            elif tag in noun_tags:
                bucket = sentence_nouns
            else:
                continue
            if _is_informative_lemma(lemma):
                bucket.append(lemma)

        # A sentence is only useful downstream when it has both kinds of word.
        if sentence_nouns and sentence_descriptives:
            kept_nouns.append(sentence_nouns)
            kept_descriptives.append(sentence_descriptives)

    # The first retained sentence is discarded, as in the original implementation
    # (presumably page boilerplate — confirm against the scraped documents).
    return kept_nouns[1:], kept_descriptives[1:]

We load in the fomc_mins_all dictionary that we created in the Scraping.ipynb iPython Notebook.

In [77]:
# Load the scraped FOMC minutes; entries are looked up by keys such as '20140730' further down.
with open("fomc_mins_all.json", "rb") as infile:
    fomc_mins = json.load(infile)

We can now check how our modified_get_parts function would deal with a sample FOMC statement, printing out the first two noun lists.

In [118]:
# NOTE(review): `pq` is not imported in any cell shown in this notebook — presumably
# `from pyquery import PyQuery as pq`; confirm and add it to the imports.
modified_get_parts(pq(fomc_mins['20140730']).text())[0][:2]

[[u'buildmenu',
  u'policy',
  u'minute',
  u'meeting',
  u'office',
  u'governor',
  u'present',
  u'member',
  u'president',
  u'economist',
  u'dev'],
 [u'governor',
  u'governor',
  u'adviser',
  u'governor',
  u'governor',
  u'governor',
  u'merten',
  u'interval',
  u'meeting',
  u'subcommittee',
  u'communication',
  u'issue']]

We run modified_get_parts on each fomc statement and create a new dictionary fomc_parts. We strip each statement of the html at its start before passing it to modified_get_parts.

In [85]:
%%time
# Run the noun/adjective extraction on the text of every statement, keyed like fomc_mins.
# NOTE(review): relies on `pq`, which no visible cell imports — confirm.
fomc_parts = {}
for key in fomc_mins.keys():
    fomc_parts[key] = modified_get_parts(pq(fomc_mins[key]).text())

Wall time: 2min 4s


In [87]:
# Persist the parsed parts; the context manager closes the file even if json.dump raises.
with open("fomc_parts.json", "wb") as fomc_parts_file:
    json.dump(fomc_parts, fomc_parts_file)

We can now create two lists: a list of noun lists (one per sentence), and a flattened list of all the nouns, which we will use to build a dictionary (needed as an argument for the LDA function).

In [97]:
# One noun list per retained sentence, gathered across every statement.
nvocab = [
    sentence_nouns
    for key in fomc_mins.keys()
    for sentence_nouns in fomc_parts[key][0]
]

# Every noun occurrence, in the same order, as one flat list.
flattenednvocab = [noun for sentence_nouns in nvocab for noun in sentence_nouns]

In [99]:
from collections import Counter
# Count how many times each noun occurs across all sentences.
frequency = Counter(flattenednvocab)

In [100]:
# Give every distinct noun an integer id (id2word), then invert it (vocab: noun -> id).
id2word = dict(enumerate(frequency.keys()))
vocab = dict((noun, noun_id) for noun_id, noun in id2word.items())

In [101]:
# Number of distinct nouns in the vocabulary.
len(vocab.keys())

2759

In [102]:
from collections import defaultdict
def sentencelist(sentence):
    """Turn a list of words into a bag-of-words list of (word id, count) pairs.

    Each word is translated to its integer id through the module-level
    ``vocab`` mapping; a word missing from ``vocab`` raises KeyError.
    """
    counts = defaultdict(int)
    for token in sentence:
        counts[vocab[token]] += 1
    return [(token_id, occurrences) for token_id, occurrences in counts.items()]
# Bag-of-words corpus: one list of (word id, count) pairs per sentence in nvocab.
corpus = [sentencelist(sentence) for sentence in nvocab]

## Topic Extraction

In [103]:
import gensim

In [106]:
# Fit a two-topic LDA model on the per-sentence bags of words.
# NOTE(review): no random seed is set here, so the topics may differ between runs — confirm.
lda2 = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = id2word, update_every=1, chunksize=300)

In [107]:
# Show the highest-weighted words of each fitted topic.
lda2.print_topics()

[u'0.043*inflation + 0.037*policy + 0.035*market + 0.035*rate + 0.023*member + 0.022*participant + 0.019*meeting + 0.019*period + 0.018*condition + 0.018*fund',
 u'0.039*price + 0.022*quarter + 0.021*labor + 0.021*increase + 0.019*growth + 0.019*consumer + 0.019*month + 0.017*business + 0.016*spending + 0.016*activity']

## Naive Bayes Approach

### First Approach: Training a Naive Bayes Classifier

We create the vocabulary of adjectives that we will use.

In [124]:
# Every adjective occurrence across all statements, as one flat list.
flattened_adj_vocab = [
    adjective
    for key in fomc_mins.keys()
    for sentence_adjectives in fomc_parts[key][1]
    for adjective in sentence_adjectives
]

# Give every distinct adjective an integer id (id2adj), then invert it (adjvocab: adjective -> id).
frequency2 = Counter(flattened_adj_vocab)
id2adj = dict(enumerate(frequency2.keys()))
adjvocab = dict((adjective, adj_id) for adj_id, adjective in id2adj.items())

We collect the adjective lists from every statement into a single list, so that we have one list of adjectives per sentence.

In [129]:
# One adjective list per retained sentence, gathered across every statement.
# This list gets its own name so that it does not overwrite the adjective -> id
# mapping `adjvocab`, which is later passed to CountVectorizer as its vocabulary.
adj_sentences = [
    sentence_adjectives
    for key in fomc_mins.keys()
    for sentence_adjectives in fomc_parts[key][1]
]
X = adj_sentences

In [130]:
from sklearn.cross_validation import train_test_split
# Split the row indices of X 70/30; the fixed seed makes the split reproducible across runs.
itrain, itest = train_test_split(np.arange(len(X)), train_size=0.7, random_state=42)
# Boolean mask over X: True for training rows, False for held-out rows.
mask = np.zeros(len(X), dtype=bool)
mask[itrain] = True

In [131]:
def make_xy(X_col, y_col, vectorizer):
    """Fit the vectorizer on X_col and return (transformed features, y_col).

    The labels are passed through untouched.
    """
    features = vectorizer.fit_transform(X_col)
    return features, y_col

In [132]:
def log_likelihood(clf, x, y):
    """
    Compute the log likelihood of a labelled dataset according to a fitted
    classifier.

    The log likelihood is defined by

    L = Sum_negative(logP(negative)) + Sum_positive(logP(positive))

    where Sum_negative runs over the examples labelled 0 and Sum_positive
    over every other example.

    Parameters
    ----------
    clf : classifier exposing predict_log_proba
        Column 0 of its output is read for examples labelled 0 and column 1
        for the rest.
    x : (nexample, nfeature) array
        The input data.
    y : (nexample) array-like
        The class labels; 0 marks the negative class, any other value is
        treated as positive.

    Returns
    -------
    The summed log probability assigned to the true class of each example.
    """
    # Coerce so that a plain list still yields an element-wise boolean mask below.
    y = np.asarray(y)
    log_prob = clf.predict_log_proba(x)
    negative = y == 0
    positive = ~negative
    return log_prob[negative, 0].sum() + log_prob[positive, 1].sum()

In [133]:
from sklearn.cross_validation import KFold

def cv_score(clf, x, y, score_func, nfold=5):
    """
    Estimate a classifier's score with nfold-fold cross validation.

    Inputs
    ------
    clf : Classifier object
    x : Input feature vector
    y : Input class labels
    score_func : Function like log_likelihood, that takes (clf, x, y) as input,
                 and returns a score
    nfold : Number of folds (default 5)

    Returns
    -------
    The average score obtained by splitting (x, y) into nfold folds of training
    and test sets, fitting on the training set, and evaluating score_func on
    the test set

    Examples
    cv_score(clf, x, y, log_likelihood)
    """
    fold_scores = []
    for train_idx, test_idx in KFold(y.size, nfold):
        # refit on the training part, then score on the held-out part
        clf.fit(x[train_idx], y[train_idx])
        fold_scores.append(score_func(clf, x[test_idx], y[test_idx]))
    return sum(fold_scores) / nfold

In [139]:
def calibration_plot(clf, xtest, ytest):
    """
    Draw a two-panel calibration plot for a probabilistic classifier.

    The top panel plots, for each bin of predicted P(+), the mean observed
    outcome with an error bar against the bin midpoint, together with a dashed
    diagonal for reference. The bottom panel shows how many examples fall in
    each bin.

    Parameters
    ----------
    clf : classifier exposing predict_proba
        Column 1 of its output is used as the predicted P(+).
    xtest : input data passed to clf.predict_proba
    ytest : observed outcomes, one per row of xtest
    """
    prob = clf.predict_proba(xtest)[:, 1]
    outcome = ytest
    data = pd.DataFrame(dict(prob=prob, outcome=outcome))

    # group outcomes into bins of similar probability (20 edges -> 19 bins)
    bins = np.linspace(0, 1, 20)
    cuts = pd.cut(prob, bins)
    binwidth = bins[1] - bins[0]

    # mean outcome and number of examples in each bin
    cal = data.groupby(cuts).outcome.agg(['mean', 'count'])
    cal['pmid'] = (bins[:-1] + bins[1:]) / 2
    cal['sig'] = np.sqrt(cal.pmid * (1 - cal.pmid) / cal['count'])

    # the calibration plot
    ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    p = plt.errorbar(cal.pmid, cal['mean'], cal['sig'])
    plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k')
    plt.ylabel("Empirical P(+)")

    # the distribution of P(+)
    ax = plt.subplot2grid((3, 1), (2, 0), sharex=ax)

    # Positional x/height works whether the first parameter is called `left` or `x`;
    # align='edge' keeps each bar starting at its bin's left edge.
    plt.bar(cal.pmid - binwidth / 2, cal['count'],
            width=.95 * binwidth, align='edge',
            fc=p[0].get_color())

    plt.xlabel("Predicted P(+)")
    plt.ylabel("Number")

In [134]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [136]:
# NOTE(review): best_alpha and best_min_df are not assigned in any cell shown here —
# presumably they came from a hyper-parameter search cell that was removed; confirm.
print "alpha: %f" % best_alpha
print "min_df: %f" % best_min_df

In [None]:
# Vectorize the adjective lists, split them with the train/test mask, fit the
# Naive Bayes classifier and report its accuracy and calibration.
# NOTE(review): `y` (the class labels) is not assigned in any cell shown here,
# and neither are best_alpha / best_min_df — confirm where they come from.
vectorizer = CountVectorizer(vocabulary=adjvocab, min_df=best_min_df)
Xnew, ynew = make_xy(X, y, vectorizer)
# mask is True for training rows, so ~mask selects the held-out rows
xtrain=Xnew[mask]
ytrain=ynew[mask]
xtest=Xnew[~mask]
ytest=ynew[~mask]

clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print "Accuracy on training data: %0.4f" % (training_accuracy)
print "Accuracy on test data:     %0.4f" % (test_accuracy)

calibration_plot(clf, xtest, np.ravel(ytest))

### Second Approach: Using the Loughran-McDonald Financial Dictionary

Our first approach assigned probabilities to words based on the training set of FOMC statements we have. However, the success of this is limited by the number of FOMC statements (182) that we have access to. We can improve this analysis by using more accurate probabilities given by a dictionary that focuses on finance. The **Loughran-McDonald 2014 Master Dictionary** is a great tool for our purposes as it includes words that often appear in 10-K documents and other financial statements. The dictionary includes 9 sentiment categories, including "negative", "positive", "uncertainty", "litigious", "modal", and "constraining", among others. The probabilities given by this dictionary are more useful to us because of the sheer size of the data set used to calculate the probabilities.

We read in the dictionary.

Below is an example of the data encapsulated in $lmdict$.

The dictionary includes 9 sentiment categories (e.g. "negative", "positive", "uncertainty", "litigious", "modal", and "constraining"). We reduce the dimensionality of this data down to just three categories: "negative", "positive", and "uncertain". A $word$ that is "uncertain" is assigned the probabilities $$P(word_u\,|\,-)=P(word_u\,|\,+)=\frac{1}{2}$$.
We set words in the "positive" and "negative" parts to have 90% probabilities of being in the group they are assigned to (based on the average probability given by Loughran and McDonald).
$$P(word_p\,|\,+) = 0.9, P(word_p\,|\,-) = 0.1$$
$$P(word_n\,|\,+) = 0.1, P(word_n\,|\,-) = 0.9$$

We adjust $lmdict$ to reflect these decisions.

We define a function to calculate the probability of a sentence being positive.

In [138]:
def calc_pplus(adjlist, lp, ln, pp, pn):
    """
    Score how likely a list of adjectives is to be positive.

    Parameters
    ----------
    adjlist : iterable of adjectives
    lp : mapping from adjective to its weight under the positive class
         (presumably P(word | +) — confirm against the dictionary cell)
    ln : mapping from adjective to its weight under the negative class
         (presumably P(word | -) — confirm against the dictionary cell)
    pp : factor applied to the summed positive weights (presumably the prior P(+))
    pn : factor applied to the summed negative weights (presumably the prior P(-))

    Returns
    -------
    pp * sum(lp) / (pp * sum(lp) + pn * sum(ln)) as a float. When that
    denominator is zero (e.g. adjlist is empty) the result falls back to
    pp / (pp + pn), or 0.5 if that is undefined as well.

    Raises
    ------
    KeyError if an adjective is missing from lp or ln.
    """
    # NOTE(review): the per-word weights are summed, whereas a Naive Bayes likelihood
    # would multiply them — confirm that the additive form is intended.
    weight_pos = 0
    weight_neg = 0
    for adj in adjlist:
        weight_pos += lp[adj]
        weight_neg += ln[adj]

    score_pos = weight_pos * pp
    score_neg = weight_neg * pn
    total = score_pos + score_neg
    if total == 0:
        # No evidence at all: avoid dividing by zero and fall back to the prior.
        prior_total = pp + pn
        return float(pp) / prior_total if prior_total else 0.5
    return float(score_pos) / total