# Sentiment Analysis

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

### Natural Language Processing

We begin by parsing the text and pre-processing it to prepare it for Latent Dirichlet Allocation (LDA). This step removes stopwords and identifies nouns and adjectives.

In [53]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# Characters treated as punctuation when filtering lemmas (membership tests only).
# The literal is double-quoted on purpose: inside a single-quoted literal the
# sequence '' ends and restarts the string, which silently dropped the
# apostrophe from this set.
punctuation = list(".,;:!?()[]{}`'\"@#$^&*+-|=~_")

In [54]:
from sklearn.feature_extraction import text 
# Alias scikit-learn's ENGLISH_STOP_WORDS collection; it is consulted with `in`
# when nouns and adjectives are filtered in modified_get_parts.
stopwords=text.ENGLISH_STOP_WORDS

The FOMC statements are full of phrases like "growth is expected to continue--given the current data--at a moderate pace". We treat a run of two or more hyphens as a break between words and replace it with a comma followed by a space.

In [None]:
import re
# Compiled pattern matching a run of two or more consecutive hyphens (e.g. "--").
regex1=re.compile(r"\-{2,}")

We now define a function to find the nouns and adjectives of the text. The function returns a tuple of two lists of lists: in the first, each inner list holds the nouns from one sentence; in the second, each inner list holds the adjectives from the same sentence. Only sentences that contain at least one noun and at least one adjective are kept, and the first such sentence is dropped.

In [73]:
def modified_get_parts(thetext):
    """Extract per-sentence noun and adjective lemmas from a text.

    Runs of two or more hyphens are first replaced with ", ". The text is
    then parsed with ``parse(..., tokenize=True, lemmata=True)`` and each
    token is inspected: ``token[1]`` is compared against part-of-speech tags
    and ``token[4]`` is the string that gets collected (presumably the lemma,
    given ``lemmata=True`` -- confirm against the pattern documentation).

    A collected string is discarded when it is a stopword, starts or ends
    with a punctuation character, or is a single character long.

    Parameters
    ----------
    thetext : str
        Plain text to analyse.

    Returns
    -------
    tuple of (list of list, list of list)
        ``(nouns, descriptives)``. Each inner list belongs to one sentence;
        only sentences that yielded at least one noun AND at least one
        adjective are kept, and the first such sentence is dropped from both
        lists.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    noun_tags = ('NN', 'NNS')

    def usable(lemma):
        # Same rejection tests, in the same order, for nouns and adjectives.
        return not (lemma in stopwords
                    or lemma[0] in punctuation
                    or lemma[-1] in punctuation
                    or len(lemma) == 1)

    cleaned = re.sub(regex1, ', ', thetext)
    kept_nouns = []
    kept_descriptives = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sent_nouns = []
        sent_descriptives = []
        for token in sentence:
            lemma = token[4]
            if len(lemma) == 0:
                continue
            tag = token[1]
            if tag in adjective_tags:
                bucket = sent_descriptives
            elif tag in noun_tags:
                bucket = sent_nouns
            else:
                continue
            if usable(lemma):
                bucket.append(lemma)
        # Keep a sentence only when it contributed to both lists, so the two
        # outputs stay aligned sentence by sentence.
        if sent_nouns and sent_descriptives:
            kept_nouns.append(sent_nouns)
            kept_descriptives.append(sent_descriptives)
    # The first retained sentence is skipped in both outputs.
    return kept_nouns[1:], kept_descriptives[1:]

We load in the fomc_mins_all dictionary that we created in the Scraping.ipynb iPython Notebook.

In [77]:
# Read the JSON file into fomc_mins; later cells index it by string keys
# such as '20140730'.
with open("fomc_mins_all.json", "rb") as infile:
    fomc_mins = json.load(infile)

We can now check how our modified_get_parts function would deal with a sample FOMC statement.

In [79]:
# Take the text of one entry via pq(...).text() and run the noun/adjective
# extraction on it.
# NOTE(review): `pq` is not imported in any cell visible here -- presumably
# pyquery's PyQuery; confirm an import exists, otherwise this raises NameError
# on a fresh kernel.
modified_get_parts(pq(fomc_mins['20140730']).text())

([[u'buildmenu',
   u'policy',
   u'minute',
   u'meeting',
   u'office',
   u'governor',
   u'present',
   u'member',
   u'president',
   u'economist',
   u'dev'],
  [u'governor',
   u'governor',
   u'adviser',
   u'governor',
   u'governor',
   u'governor',
   u'merten',
   u'interval',
   u'meeting',
   u'subcommittee',
   u'communication',
   u'issue'],
  [u'subcommittee',
   u'work',
   u'subcommittee',
   u'frame',
   u'discussion',
   u'range',
   u'communication',
   u'issue'],
  [u'development',
   u'session',
   u'governor',
   u'manager',
   u'development',
   u'market'],
  [u'manager',
   u'market',
   u'operation',
   u'period',
   u'outcome',
   u'test',
   u'operation',
   u'result',
   u'agreement',
   u'exercise',
   u'effect',
   u'bank',
   u'policy',
   u'action',
   u'yield',
   u'portion',
   u'portfolio'],
  [u'addition',
   u'manager',
   u'plan',
   u'pilot',
   u'program',
   u'number',
   u'counterparty',
   u'agency',
   u'security',
   u'operation',
   u'fi

We run modified_get_parts on each FOMC statement and create a new dictionary fomc_parts. We extract the text of each statement from its HTML with pq(...).text() before passing it to modified_get_parts.

In [None]:
%%time
# Map every key of fomc_mins to the (nouns, descriptives) tuple produced by
# modified_get_parts for that entry's text.
fomc_parts = {}
for key, raw_html in fomc_mins.items():
    fomc_parts[key] = modified_get_parts(pq(raw_html).text())

In [None]:
# Persist the parsed parts as JSON. The `with` block guarantees the file is
# closed even if json.dump raises. Text mode ("w") is used because json.dump
# writes str objects, which a binary-mode file rejects on Python 3.
with open("fomc_parts.json", "w") as fomc_parts_file:
    json.dump(fomc_parts, fomc_parts_file)

We can now collect the noun lists of all statements into a single list, nvocab, with one list of nouns per sentence.

In [207]:
# Gather the per-sentence noun lists (element 0 of each fomc_parts entry)
# from every statement into one list of lists.
nvocab = []
for key in fomc_mins.keys():
    nvocab.extend(fomc_parts[key][0])
nvocab

[[u'member'],
 [u'adviser', u'governor'],
 [u'governor'],
 [u'subcommittee',
  u'work',
  u'subcommittee',
  u'frame',
  u'discussion',
  u'range',
  u'communication',
  u'issue'],
 [u'development',
  u'session',
  u'governor',
  u'manager',
  u'development',
  u'market'],
 [u'manager',
  u'market',
  u'operation',
  u'period',
  u'outcome',
  u'test',
  u'operation',
  u'result',
  u'agreement',
  u'exercise',
  u'effect',
  u'bank',
  u'policy',
  u'action',
  u'yield',
  u'portion',
  u'portfolio'],
 [u'addition',
  u'manager',
  u'plan',
  u'pilot',
  u'program',
  u'number',
  u'counterparty',
  u'agency',
  u'security',
  u'operation',
  u'firm',
  u'dealer'],
 [u'vote', u'transaction', u'period'],
 [u'intervention', u'operation', u'currency', u'account', u'period'],
 [u'participant',
  u'discussion',
  u'issue',
  u'normalization',
  u'stance',
  u'policy',
  u'intention',
  u'information',
  u'public',
  u'year',
  u'participant',
  u'step',
  u'policy',
  u'accommodation'],
 [

In [201]:
from collections import Counter
# nvocab holds one list of nouns per sentence, and lists are unhashable, so
# Counter(nvocab) raises TypeError. Count the individual nouns instead, which
# also yields the word -> count mapping the vocabulary is built from.
frequency = Counter(noun for noun_list in nvocab for noun in noun_list)

In [202]:
# Give every key of `frequency` an integer id and keep both lookup directions:
# id2word maps id -> word, vocab maps word -> id.
id2word = dict(enumerate(frequency.keys()))
vocab = dict((word, word_id) for word_id, word in id2word.items())

In [203]:
# Vocabulary size: the number of entries in vocab.
len(vocab)

1757

In [211]:
from collections import defaultdict
def sentencelist(sentence):
    """Turn a sentence into a list of ``(word_id, count)`` pairs.

    Each word is translated to its integer id through the module-level
    ``vocab`` mapping, and occurrences of the same id are tallied.

    Parameters
    ----------
    sentence : iterable
        Words of one sentence; every word must be a key of ``vocab``
        (a missing word raises ``KeyError``).

    Returns
    -------
    list of tuple
        One ``(word_id, count)`` pair per distinct word in the sentence.
    """
    counts = defaultdict(int)
    for token in sentence:
        counts[vocab[token]] += 1
    return list(counts.items())
# One list of (word_id, count) pairs per entry of nvocab.
corpus = list(map(sentencelist, nvocab))

## Topic Extraction

In [210]:
import gensim

In [226]:
# Build a 3-topic LDA model from the corpus, using id2word to label word ids.
# NOTE(review): no random seed is passed here; if the fit is stochastic the
# topics shown below may not reproduce exactly -- confirm whether the
# installed gensim offers a seeding option.
lda2 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=id2word,
    update_every=1,
    chunksize=300
)

In [227]:
# Show the fitted topics; the recorded output lists weighted words per topic.
lda2.print_topics()

[u'0.064*rate + 0.043*policy + 0.039*participant + 0.035*inflation + 0.024*fund + 0.023*security + 0.018*market + 0.018*percent + 0.016*condition + 0.016*objective',
 u'0.055*market + 0.034*period + 0.024*price + 0.022*condition + 0.022*labor + 0.021*operation + 0.019*risk + 0.018*dollar + 0.017*bank + 0.016*currency',
 u'0.054*price + 0.051*inflation + 0.033*quarter + 0.027*growth + 0.025*consumer + 0.024*year + 0.023*measure + 0.023*energy + 0.022*decline + 0.022*spending']

## Naive Bayes Approach