# Sentiment Analysis

In [21]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

### Natural Language Processing

We begin by parsing the text and pre-processing it to prepare it for Latent Dirichlet Allocation (LDA). This step removes stopwords and identifies nouns and adjectives.

In [1]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# Characters that disqualify a token when they appear as its first or last character.
# The apostrophe is escaped explicitly: a doubled '' inside a single-quoted literal only
# ends one string and starts the next (implicit concatenation), which silently left "'"
# out of the list.
punctuation = list('.,;:!?()[]{}`\'"@#$^&*+-|=~_')

In [2]:
from sklearn.feature_extraction import text 
# scikit-learn's built-in English stop-word collection; used for membership tests only.
stopwords = text.ENGLISH_STOP_WORDS

### Financial Vocabulary and Adjectives

**TODO: revise this section — we are going to do this for the Naive Bayes part.**
Since our analysis focuses on financial information, it relies heavily on the "sentiments" of vocabulary from this field, so we can enhance it by using a financial dictionary. The **Loughran-McDonald 2014 Master Dictionary** is a good tool to use as it includes words that often appear in 10-K documents. The dictionary includes 9 sentiment categories, including "negative", "positive", "uncertainty", "litigious", "modal", and "constraining", among others.


There are two main modifications to getting the parts of speech of the FOMC statements. Firstly, the statements are full of phrases like "growth is expected to continue--given the current data--at a moderate pace". A run of two or more hyphens should be treated as a break between words rather than as part of a word, so we replace it with a comma and a space. Secondly, we must ignore the HTML code at the beginning of the statements.

In [173]:
import re  # imported here because this cell runs before the cell that imports `re`

# Matches a run of commas that is not followed by a digit. Substituting ', ' turns
# "window,document" into "window, document" (two words) while leaving "1,000" intact.
regex2 = re.compile('[,]+(?![0-9])')

In [174]:
import re
# Two or more consecutive hyphens, i.e. the "--" dashes that appear inside the statements.
regex1 = re.compile(r"\-{2,}")

# Part-of-speech tags that are kept: adjectives (base, comparative, superlative)
# and common nouns (singular, plural).
_ADJECTIVE_TAGS = ('JJ', 'JJR', 'JJS')
_NOUN_TAGS = ('NN', 'NNS')


def _is_content_lemma(lemma, max_length=20):
    """Return True when `lemma` is worth keeping as a noun or descriptive.

    A lemma is rejected when it is empty or a single character, longer than
    `max_length` characters, a stopword, or starts/ends with a punctuation
    character.
    """
    if len(lemma) <= 1:
        return False
    # Very long "words" are treated as leftovers of the HTML/JavaScript that
    # precedes the real text of each document.
    if len(lemma) > max_length:
        return False
    if lemma in stopwords:
        return False
    return lemma[0] not in punctuation and lemma[-1] not in punctuation


def modified_get_parts(thetext, skip_sentences=4):
    """Extract per-sentence nouns and adjectives (as lemmas) from `thetext`.

    Runs of two or more hyphens are first replaced by ', ' so that phrases
    such as "progress--both realized and expected--toward" are split into
    separate words. The text is then parsed with `pattern.en.parse`, and for
    every sentence the lemmas tagged as adjectives or common nouns are
    collected, subject to the filtering in `_is_content_lemma`.

    Parameters
    ----------
    thetext : str
        Raw document text.
    skip_sentences : int, optional
        Number of leading sentences to ignore because they hold page
        boilerplate. Defaults to 4, the number the original hard-coded check
        skipped (its comment said three).

    Returns
    -------
    (nouns, descriptives) : tuple of two lists of lists
        Parallel lists with one entry per retained sentence. A sentence is
        retained only when it produced at least one noun AND at least one
        descriptive, so both lists always have the same length.
    """
    thetext = re.sub(regex1, ', ', thetext)
    nouns = []
    descriptives = []
    parsed_sentences = parse(thetext, tokenize=True, lemmata=True).split()
    for i, sentence in enumerate(parsed_sentences):
        if i < skip_sentences:
            continue

        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            # token[1] is the part-of-speech tag, token[4] the lemma.
            tag, lemma = token[1], token[4]
            if not _is_content_lemma(lemma):
                continue
            if tag in _ADJECTIVE_TAGS:
                sentence_descriptives.append(lemma)
            elif tag in _NOUN_TAGS:
                sentence_nouns.append(lemma)

        # Keep the sentence only if it contributes to both lists.
        if sentence_nouns and sentence_descriptives:
            nouns.append(sentence_nouns)
            descriptives.append(sentence_descriptives)
    return nouns, descriptives

In [175]:
# Quick check of the comma rule: a comma not followed by a digit gains a trailing space.
thetext = re.sub('[,]+(?![0-9])', ', ', "hello,right")
print(thetext)  # parenthesised so the cell is valid on both Python 2 and Python 3

hello, right


We try modified_get_parts on the text below to see if it accurately removes the initial HTML code.

In [176]:
# Sample input modelled on the start of the July 30, 2014 minutes page:
# analytics script, navigation links, then one real sentence.
modified_get_parts("\n\n\n\n\n\nFRB: FOMC Minutes, July 30, 2014\n\n\n\n\n\n\n\n\n  (function(i,s,o,g,r,a,m){i[\'GoogleAnalyticsObject\']=r;i[r]=i[r]||function(){\n  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n  })(window,document,\'script\',\'//www.google-analytics.com/analytics.js\',\'ga\');\n  ga(\'create\', \'UA-35121701-1\', \'federalreserve.gov\'. {\'cookieExpires\': 0});\n.  ga(\'set\', \'anonymizeIp\', true);\n  ga(\'send\', \'pageview\');\n\n\n\nskip to main navigation\nskip to secondary navigation\nskip to content\n\n\n\n\nWhat\'s. In determining how long to maintain this target range, the Committee will assess progress--both realized and expected--toward its objectives of maximum employment and 2 percent inflation.")

([[u'skip', u'navigation', u'skip', u'navigation', u'skip'],
  [u'target',
   u'range',
   u'progress',
   u'objective',
   u'employment',
   u'percent',
   u'inflation']],
 [[u'main', u'secondary'], [u'long', u'maximum']])

We load in the fomc_mins dictionary that we created in the Scraping.ipynb iPython Notebook.

In [184]:
# Synthetic input: five short leading fragments, then three sentences that each
# contain both nouns and adjectives.
modified_get_parts("S1. s2. s3. s4. s6. Sentence 5 is great. Restaurant is short town and high. Restaurant is hot and near the car.")

([[u'sentence'], [u'town'], [u'car']],
 [[u'great'], [u'short', u'high'], [u'hot']])

In [133]:
# Read the scraped minutes back from disk; entries are looked up by date string
# (e.g. '20140730') in the cells below.
with open("fomc_mins.json", "rb") as minutes_file:
    fomc_mins = json.load(minutes_file)

We can now check how our modified_get_parts function would deal with a sample FOMC statement.

In [136]:
# Run the extractor on one full document (the July 30, 2014 minutes).
modified_get_parts(fomc_mins['20140730'])

([[u'member'],
  [u'adviser', u'governor'],
  [u'governor'],
  [u'subcommittee',
   u'work',
   u'subcommittee',
   u'frame',
   u'discussion',
   u'range',
   u'communication',
   u'issue'],
  [u'development',
   u'session',
   u'governor',
   u'manager',
   u'development',
   u'market'],
  [u'manager',
   u'market',
   u'operation',
   u'period',
   u'outcome',
   u'test',
   u'operation',
   u'result',
   u'agreement',
   u'exercise',
   u'effect',
   u'bank',
   u'policy',
   u'action',
   u'yield',
   u'portion',
   u'portfolio'],
  [u'addition',
   u'manager',
   u'plan',
   u'pilot',
   u'program',
   u'number',
   u'counterparty',
   u'agency',
   u'security',
   u'operation',
   u'firm',
   u'dealer'],
  [u'vote', u'transaction', u'period'],
  [u'intervention', u'operation', u'currency', u'account', u'period'],
  [u'participant',
   u'discussion',
   u'issue',
   u'normalization',
   u'stance',
   u'policy',
   u'intention',
   u'information',
   u'public',
   u'year',
   u'pa

In [None]:
# Abandoned attempt, kept for reference. fomc_mins comes from json.load and is indexed
# by string key, so it is presumably a plain dict (no .map method) -- TODO confirm and
# remove. The explicit loop in the next cell builds fomc_parts instead.
# fomc_parts = fomc_mins.map(lambda r: modified_get_parts(r))

In [141]:
%%time
# Parse every document once and cache the (nouns, descriptives) result under its key.
fomc_parts = {
    key: modified_get_parts(minutes_text)
    for key, minutes_text in fomc_mins.items()
}

Wall time: 43.7 s


In [179]:
# Raw scraped text of the same document, to see what the parser receives
# (script and navigation boilerplate included).
fomc_mins['20140730']

u'\n\n\n\n\n\nFRB: FOMC Minutes, July 30, 2014\n\n\n\n\n\n\n\n\n  (function(i,s,o,g,r,a,m){i[\'GoogleAnalyticsObject\']=r;i[r]=i[r]||function(){\n  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n  })(window,document,\'script\',\'//www.google-analytics.com/analytics.js\',\'ga\');\n  ga(\'create\', \'UA-35121701-1\', \'federalreserve.gov\', {\'cookieExpires\': 0});\n  ga(\'set\', \'anonymizeIp\', true);\n  ga(\'send\', \'pageview\');\n\n\n\nskip to main navigation\nskip to secondary navigation\nskip to content\n\n\n\n\nWhat\'s New \xb7 \nWhat\'s Next \xb7 \nSite Map \xb7 \nA-Z Index \xb7 \nCareers \xb7 \nRSS \xb7 \nAll Videos \xb7 \nCurrent FAQs \xb7 \nContact Us\n\n\n\nSearch\xa0\xa0Advanced Search\n\n\n\n\n\n\n\n\n\n\nAboutthe Fed\n\n\nNews& Events\n\n\nMonetaryPolicy\n\n\nBankingInformation& Regulation\n\n\nPaymentSystems\n\n\nEconomicResearch& Data\n\n\nConsumer Informa

In [143]:
# Cached result for the same document, as stored by the loop that built fomc_parts.
fomc_parts['20140730']

([[u'member'],
  [u'adviser', u'governor'],
  [u'governor'],
  [u'subcommittee',
   u'work',
   u'subcommittee',
   u'frame',
   u'discussion',
   u'range',
   u'communication',
   u'issue'],
  [u'development',
   u'session',
   u'governor',
   u'manager',
   u'development',
   u'market'],
  [u'manager',
   u'market',
   u'operation',
   u'period',
   u'outcome',
   u'test',
   u'operation',
   u'result',
   u'agreement',
   u'exercise',
   u'effect',
   u'bank',
   u'policy',
   u'action',
   u'yield',
   u'portion',
   u'portfolio'],
  [u'addition',
   u'manager',
   u'plan',
   u'pilot',
   u'program',
   u'number',
   u'counterparty',
   u'agency',
   u'security',
   u'operation',
   u'firm',
   u'dealer'],
  [u'vote', u'transaction', u'period'],
  [u'intervention', u'operation', u'currency', u'account', u'period'],
  [u'participant',
   u'discussion',
   u'issue',
   u'normalization',
   u'stance',
   u'policy',
   u'intention',
   u'information',
   u'public',
   u'year',
   u'pa

In [187]:
# This is 2 because each entry is the (nouns, descriptives) tuple returned by
# modified_get_parts -- it is not the number of sentences.
len(fomc_parts['20140730'])

2

In [None]:
# Flatten every document's per-sentence noun lists into one flat list of nouns
# (duplicates are kept).
nvocab = [
    noun
    for key in fomc_mins.keys()
    for nounlist in fomc_parts[key][0]
    for noun in nounlist
]
nvocab