# Sentiment Analysis

In [21]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

### Natural Language Processing

We begin by parsing the text and pre-processing it to prepare it for Latent Dirichlet Analysis. This step is meant to remove stopwords and identify nouns and adjectives.

In [1]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

In [2]:
from sklearn.feature_extraction import text 
stopwords=text.ENGLISH_STOP_WORDS

### Financial Vocabulary and Adjectives

**TO CHANGE: WE ARE GOING TO DO THIS FOR NAIVE BAYES PART!!!!!!!!**
Since our analysis will focus on financial information and so will be heavily reliant on the "sentiments" of vocabulary from this field, we can enhance our analysis by using a financial dictionary. The **Loughran-McDonald 2014 Master Dictionary** is a good tool to use as it includes words that often appear in 10-K documents. The dictionary includes 9 sentiment categories, including "negative", "positive", "uncertainty", "litigious", "modal", and "constraining", among others.


In [36]:
import re
regex1=re.compile(r"\.{2,}")

def modified_get_parts(thetext):
    thetext=re.sub(regex1, ' ', thetext)
    nouns=[]
    descriptives=[]
    for i,sentence in enumerate(parse(thetext, tokenize=True, lemmata=True).split()):
        nouns.append([])
        descriptives.append([])
        for token in sentence:
            #print token
            if len(token[4]) >0:
                if token[1] in ['JJ', 'JJR', 'JJS']:
                    if token[4] in stopwords or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    descriptives[i].append(token[4])
                elif token[1] in ['NN', 'NNS']:
                    if token[4] in stopwords or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    if len(token[4]) > 20:
                        continue
                    nouns[i].append(token[4])
    out=zip(nouns, descriptives)
    nouns2=[]
    descriptives2=[]
    for n,d in out:
        if len(n)!=0 and len(d)!=0:
            nouns2.append(n)
            descriptives2.append(d)
    return nouns2, descriptives2

In [37]:
get_parts("In determining how long to maintain this target range, the Committee will assess progress--both realized and expected--toward its objectives of maximum employment and 2 percent inflation.")

([[u'target',
   u'range',
   u'objective',
   u'employment',
   u'percent',
   u'inflation']],
 [[u'long', u'maximum']])

We load in the fomc_mins dictionary that we created in the Scraping.ipynb iPython Notebook.

In [38]:
with open("fomc_mins.json", "rb") as infile:
    fomc_mins = json.load(infile)

In [39]:
fomc_mins['20100127']

u'\n\n\n\n\n\nFRB: FOMC Minutes, January 26-27, 2010\n\n\n\n\n\n\n\n\n  (function(i,s,o,g,r,a,m){i[\'GoogleAnalyticsObject\']=r;i[r]=i[r]||function(){\n  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n  })(window,document,\'script\',\'//www.google-analytics.com/analytics.js\',\'ga\');\n  ga(\'create\', \'UA-35121701-1\', \'federalreserve.gov\', {\'cookieExpires\': 0});\n  ga(\'set\', \'anonymizeIp\', true);\n  ga(\'send\', \'pageview\');\n\n\n\nskip to main navigation\nskip to secondary navigation\nskip to content\n\n\n\n\nWhat\'s New \xb7 \nWhat\'s Next \xb7 \nSite Map \xb7 \nA-Z Index \xb7 \nFAQs \xb7 \nCareers \xb7 \nRSS\n\n\n\nSearch\xa0\xa0Advanced Search\n\n\n\n\n\n\n\n\n\n\nAboutthe Fed\n\n\nNews& Events\n\n\nMonetaryPolicy\n\n\nBankingInformation& Regulation\n\n\nPaymentSystems\n\n\nEconomicResearch& Data\n\n\nConsumer Information\n\n\nCommunityDevelopment\n\n\nRe

In [40]:
get_parts(fomc_mins['20100127'])

([[u'm=s.getelementsbytagname(o)[0];a.async=1;a.src=g;m.parentnode.insertbefore(a,m',
   u'window,document',
   u"script','",
   u"ga('create",
   u'federalreserve.gov',
   u'cookieexpire',
   u'ga',
   u'set',
   u'ga',
   u'pageview'],
  [u'skip', u'navigation', u'skip', u'navigation', u'skip'],
  [u'meeting', u'governor', u'office', u'governor'],
  [u'member'],
  [u'matter',
   u'agenda',
   u'meeting',
   u'advice',
   u'election',
   u'member',
   u'member',
   u'term',
   u'individual',
   u'oath',
   u'office'],
  [u'member', u'member'],
  [u'vote',
   u'officer',
   u'selection',
   u'successor',
   u'meeting',
   u'event',
   u'discontinuance',
   u'connection',
   u'connection'],
  [u'vote', u'addition', u'summary', u'rule', u'access', u'information'],
  [u'vote', u'transaction'],
  [u'vote', u'pleasure', u'selection'],
  [u'review',
   u'authorization',
   u'market',
   u'operation',
   u'currency',
   u'transaction',
   u'dollar',
   u'roll',
   u'transaction',
   u'process