# Sentiment Analysis

In [21]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

### Natural Language Processing

We begin by parsing the text and pre-processing it to prepare it for Latent Dirichlet Allocation (LDA). This step is meant to remove stopwords and identify nouns and adjectives.

In [1]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# Characters treated as punctuation when filtering tokens (a lemma that starts or
# ends with one of these is discarded in modified_get_parts).
# The literal is double-quoted on purpose: written with single quotes, the inner ''
# terminates the string and starts a new one (implicit concatenation), which silently
# drops the apostrophe from the list.
punctuation = list(".,;:!?()[]{}`'\"@#$^&*+-|=~_")

In [2]:
from sklearn.feature_extraction import text 
# English stop-word collection shipped with scikit-learn; modified_get_parts
# discards any lemma that is a member of it.
stopwords=text.ENGLISH_STOP_WORDS

### Financial Vocabulary and Adjectives

**TODO: This section is to be changed — we are going to do this for the Naive Bayes part.**
Since our analysis will focus on financial information and so will be heavily reliant on the "sentiments" of vocabulary from this field, we can enhance our analysis by using a financial dictionary. The **Loughran-McDonald 2014 Master Dictionary** is a good tool to use as it includes words that often appear in 10-K documents. The dictionary includes 9 sentiment categories, including "negative", "positive", "uncertainty", "litigious", "modal", and "constraining", among others.


There are two main modifications to getting the parts of speech of the FOMC statements. Firstly, the statements are full of phrases like "growth is expected to continue--given the current data--at a moderate pace". The double hyphens should be treated as a separator between words rather than as part of a word, so we replace each run of hyphens with a comma and a space. Secondly, we must ignore the HTML code at the beginning of the statements.

In [None]:
import re  # this cell comes before the cell that imports re, so import it here to survive Restart & Run All

# For situations like "window,document" => "window, document" becomes two words.
# The negative lookahead (?![0-9]) leaves a comma that is directly followed by a
# digit (e.g. "1,000") untouched.
# NOTE: the next code cell rebinds regex1 to a double-hyphen pattern, so this
# comma pattern is only in effect until that cell runs.
regex1=re.compile('[,]+(?![0-9])')

In [115]:
import re
# Matches a run of two or more consecutive hyphens (e.g. "--"); modified_get_parts
# substitutes ", " for each match before the text is parsed.
# This rebinds the name regex1 used for the comma pattern in the previous cell.
regex1=re.compile(r"\-{2,}")

# Part-of-speech tags (compared against token[1]) that count as adjectives / nouns.
_ADJECTIVE_TAGS = ('JJ', 'JJR', 'JJS')
_NOUN_TAGS = ('NN', 'NNS')


def _keep_lemma(lemma, max_len):
    """Return True when `lemma` passes the shared noun/adjective filter."""
    # Length is checked first so the lemma[0] / lemma[-1] lookups below are
    # never reached for an empty string.
    if not (1 < len(lemma) <= max_len):
        return False
    if lemma in stopwords:
        return False
    return lemma[0] not in punctuation and lemma[-1] not in punctuation


def modified_get_parts(thetext, skip_sentences=1, max_token_len=20):
    """Extract per-sentence nouns and adjectives from a statement.

    Runs of two or more hyphens are first replaced by ", " (via the
    module-level `regex1`), then the text is parsed with `parse` and each
    sentence is scanned token by token.

    Parameters
    ----------
    thetext : str
        Raw text of the statement.
    skip_sentences : int, optional
        Number of leading parsed sentences to ignore. The default of 1
        reproduces the previous behaviour, which skipped only sentence 0
        (the old comment mentioned three sentences, but the code never
        skipped more than the first). Presumably this is where the HTML
        preamble ends up -- confirm against the scraped data.
    max_token_len : int, optional
        Lemmas longer than this are dropped; the default of 20 is the
        cut-off previously hard-coded to filter out long HTML fragments.

    Returns
    -------
    (nouns, descriptives) : tuple of two lists of lists
        Parallel lists with one entry per retained sentence. A sentence is
        retained only when it yields at least one noun AND at least one
        adjective. Lemmas that are stop words, are a single character, or
        start/end with a punctuation character are excluded.
    """
    thetext = re.sub(regex1, ', ', thetext)
    nouns = []
    descriptives = []
    sentences = parse(thetext, tokenize=True, lemmata=True).split()
    for i, sentence in enumerate(sentences):
        if i < skip_sentences:
            continue
        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            # token[1] is the tag tested against the tag tuples above;
            # token[4] is the string that gets filtered and collected
            # (the lemma, given lemmata=True -- per the pattern docs).
            tag, lemma = token[1], token[4]
            if tag in _ADJECTIVE_TAGS:
                if _keep_lemma(lemma, max_token_len):
                    sentence_descriptives.append(lemma)
            elif tag in _NOUN_TAGS:
                if _keep_lemma(lemma, max_token_len):
                    sentence_nouns.append(lemma)
        # Keep the two output lists aligned: drop sentences missing either kind.
        if sentence_nouns and sentence_descriptives:
            nouns.append(sentence_nouns)
            descriptives.append(sentence_descriptives)
    return nouns, descriptives

In [116]:
# Demonstrate the comma rule: a comma not followed by a digit gets a space after it.
thetext=re.sub('[,]+(?![0-9])', ', ', "hello,right")
# Single-argument print(...) prints the same text under Python 2 and also runs on Python 3.
print(thetext)

hello, right


In [117]:
# NOTE(review): get_parts is not defined in any cell visible above (only
# modified_get_parts is), so this call depends on a definition made elsewhere --
# confirm it still exists on a fresh kernel.
get_parts("In determining how long to maintain this target range, the Committee will assess progress--both realized and expected--toward its objectives of maximum employment and 2 percent inflation.")

([[u'target',
   u'range',
   u'progress',
   u'objective',
   u'employment',
   u'percent',
   u'inflation']],
 [[u'long', u'maximum']])

We load in the fomc_mins dictionary that we created in the Scraping.ipynb iPython Notebook.

In [118]:
# Read the scraped FOMC minutes from disk. The loaded object is indexed below by a
# date-like string key (e.g. '20110921') whose value is passed to modified_get_parts.
with open("fomc_mins.json", "rb") as infile:
    fomc_mins = json.load(infile)

In [119]:
# Spot-check on one meeting's minutes: displays the (nouns, descriptives) pair,
# each a list with one inner list per retained sentence.
modified_get_parts(fomc_mins['20110921'])

([[u'meeting', u'governor', u'office', u'governor', u'present'],
  [u'member'],
  [u'development', u'development', u'market', u'period'],
  [u'market',
   u'operation',
   u'reinvestment',
   u'security',
   u'payment',
   u'holding',
   u'agency',
   u'debt',
   u'security'],
  [u'vote', u'transaction', u'period'],
  [u'staff',
   u'presentation',
   u'staff',
   u'presentation',
   u'tool',
   u'policy',
   u'framework',
   u'policy',
   u'accommodation',
   u'recovery'],
  [u'presentation',
   u'option',
   u'size',
   u'composition',
   u'portfolio',
   u'reinvestment',
   u'maturity',
   u'extension',
   u'program',
   u'portfolio',
   u'maturity',
   u'extension',
   u'program',
   u'asset',
   u'purchase',
   u'program'],
  [u'option', u'payment', u'holding', u'agency', u'security', u'security'],
  [u'option',
   u'security',
   u'security',
   u'transaction',
   u'maturity',
   u'portfolio',
   u'size',
   u'balance',
   u'sheet',
   u'level',
   u'reserve',
   u'balance',
   u