In [45]:
class McDonald_Word_List:
    """Container for the positive/negative sentiment word lists and their tallies.

    Attributes:
        pos_words: the positive-word collection passed in (stored as given).
        neg_words: the negative-word collection passed in (stored as given).
        pos_word_counts: per-positive-word counter, initialised to 0.
        intersection_pos: second per-positive-word counter, initialised to 0.
        neg_word_counts: per-negative-word counter, initialised to 0.
        intersection_neg: second per-negative-word counter, initialised to 0.
    """

    def __init__(self, pos_words, neg_words):
        """Store both word collections and create zeroed counters keyed by word.

        Args:
            pos_words: iterable of positive words (a dict keyed by word works).
            neg_words: iterable of negative words (a dict keyed by word works).
        """
        self.pos_words = pos_words
        self.neg_words = neg_words
        # Each counter is its own dict so they can be incremented independently.
        self.pos_word_counts = dict.fromkeys(pos_words, 0)
        self.intersection_pos = dict.fromkeys(pos_words, 0)
        self.neg_word_counts = dict.fromkeys(neg_words, 0)
        self.intersection_neg = dict.fromkeys(neg_words, 0)

    def __str__(self):
        """Return a two-line summary with the size of each word list.

        The summary is returned rather than printed: __str__ must return a str,
        otherwise str(obj) and print(obj) raise TypeError.
        """
        return """Pos words: {0}
                 Neg words: {1}""".format(
                  len(self.pos_words),
                  len(self.neg_words))

We will now read Bill McDonald's Excel file containing the master dictionary of financial sentiment words.
For this task, I am using the xlrd library. For now, I am only reading the cell values that have words with positive or negative sentiment.

In [46]:
from xlrd import open_workbook
# Build the positive/negative word dictionaries from the McDonald workbook.
# Column 0 of each data row holds the word; row 0 holds the column names.
FORMAT = ['Positive', 'Negative']

# NOTE(review): xlrd 2.x only reads .xls files; opening this .xlsx presumably
# requires xlrd < 2.0 -- confirm the pinned version.
wb = open_workbook('McDonaldDict.xlsx')

values = []  # one list of (column name, cell value) pairs per data row
# Created once, before the sheet loop, so words from every sheet are kept and
# the names exist even when the workbook contains no sheets.
pos = {}
neg = {}
for s in wb.sheets():
    if s.nrows == 0:
        continue  # an empty sheet has no header row to read
    # The header row is the same for every data row, so read it once per sheet.
    col_names = s.row(0)[1:]
    for row in range(1, s.nrows):
        col_value = []
        word = s.cell(row, 0).value
        for name, col in zip(col_names, range(1, s.ncols)):
            value = s.cell(row, col).value
            # A word belongs to a list when its 'Positive'/'Negative' cell is > 0;
            # the integer cell value itself is stored as the dict value.
            if name.value == 'Positive' and int(value) > 0:
                pos[word] = int(value)
            elif name.value == 'Negative' and int(value) > 0:
                neg[word] = int(value)
            col_value.append((name.value, value))
        values.append(col_value)
mcd = McDonald_Word_List(pos, neg)
print(mcd.pos_words)

{'COMPLIMENTED': 2009, 'ATTRACTIVE': 2009, 'SUCCEED': 2009, 'DISTINCTIVELY': 2009, 'PROSPERED': 2009, 'ACHIEVES': 2009, 'PROFICIENTLY': 2009, 'ENABLED': 2009, 'FAVORITES': 2009, 'PROACTIVE': 2009, 'INTEGRITY': 2009, 'ENCOURAGES': 2009, 'COLLABORATED': 2009, 'COMPLIMENTS': 2009, 'ENHANCEMENTS': 2009, 'COLLABORATIONS': 2009, 'FAVORABLE': 2009, 'PLEASURE': 2009, 'PROGRESS': 2009, 'EXCELLENT': 2009, 'GREATLY': 2009, 'PLENTIFUL': 2009, 'IMPROVE': 2009, 'REVOLUTIONIZES': 2009, 'ADVANCES': 2009, 'EMPOWERED': 2009, 'BREAKTHROUGH': 2009, 'ENTHUSIASTIC': 2009, 'ACHIEVE': 2009, 'EXCLUSIVELY': 2009, 'INNOVATIVENESS': 2011, 'PREEMINENCE': 2009, 'UPTURN': 2009, 'INCREDIBLE': 2009, 'IMPROVEMENT': 2009, 'BETTER': 2009, 'DELIGHTS': 2009, 'ENHANCING': 2009, 'UNPARALLELED': 2009, 'DELIGHTING': 2009, 'EMPOWERING': 2009, 'SUCCEEDING': 2009, 'EFFICIENCIES': 2009, 'HONORED': 2009, 'STRONG': 2009, 'WINNING': 2009, 'STABILITY': 2009, 'PRESTIGIOUS': 2009, 'COMPLIMENTARY': 2009, 'ASSURES': 2009, 'PRESTIGE': 2009

Voila. We have our lists of positive and negative words generated. These words were annotated for the financial domain and will be what we use to analyze our pointwise mutual information across the corpus.

In [5]:
import re
import sys

We will use the method below to extract header information from the news article. Additionally, we get rid of the miscellaneous header text in order to extract text more precisely.

In [30]:
# Article header: four "--"-introduced fields followed by the "Reuters) -" marker.
# Raw string so that \) and \s reach the regex engine instead of being treated
# as (invalid) Python string escapes.
_HEADER_PATTERN = re.compile(r'--(.+?)--(.+?)--(.+?)--(.+?)Reuters\)\s-', flags=re.DOTALL)


def extract_header(text):
    """Split a raw article into its header fields and the remaining text.

    Args:
        text: full contents of one article file.

    Returns:
        A tuple (title, author, date, link, text). The first four items are the
        four captured header groups, unstripped; the last is the input with
        every match of the header pattern removed.

    Raises:
        ValueError: if the header pattern does not occur in ``text``.
    """
    search = _HEADER_PATTERN.search(text)
    if search is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no Reuters header found in article text')
    title, author, date, link = search.groups()
    text = _HEADER_PATTERN.sub('', text)
    return title, author, date, link, text

In [31]:
import os

# Root of the Reuters corpus: a 'reuters' directory inside the working directory.
path = os.path.join(os.getcwd(), 'reuters')

# List the date folders from the same root the files are later opened from,
# instead of a hardcoded absolute home-directory path; keep the first ten.
reuters_folders = os.listdir(path)[0:10]

# Parsed articles are collected here by the loading cell.
articles = []

In [10]:
# Show which corpus folders were selected for processing.
print(reuters_folders)

['20090106', '20071103', '20081209', '20090908', '20130619', '20110530', '20070328', '20121009', '20081101', '20130719']


In [37]:
# Read every file in the selected folders and parse it into an article dict.
# The list is reset here so that re-running this cell does not append a second
# copy of every article to the existing list.
articles = []
count = 0  # number of files read
for folder in reuters_folders:
    folder_path = os.path.join(path, folder)
    article_files = os.listdir(folder_path)
    for file in article_files:
        count += 1
        with open(os.path.join(folder_path, file), encoding='utf-8') as f:
            raw_text = f.read()
        # Parse after the file is closed; only the read needs the handle open.
        title, author, date, link, text = extract_header(raw_text)
        articles.append({'title' : title, 'author' : author, 'date' : date, 'link' : link, 'text' : text})
            



KeyboardInterrupt: 

In [38]:
# Print every parsed article dict in full (this output can be very large).
for article in articles:
    print(article)

{'text': ' Stocks gained on Tuesday on the increased likelihood of a government stimulus package after the release of minutes from the last Federal Reserve policy meeting painted a dismal picture of the U.S. economy. \n\n Investors bet technology stocks would benefit from President-elect Barack Obama\'s proposed economic plan that would include the largest U.S. infrastructure investment since the 1950s. Microsoft ( MSFT.O ) added 1.2 percent to $20.76 after the software maker said it sold 28 million units worldwide of its Xbox 360 video game console through the end of 2008, extending the Xbox\'s lead over rival Sony Corp\'s ( 6758.T )( SNE.N ) PlayStation 3. The Federal Reserve, in minutes from its December 15-16 meeting, warned of uncomfortably low levels of inflation and said the economic outlook will be weak for some time. "There is a little bit of a honeymoon period with the ushering in of the new calendar year, people are anticipating bold initiatives in the stimulus package," sai

We use the code above to open our Reuters folder and read the article files from each directory. The data used here comes from the [financial news corpus](https://github.com/philipperemy/financial-news-dataset) repository. It is pretty great. 

In [9]:
import nltk

In [10]:
from nltk.chunk import conlltags2tree, tree2conlltags

# Small demonstration of NLTK's named-entity chunker on a single sentence.
sentence = "Mark and John are working at Google."

# Pipeline per sentence: word-tokenize -> POS-tag -> NE-chunk.
for sent in nltk.sent_tokenize(sentence):
   for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
      # Only items that carry a label are printed, as the label followed by
      # the first element of each of the item's members joined with spaces.
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

PERSON Mark
PERSON John
ORGANIZATION Google


Above is some example code from nltk's NE chunker/tagger. It works quite well for our purposes in this PMI task.

Below is where I get into the meat of calculating the PMI. 

$$pmi(x,y) = \log\frac{p(x,y)}{p(x)p(y)}$$

Usually we define p(x,y) as the probability of two entities occurring together within some window. For the purposes of this experiment, I am defining windows as sentences. Therefore, the equation we arrive at for calculating PMI is:

$$pmi(x,y) = \log\frac{count(x,y)_{D}}{count(x)_{D}count(y)_{D}}$$

Here $D$ is the set of all documents in the Reuters corpus, $x$ is an occurrence of a polarity word (positive words when calculating positive PMI, negative words when calculating negative PMI), and $y$ is an occurrence of an organization. 

Each article is looped through in order to build the overall counts of words in order to count PMIs.

Additionally, we store the counts of all words as they relate to organizations.

In [42]:
import math
import nltk

In [40]:
def num_words(sentences):
    """Count tokens and polarity words across a list of sentences.

    Each sentence is tokenized once with nltk.word_tokenize. A token counts as
    positive when its upper-cased form is in mcd.pos_words, otherwise as
    negative when it is in mcd.neg_words.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A tuple (l, pos_count, neg_count): the total number of tokens, the
        number of positive tokens and the number of negative tokens.
    """
    l = 0
    pos_count = 0
    neg_count = 0
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        # Count words (tokens), not characters: len(s) on the raw sentence
        # string would add its character length instead.
        l += len(tokens)
        for word in tokens:
            if word.upper() in mcd.pos_words:
                pos_count += 1
            elif word.upper() in mcd.neg_words:
                neg_count += 1
    return l, pos_count, neg_count

In [47]:
# For every article, print the article dict followed by its polarity-word counts.
for article in articles:
    print(article)
    # num_words expects a list of sentences, so split the article text first.
    l, pos_count, neg_count = num_words(nltk.sent_tokenize(article['text']))
    print('{} positive, {} negative'.format(pos_count,neg_count))

    print('\n\n')

{'text': ' Stocks gained on Tuesday on the increased likelihood of a government stimulus package after the release of minutes from the last Federal Reserve policy meeting painted a dismal picture of the U.S. economy. \n\n Investors bet technology stocks would benefit from President-elect Barack Obama\'s proposed economic plan that would include the largest U.S. infrastructure investment since the 1950s. Microsoft ( MSFT.O ) added 1.2 percent to $20.76 after the software maker said it sold 28 million units worldwide of its Xbox 360 video game console through the end of 2008, extending the Xbox\'s lead over rival Sony Corp\'s ( 6758.T )( SNE.N ) PlayStation 3. The Federal Reserve, in minutes from its December 15-16 meeting, warned of uncomfortably low levels of inflation and said the economic outlook will be weak for some time. "There is a little bit of a honeymoon period with the ushering in of the new calendar year, people are anticipating bold initiatives in the stimulus package," sai

{'text': ' Bob McCann, head of brokerage at Merrill Lynch & Co, announced his plans to leave the securities firm, just days after its acquisition by Bank of America ( BAC.N ) was completed. \n\n On Monday, John Thain, the former Merrill chief executive who is now president of the Bank of America\'s global banking, securities and wealth management, announced McCann\'s plans to staff in a memo obtained by Reuters. A Merrill Lynch spokeswoman confirmed the memo. McCann, a 26-year veteran of Merrill who headed Merrill\'s global wealth management since 2003, told Thain "this is the right time for me to move on," according to the memo. Bank of America\'s acquisition of Merrill closed on Jan 1. (Reporting by  Steve James ,  Elinor Comlay , and  Phil Wahba ; Editing by  Andre Grenon ; Editing by Anshuman Daga)', 'date': ' Mon Jan 5, 2009 11:51pm EST\n', 'title': ' Merrill brokerage chief McCann to leave\n', 'author': ' \n', 'link': ' http://www.reuters.com/article/2009/01/06/us-merrill-mccann-

1 positve, 18 negative



{'text': ' Time Warner Cable extended a deal to carry CBS Corp\'s television stations, CBS said on Tuesday, avoiding what could have been another heated industry dispute. \n\n CBS did not disclose details of the agreement, but it comes just days after its sister company Viacom Inc threatened to pull its cable channels from Time Warner\'s 13 million homes because of a disagreement over a separate deal. The Time Warner-Viacom dispute was eventually resolved but underscored the rising tensions between networks, which provide programing, and cable operators, which transmit that programing into living rooms around the country. CBS Chief Executive Les Moonves has made new broadcast deals a priority, specifically insisting that he wants cable operators to pay CBS for the right to carry its stations in what are known as retransmission deals. In the past, deals involving cash payments were rare. Moonves declined to comment on the particulars of the deal with Time Warne

{'text': ' Bob McCann, head of brokerage at Merrill Lynch & Co, announced his plans to leave the securities firm, just days after its acquisition by Bank of America ( BAC.N ) was completed. \n\n On Monday, John Thain, the former Merrill chief executive who is now president of the Bank of America\'s global banking, securities and wealth management, announced McCann\'s plans to staff in a memo obtained by Reuters. A Merrill Lynch spokeswoman confirmed the memo. McCann, a 26-year veteran of Merrill who headed Merrill\'s global wealth management since 2003, told Thain "this is the right time for me to move on," according to the memo. Bank of America\'s acquisition of Merrill closed on Jan 1. (Reporting by  Steve James ,  Elinor Comlay , and  Phil Wahba ; Editing by  Andre Grenon ; Editing by Anshuman Daga)', 'date': ' Mon Jan 5, 2009 11:51pm EST\n', 'title': ' Merrill brokerage chief McCann to leave\n', 'author': ' \n', 'link': ' http://www.reuters.com/article/2009/01/06/us-merrill-mccann-

3 positve, 26 negative



{'text': ' Federal regulators are preparing a rescue plan to shore up the finances of some large credit unions, using billions of dollars in new government borrowings, the Wall Street Journal reported. \n\n The plan is not a taxpayer-funded bailout, but a short-term "mechanism to stabilize the credit-union system" while regulators work on other steps, to be announced early in 2009, Michael Fryzel, chairman of the National Credit Union Administration (NCUA) told the paper. A related program also to be announced by NCUA will provide as much as $2 billion in inexpensive loans to credit unions, which the institutions can use to reduce mortgage interest rates for homeowners, the paper said. Fryzel told the paper he does not know how much new federal borrowing the two programs would entail. Funding for the loan programs will come through the Treasury Department. NCUA and the Treasury Department could not be immediately reached for comment. Credit unions are non-prof




{'text': ' The slumping U.S. housing, factory and service sectors produced more misery for the world\'s largest economy in the last two months of 2008 as the year-old recession looked set to drag on into 2009, data showed on Tuesday. \n\n The Federal Reserve, in minutes of its December interest rate meeting, did nothing to dispel worries over the economy. Fed officials believed the U.S. economy would face "substantial" risks even after benchmark interest rates were cut to near zero, with some worrying about the risk of deflation, the minutes showed. In the housing market, the original source of the U.S. economic morass, pending sales of existing U.S. homes plunged to their lowest in at least seven years in November, according to data from a real estate industry group. The service sector, which represents about 80 percent of U.S. economic activity, contracted for a third straight month in December, the Institute for Supply Management said in a separate statement. Though the slump in 

{'text': ' American International Group Inc ( AIG.N ) stockholder and former chief executive Maurice "Hank" Greenberg said on Friday he was considering "strategic alternatives" for the world\'s largest insurer. \n\n Greenberg, ousted by AIG\'s board in 2005 during an investigation into fraud by then New York attorney general Eliot Spitzer, said in a regulatory filing that he anticipated holding discussions with other shareholders and third parties that would "improve ... the value of their investment." As of October, Greenberg owned or controlled through his companies more than 300 million shares of AIG, or nearly 12 percent, according to Reuters data. Greenberg, who has extensive links with Chinese state-owned entities and government officials, said in the filing with the U.S. Securities and Exchange Commission that he and parties involved with him had not yet made any decisions regarding their intentions. The filing named Edward Matthews, a former executive of AIG, and Starr Internat




{'text': ' Oil prices fell nearly 4 percent on Tuesday after the U.S. government forecast the world economic slowdown would shrink global oil consumption this year for the first time since the early 1980s. \n\n U.S. crude fell $1.71, or 3.91 percent, to $42.00 a barrel at 2:30 p.m. EST, after hitting a session low of $41.95. London Brent fell $2.02 to $41.40 a barrel. The U.S. Energy Information Administration said in its monthly energy outlook it expected global oil demand to fall by 50,000 barrels per day in 2008 and 450,000 bpd in 2009 -- marking the first time since 1983 that year-to-year world oil demand has dropped. The lower forecast came as the EIA revised its 2009 world GDP growth estimate to 0.5 percent, down from last month\'s estimate of 1.8 percent. The EIA estimates 2008 GDP growth will end up at 2.7 percent. "The EIA forecast is overly optimistic. I expect a significant contraction in demand, given the current state of the global economy," said Tom Knight, a trader at

In [None]:
def compute_PMI(class1, class2, int_c1c2, overall_count):
    """Return the add-one-smoothed pointwise mutual information of two classes.

    Computes log(p(x, y) / (p(x) * p(y))) where each probability is
    (count + 1) / overall_count.

    Args:
        class1: number of occurrences of the first class.
        class2: number of occurrences of the second class.
        int_c1c2: number of co-occurrences of the two classes.
        overall_count: total count used to turn counts into probabilities;
            must be non-zero.

    Returns:
        The natural-log PMI as a float.
    """
    # Parenthesise (count + 1) before dividing: without the parentheses,
    # `count + 1/overall_count` adds a tiny fraction to the raw count and the
    # +1 smoothing is never applied.
    p_joint = (int_c1c2 + 1) / overall_count
    p_class1 = (class1 + 1) / overall_count
    p_class2 = (class2 + 1) / overall_count
    return math.log(p_joint / (p_class1 * p_class2))

In [None]:
import pandas as pd
import numpy as np

# Corpus-wide running totals, all starting at zero.
length = overall_pos = overall_neg = overall_org = 0
intersection_pos = intersection_neg = 0

# One row per polarity word and no columns yet.
pos_df = pd.DataFrame(0, index=list(map(str, mcd.pos_words)), columns=[])
neg_df = pd.DataFrame(0, index=list(map(str, mcd.neg_words)), columns=[])

These pandas dataframes allow us to represent co-occurrences of organizations and polarity words as a matrix. Initially, each dataframe is a zero-column matrix whose row index holds the words from the positive or negative financial polarity list.

In [None]:
# Build corpus-wide counts and the word-by-organization co-occurrence matrices
# from the first 1000 articles, using one sentence as the co-occurrence window.
for article in articles[:1000]:
    # Articles are dicts, so the body is read with a key lookup.
    sentences = nltk.sent_tokenize(article['text'])
    tmpL, tmp_pos, tmp_neg = num_words(sentences)
    length += tmpL
    overall_pos += tmp_pos
    overall_neg += tmp_neg
    for sent in sentences:
        org_count = 0
        pos_count = 0  # (positive word, organization) pairs in this sentence
        neg_count = 0  # (negative word, organization) pairs in this sentence
        org_list = []
        chunks = list(nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))))

        # Pass 1: collect the organizations named in this sentence and make
        # sure each one has a column in both matrices.
        for chunk in chunks:
            if hasattr(chunk, 'label') and str(chunk.label()) == 'ORGANIZATION':
                # Join the words of the chunk to get the organization name;
                # str(chunk[0]) would be the repr of a (word, tag) tuple.
                org = ' '.join(leaf[0] for leaf in chunk).upper()
                org_count += 1
                overall_org += 1
                org_list.append(org)
                if org not in pos_df.columns:
                    pos_df[org] = np.zeros(len(pos_df.index))
                    neg_df[org] = np.zeros(len(neg_df.index))

        # Pass 2: pair every polarity word with every organization in the
        # sentence. org_list is iterated, not popped, so it is still intact
        # for the next polarity word in the same sentence.
        for chunk in chunks:
            if hasattr(chunk, 'label'):
                continue  # labelled chunks are entities, not single words
            word = str(chunk[0]).upper()
            if word in mcd.pos_words:
                for org in org_list:
                    pos_count += 1
                    pos_df.at[word, org] += 1
                    mcd.intersection_pos[word] += 1
                mcd.pos_word_counts[word] += 1
            elif word in mcd.neg_words:
                for org in org_list:
                    neg_count += 1
                    neg_df.at[word, org] += 1
                    mcd.intersection_neg[word] += 1
                mcd.neg_word_counts[word] += 1
        intersection_pos += min(org_count, pos_count)
        intersection_neg += min(org_count, neg_count)

# Use the corpus-wide `length` accumulated above as the overall count.
print(compute_PMI(overall_pos, overall_org, intersection_pos, length))
print(compute_PMI(overall_neg, overall_org, intersection_neg, length))
print(pos_df)


            
                
        
              #print(chunk.label(), ' '.join(c[0] for c in chunk))
    #create_co_occurrence_matrix(sentences)
    

%%latex

Here is the above algorithm:

\begin{enumerate}
    \item Initialize two empty Pandas dataframes, one for the positive words and one for the negative words.
    \item Loop through each article:
    \begin{enumerate}
        \item Split the article into sentences using NLTK. For each sentence:
            \begin{enumerate}
                \item Initialize an organization word count, a positive word count, a negative word count, and an empty list of orgs.
                \item Chunk the sentence using the NLTK NER chunker. Loop through each chunk and append the organization to the org list. If the organization is not present in the columns of the dataframe, insert a new column into the dataframe. 
                \item Loop through the chunks a second time. If the chunked word is in the positive (or negative, conversely) words dict, create a temporary organization list and increment the counts of the positive (negative) count (representing the number of co-occurrences in a sentence) and increment the index in the pandas dataframe corresponding to that positive or negative word.
            \end{enumerate}
    \end{enumerate}
\end{enumerate}

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def _plot_top_words(labels, heights, title, ylabel):
    """Draw one bar per word, with the words as rotated x-axis tick labels."""
    positions = range(len(labels))
    plt.figure(figsize=(20, 3))  # width:20, height:3
    plt.bar(positions, heights, align='edge', width=.3)
    plt.xticks(positions, labels, rotation=70)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.show()


# (word, count) pairs for the positive words, most frequent first.
sorted_counts = sorted(mcd.pos_word_counts.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_counts)
print(sorted_counts[50][0])
print(mcd.intersection_pos[sorted_counts[50][0]])
# NOTE(review): the two lines above inspect rank 50 while this one uses rank
# 100 -- confirm whether the same word was intended.
print(compute_PMI(sorted_counts[100][1], overall_org, mcd.intersection_pos[sorted_counts[100][0]], length))

top_counts = sorted_counts[0:20]
top_words = [word for word, _ in top_counts]

# Raw frequency of the 20 most frequent positive words.
_plot_top_words(top_words, [count for _, count in top_counts],
                'Most frequent positive words', 'count')

# PMI of each of those words with organizations, using the corpus-wide
# `length` as the overall count.
PMIs = [compute_PMI(count, overall_org, mcd.intersection_pos[word], length) for word, count in top_counts]

_plot_top_words(top_words, PMIs,
                'PMI with organizations for the most frequent positive words', 'PMI')