In [120]:
# Not sure which prior to use... Also, it looks like I may be putting the prior into the calculations at the wrong point 
# By the time we get to the end, the prior is too small to make a difference (by several orders of magnitude). Is this right? 

# Regardless of whether I use Indonesia or the Philippines, it looks like the sample will be skewed 
# Is there a standard method for dealing with skewed samples? (In this case, a single set that accounts for 65-75% of the total)
# It seems like the fundamental issue here is likely to be document length (see Frank And Bouckaert)
# Can I use augmented frequency to account for document length a la https://en.wikipedia.org/wiki/Tf%E2%80%93idf 
# Or can I use the constant from the Frank And Bouckaert paper (not sure if they're the same thing...)

# Still to review: 
# NLP for finance 
# Masters thesis 

In [121]:
import csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import nltk
from nltk.corpus import stopwords

In [252]:
# Import master matrix for testing and examine 

master = pd.read_csv('C:/Users/ajarczyk/Dropbox/Programming/BI_Project/Master matrix.csv')
master.head()

Unnamed: 0,Summary Date,Summary file number,Decreased,Same,Increased
0,2/3/2009,1,1,0,0
1,3/20/2009,2,1,0,0
2,4/20/2009,3,1,0,0
3,5/13/2009,4,1,0,0
4,6/9/2009,5,1,0,0


In [253]:
# Examine breakdown for the total sample 

master[['Decreased', 'Same', 'Increased']].sum()

Decreased    10
Same         67
Increased     5
dtype: int64

In [254]:
# Divide the data set for training, testing and examine breakdown 
# Is there a good way to make this replicable? I've tried using random.setstate() but the following indices continue to change: 
#test.index
#train.index

training, testing = train_test_split(master, test_size = 0.2)
training[['Decreased', 'Same', 'Increased']].sum()

Decreased     6
Same         54
Increased     5
dtype: int64

In [255]:
# Calculate the priors of the sample (used to normalize results across uneven samples)

# Training prior 
training_prob_s = float(training['Same'].sum()) / len(training['Same'])
training_prob_m = float(training['Decreased'].sum() + training['Increased'].sum()) / len(training['Same'])
training_prior = np.log(training_prob_s / training_prob_m)

# Sample prior
sample_prob_s = float(master['Same'].sum()) / len(master['Same'])
sample_prob_m = float(master['Decreased'].sum() + master['Increased'].sum()) / len(master['Same'])
sample_prior = np.log(sample_prob_s / sample_prob_m)
sample_prior

# Uniform prior (1/3, 1/3, 1/3)
uniform_prior = np.log((1.0/3) / float(2.0/3))
uniform_prior

-0.69314718055994529

In [256]:
# Divide the training set into two groups for training: 
# A 'steady' group for when interest rates remain steady 
# A 'move' group for when interest rates move (increase or decrease)

training_steady = training[training.Same==1]
training_move = training[training.Same==0]

In [257]:
# Open training files for each training group and combine into single text files
# Feed python the appropriate character set for decoding 

raw_steady = str()
for i in (training_steady.index + 1):
    fname = 'C:/Users/ajarczyk/Dropbox/Programming/BI_Project/BIEngStatement' + str(i) + '.txt'
    temp = open(fname).read()
    raw_steady += temp
raw_steady = raw_steady.decode('utf_16')

raw_move = str()
for i in (training_move.index + 1):
    fname = 'C:/Users/ajarczyk/Dropbox/Programming/BI_Project/BIEngStatement' + str(i) + '.txt'
    temp = open(fname).read()
    raw_move += temp
raw_move = raw_move.decode('utf_16')

In [258]:
# Separate the individual words in the documents

tokens_steady = nltk.word_tokenize(raw_steady)
tokens_move = nltk.word_tokenize(raw_move)

In [259]:
# Make all words lower case
# Remove punctuation, \n's, and numbers 

#tokens_steady = [word.lower() for word in tokens_steady if word.isalpha() and len(word)>2]
#tokens_move = [word.lower() for word in tokens_move if word.isalpha() and len(word)>2]
tokens_steady = [word.lower() for word in tokens_steady if word.isalpha()]
tokens_move = [word.lower() for word in tokens_move if word.isalpha()]

In [260]:
# Remove stopwards

stopwords = nltk.corpus.stopwords.words('english')
tokens_steady = [word for word in tokens_steady if word not in stopwords]
tokens_move = [word for word in tokens_move if word not in stopwords]

In [261]:
# Count the occurrences of each word 

fdist_steady = nltk.FreqDist(tokens_steady)
fdist_move = nltk.FreqDist(tokens_move)

In [262]:
# Create an index for the data table 

vocabulary_steady = fdist_steady.keys()
vocabulary_move = fdist_move.keys()

In [263]:
# Create a data table for the word count 

df_steady = pd.Series(fdist_steady, index=vocabulary_steady)
df_move = pd.Series(fdist_move, index=vocabulary_move)

In [264]:
# Add one to every count to deal with rare words via "additive smoothing"

df_steady = df_steady + 1 
df_move = df_move + 1 

In [265]:
# Divide both data tables by the largest frequency value to account for differing sample sizes 

df_steady = np.log(df_steady / float(df_steady.max()))
df_move = np.log(df_move / float(df_move.max()))

In [266]:
# Calculate the sample probability for each word 

df_steady = df_steady / sum(df_steady)
df_move = df_move / sum(df_move)

In [267]:
# Log the values to account for floating point underflow 

df_steady = np.log(df_steady)
df_move = np.log(df_move)

In [268]:
# Examine the training sets (1)

df_steady.sort_values(inplace=True)
df_steady.describe

<bound method Series.describe of economic             -inf
inflation      -15.743387
growth         -12.046265
indonesia      -11.852366
bank           -10.770829
global         -10.372338
financial       -9.999111
policy          -9.929142
rate            -9.835534
rupiah          -9.805059
domestic        -9.784922
economy         -9.710555
capital         -9.686126
prices          -9.604173
yoy             -9.585103
government      -9.490657
account         -9.429826
market          -9.415823
well            -9.392501
also            -9.387838
expected        -9.289814
line            -9.271075
investment      -9.242892
credit          -9.214594
foreign         -9.205132
us              -9.200394
imports         -9.195653
performance     -9.195653
stability       -9.176640
continue        -9.162328
                  ...    
disburse        -7.639981
downwards       -7.639981
encompassing    -7.639981
bolsterd        -7.639981
rent            -7.639981
locations       -7.639981
publi

In [269]:
# Examine the training sets (2)

df_move.sort_values(inplace=True)
df_move.describe

<bound method Series.describe of economic              -inf
indonesia       -11.111224
bank            -10.647527
inflation       -10.574062
growth          -10.210022
global          -10.062205
financial        -9.213181
rate             -9.186834
economy          -9.160819
domestic         -8.986646
policy           -8.939004
capital          -8.710373
banking          -8.688136
prices           -8.643876
exchange         -8.621832
rupiah           -8.621832
pressure         -8.599832
credit           -8.534000
also             -8.512078
stability        -8.468203
monetary         -8.446229
government       -8.402148
market           -8.335511
system           -8.335511
yoy              -8.313108
line             -8.267928
performance      -8.267928
trend            -8.267928
inflationary     -8.245119
level            -8.245119
                   ...    
booked           -7.108022
develop          -7.108022
intervention     -7.108022
mean             -7.108022
fasting          -7.10

In [271]:
# Open files for the testing group and combine into a single text file
# Feed python the appropriate character set for decoding 
# Separate the individual words in the testing document
# Make all words lower case
# Remove punctuation, \n's, numbers, and words with less than 3 letters
# Create a data frame for the testing set 
# Calculate probabilities for test set from training sets 

for i in (testing.index + 1):
    fname = 'C:/Users/ajarczyk/Dropbox/Programming/BI_Project/BIEngStatement' + str(i) + '.txt'
    temp = open(fname).read().decode('utf_16')
    tokens = nltk.word_tokenize(temp)
    tokens = [word.lower() for word in tokens if word.isalpha() and len(word)>2]
    df_testing = pd.Series(tokens)    
    p_steady = df_testing.map(df_steady).sum() 
    p_move = df_testing.map(df_move).sum()
    #print "P-Steady for doc " + str(i) + " is", p_steady
    #print "P-Move for doc " + str(i) + " is", p_move
    if (p_steady - p_move) > 0: 
        print "After doc " + str(i) + " interest rates should be steady." 
    else:
        print "After doc " + str(i) + " interest rates should move." 

After doc 25 interest rates should move.
After doc 59 interest rates should move.
After doc 67 interest rates should move.
After doc 6 interest rates should move.
After doc 14 interest rates should move.
After doc 30 interest rates should move.
After doc 20 interest rates should move.
After doc 1 interest rates should move.
After doc 15 interest rates should move.
After doc 72 interest rates should move.
After doc 50 interest rates should move.
After doc 21 interest rates should move.
After doc 32 interest rates should move.
After doc 11 interest rates should move.
After doc 70 interest rates should move.
After doc 48 interest rates should move.
After doc 19 interest rates should move.
