In [1]:
import re

# Open ~ 1000 words about the great barrier reef

In [2]:
with open('greatbarrier.txt','r') as f:
    text = f.read()

# the most basic possible tokenization

In [3]:
tokens = re.sub('[^a-zA-Z ]+','',text.lower()).split()

In [4]:
len(tokens)

871

In [5]:
counts = {}
for t in tokens:
    if t in counts:
        counts[t]+=1
    else:
        counts[t]=1

# take the largest last few
we do this by sorting the list in reverse count order

In [6]:
sorted([(k,v) for k,v in counts.items()],key=lambda x: x[1], reverse=True)[:10]

[('the', 77),
 ('of', 38),
 ('in', 23),
 ('reef', 23),
 ('to', 20),
 ('and', 18),
 ('reefs', 17),
 ('great', 14),
 ('barrier', 13),
 ('years', 12)]

at this stage, we still clearly have some work to do: we have a bunch of 'boring' words the, of, in etc and we have 'reefs' and 'reef' as two separate words - to note two really basic problems

In [7]:
import nltk

In [8]:
#help(nltk.tokenize)

In [9]:
# could also do this
nltk_tokens=nltk.word_tokenize(text)

# lets remove stopwords

In [10]:
from nltk.corpus import stopwords

In [11]:
stopwords =  stopwords.words('english')
print stopwords

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [12]:
better_counts = {}
for t in tokens:
    if t in stopwords:
        # ignore this word and go to next elem in loop
        continue
    elif t in better_counts:
        better_counts[t]+=1
    else:
        better_counts[t]=1

# how much better have we done?

In [13]:
sorted([(k,v) for k,v in better_counts.items()],key=lambda x: x[1], reverse=True)[:15]

[('reef', 23),
 ('reefs', 17),
 ('great', 14),
 ('barrier', 13),
 ('years', 12),
 ('sea', 10),
 ('ago', 9),
 ('level', 8),
 ('coral', 8),
 ('found', 7),
 ('island', 7),
 ('islands', 7),
 ('growth', 5),
 ('queensland', 5),
 ('grow', 5)]

this looks pretty good except for 'reefs' and 'reef', 'island' and 'islands' - for this we need stemming

In [14]:
from nltk.stem.snowball import EnglishStemmer

In [15]:
stemmer = EnglishStemmer()

In [19]:
even_better_counts = {}
for t in tokens:
    w = stemmer.stem(t)
    if w in stopwords:
        continue
    elif w in even_better_counts:
        even_better_counts[w]+=1
    else:
        even_better_counts[w]=1

In [21]:
sorted([(k,v) for k, v in even_better_counts.items()],
       key=lambda x: x[1], 
       reverse=True
       )[:15]

[(u'reef', 40),
 (u'year', 15),
 (u'great', 14),
 (u'island', 14),
 (u'barrier', 13),
 (u'coral', 11),
 (u'sea', 10),
 (u'level', 9),
 (u'ago', 9),
 (u'found', 7),
 (u'grow', 6),
 (u'sediment', 5),
 (u'growth', 5),
 (u'water', 5),
 (u'form', 5)]

# Requirements
 - time series of sentiment
 - give each document a sentiment score
 - create full time series of sentiment/security
 - resample to create daily time series of sentiment/security
 
 - Are there any words with are particularly indicitave of a stock?
 - compute tf-idf statistic for each security and for each word -> visualise!