# Testing Part of Speech Taggers
* Since the number of words is small, accuracy matters more than speed -- a slower tagger is acceptable.
<br><br>

In [None]:
from nltk.tag import StanfordPOSTagger
import nltk.data                        # use pre-trained Punkt tokenizer

# Testing Part of Speech Taggers
# Load the two NLTK-accessible taggers that the following cells time and compare.
# NOTE(review): `maxnet_tagger` looks like a typo for "maxent"; the name is kept
# because the timing cell below refers to it.
maxnet_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')  # pre-trained
# Only the model file name is given here; how the Stanford model/jar are located
# is not shown -- presumably via environment variables (TODO confirm).
stanford_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')  

In [None]:
# Pangram split on whitespace into a token list; also reused by the Stanford cell below.
test = 'The quick brown fox jumps over the lazy dog'.split()
%timeit maxnet_tagger.tag(test)    # 8.04 ms per loop - best (Stanford: 2.47 s, TextBlob: 2.05 s)
maxnet_tagger.tag(test)            # least accurate of the taggers tried

In [None]:
# Same token list as the maxent cell; `test` must already be defined.
%timeit stanford_tagger.tag(test)   # 2.47 s per loop - slowest
stanford_tagger.tag(test)           # most accurate

In [None]:
from textblob import TextBlob

In [None]:
# TextBlob is given the raw sentence (not a token list); tags are read from `.tags`.
testblob = TextBlob('The quick brown fox jumps over the lazy dog')
%timeit testblob.tags  # 2.05 s per loop
testblob.tags          # mediocre accuracy

In [None]:
import spacy
from spacy.en import English
# Build the spaCy English pipeline once; the cells below call it as `tagger(text)`.
tagger = English()

In [None]:
# Run the pangram through spaCy and print orth_, tag_ and pos_ for every token.
# The timing line is kept disabled:
# %timeit a = tagger(u'The quick brown fox jumps over the lazy dog')
a = tagger(u'The quick brown fox jumps over the lazy dog')
for token in a:
    # One formatted line per token: same space-separated output as before.
    print('%s %s %s' % (token.orth_, token.tag_, token.pos_))    # fastest

In [None]:
# Inspect lemma_ and is_stop for a handful of verb forms.
a = tagger(u'is was been has have being jumps jump wrote write')
for token in a:
    # One formatted line per token: surface form, lemma, stop-word flag.
    print('%s %s %s' % (token.orth_, token.lemma_, token.is_stop))

In [None]:
from collections import Counter

# Tally spaCy tags for a small word list: the words are joined into one unicode
# string, tagged, and each token's tag_ (encoded to ASCII) is counted.
lst = ['hi', 'there']
Counter(w.tag_.encode('ascii') for w in tagger(u' '.join(lst)))
# Disabled leftover from the tagging cell above:
# for token in a:
#     print token.orth_, token.tag_    # fastest

In [None]:
# Sanity check: isalpha() is False as soon as one character (here '$') is not a letter.
'a$'.isalpha()

# Testing US/GB Dictionaries

In [None]:
import enchant

# One spell-check dictionary per English variant; the trailing notes list words
# each dictionary was observed not to contain.
d1 = enchant.Dict('en_US')  # not in dic: spam, spamming, spammers
d2 = enchant.Dict('en_GB')  # not in dic: i, etc

In [None]:
# Probe a single word against both dictionaries; prints the en_US result, then en_GB.
s = 'wxgtk'
print('%s %s' % (d1.check(s), d2.check(s)))

<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Functions
(Converted to `../code/dbio.py` and `../code/basic_stats.py`)
<br><br>

In [None]:
from __future__ import division         # no need to worry about integer division

from pymongo import MongoClient
import psycopg2 as pg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT, AsIs

import string, re
from collections import Counter
from itertools import chain, izip

import numpy as np
from nltk.tag import StanfordPOSTagger
import nltk.data                        # use pre-trained Punkt tokenizer
from nltk.corpus import stopwords       # using this set of stopwords for the model
import enchant                          # pyenchant -- US vs. GB spelling

import argparse

In [None]:
# Turn a column -> SQL-type mapping into two parallel tuples ordered by column
# name, then interleave them as (name, type, name, type, ...).
d = dict(author='varchar(10)', date='timestamptz', type='varchar(10)')
a, b = zip(*sorted(d.items()))
print(a)
print(b)
tuple(chain.from_iterable(zip(a, b)))

<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Debug Anomalies:
* satoshi:
    * forum@236<br>
        ```
        StanfordPOSTagger error (java.lang.OutOfMemoryError)
        ```
    * forum@470<br>
        ```
        ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
        ```
        
<br>

In [None]:
# Shell escape: run basic_stats.py for author "satoshi" on the paper.
# NOTE(review): the meaning of -wt / -rt is defined in basic_stats.py, which is not shown here -- confirm there.
!python basic_stats.py satoshi -wt paper -rt True

In [None]:
# Shell escape: same script, this time with -wt email.
!python basic_stats.py satoshi -wt email

In [None]:
# Shell escape: forum run narrowed with -s 236 -e 237 -- presumably a start/end
# index range isolating post 236 (TODO confirm against basic_stats.py).
!python basic_stats.py satoshi -wt forum -s 236 -e 237
# Observed result: post 236 was skipped.

## Anomaly "forum@236"

* len(forum@236) = ~380 words
* verified *NOT* a python list vs. numpy array problem
* error when processing all words at once 
* no error when the words are fed into the tagger one by one *AND* each iteration is wrapped in a try-except
* error running slicing
* error running for loop without try-except catch
* `$ java -XX:+PrintFlagsFinal | grep 'MaxHeapSize'`: 1073741824 byte = 1GB
* but much longer paper (~3000 words) didn't cause this error... maybe too many words not in English? (e.g. computer variables)
<br><br>

In [None]:
# Pull every raw forum document authored by satoshi out of the local MongoDB
# and materialise the cursor so posts can be indexed by position.
client = MongoClient()
db = client['satoshi']
tbl = db['raw-docs']

query_results = tbl.find({
    'author': {'$eq': 'satoshi'},
    'type': {'$eq': 'forum'},
})

results = list(query_results)

In [None]:
# satoshi forum 236 -- problematic for StanfordPOSTagger
# Re-run the statistics step on just that post; `words` feeds the tagger cells below.
ctr, words, misspellings, gb_spellings, us_spellings = crunch_statistics(results[236]['content'])

In [None]:
# Tagger instance used for the forum@236 investigation (same model file as in
# the tagger comparison at the top of the notebook).
stanford_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

In [None]:
# Size check: the notes above put forum@236 at roughly 380 words.
len(words)

In [None]:
# Feed the first 100 words to the Stanford tagger one at a time to locate the
# word (if any) that makes it fail, collecting each word's tag in `poses`.
poses = []
for i, word in enumerate(words[:100]):
    try:
        # tag() expects a list of tokens; a bare string would be tagged
        # character by character.
        lst = stanford_tagger.tag([word])
    except Exception as err:
        # Report where tagging failed instead of stopping silently; a bare
        # `except:` would also hide typos and swallow KeyboardInterrupt.
        print('tagging failed at word %d (%r): %s' % (i, word, err))
        break
    print('%d %s' % (i, lst))
    if lst:
        # Each entry is a (token, tag) pair; keep the tag of the single token.
        poses.append(lst[0][-1].encode('ascii'))

In [None]:
# Tag the whole post in a single call -- per the notes above, processing all
# words at once is what triggers the error.
lst = stanford_tagger.tag(words)


## Anomaly "forum@470"

* lines are all '\n', so no words!
* check original post -- all computer code so correctly removed everything
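* numpy boolean selection fails on an empty selector: an empty array created without a dtype is not boolean, so inverting it with `~` raises the `ufunc 'invert'` error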
<br><br>

In [None]:
# Minimal reproduction of forum@470: inverting an empty array built without an
# explicit dtype raises the "ufunc 'invert' not supported for the input types"
# error quoted in the anomaly list above.
b = np.array([])
~b

<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Chunking and Bootstrapping

(Converted to `prep_sents.py`, `chunking.py` and `prep_samples.py`)<br>
(code-v2: moved to `helper_function.py`)

<hr style='background-color: #fff; border-top: 2px dashed #8c8b8b;'>
# Get 250 Most Frequent Words

(Converted to `../code/term_freq.py`)<br>
(code-v2: moved to `helper_function.py`)