In [None]:
import io

import nltk
from nltk.metrics import *
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Load data

In [None]:
# Read the corpus as UTF-8 text. io.open decodes while reading and works the
# same way on Python 2 and 3; calling .decode() on the joined string only
# works on Python 2.
with io.open('text.txt', encoding='utf-8') as f:
    # Lines keep their trailing newline and are additionally joined with a space.
    text = ' '.join(f.readlines())

## Tokenization

In [None]:
# Split the text into sentences and show the second one as a sample.
sentences = sent_tokenize(text)
print("Example:\n\n%s" % sentences[1])

In [None]:
# Tokenize the whole text into words and report how many tokens were produced.
word_tokens = word_tokenize(text)
print("Total %d words" % len(word_tokens))

In [None]:
# Peek at a small window of tokens, one per line.
# Windows noted by the author: 45:55 and 39:49.
for token in word_tokens[45:55]:
    print(token)

## Stop-word removal

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, held as a set for membership tests.
stop = set(stopwords.words('english'))
print(len(stop))

In [None]:
# Keep only the tokens that are not in the NLTK stop-word set.
filtered_word_tokens = [token for token in word_tokens if token not in stop]
print("Total %d words without stop-words" % len(filtered_word_tokens))

In [None]:
# Peek at a window of the filtered tokens, one per line.
# Windows noted by the author: 34:45 and 28:38.
for token in filtered_word_tokens[34:45]:
    print(token)

In [None]:
# Load the stop-word list from 'terrier-stop.txt': one entry per line,
# with surrounding whitespace stripped.
with open('terrier-stop.txt') as f:
    stop_words = [line.strip() for line in f]
print("We have %d stop words" % len(stop_words))

In [None]:
# Filter the tokens against the list loaded from 'terrier-stop.txt'.
# `stop_words` is a list, so testing `w in stop_words` would scan it for every
# token; build a set once so each lookup is constant-time. The surviving
# tokens are the same.
stop_words_lookup = frozenset(stop_words)
filtered_word_tokens = [w for w in word_tokens if w not in stop_words_lookup]
print("Total %d words without stop-words" % len(filtered_word_tokens))

In [None]:
# Peek at the same window of tokens after filtering with the second stop list.
# Windows noted by the author: 34:45 and 28:38.
for token in filtered_word_tokens[34:45]:
    print(token)

## Text preprocessing

Here we will lowercase the text and remove all punctuation.

In [None]:
import re

In [None]:
def removePunctuation(text):
    """Lowercase `text` and keep only ASCII letters, digits, '_' and spaces.

    Tabs, newlines and the other ASCII whitespace characters are turned into
    spaces before the filtering step. Without that they would simply be
    deleted, gluing together words that were separated only by a tab or a
    line break.

    Args:
        text: the string to clean.

    Returns:
        The lowercased, filtered string with leading and trailing spaces
        removed. Runs of spaces inside the string are kept as they are.
    """
    # Normalise non-space whitespace to a plain space so it survives the filter below.
    spaced = re.sub(r'[\t\n\r\f\v]', ' ', text.lower())
    # Drop every character that is not an ASCII letter, digit, underscore or space.
    return re.sub('[^a-zA-Z0-9_ ]', '', spaced).strip()

In [None]:
# Replace `text` with its cleaned form (lowercased, punctuation removed).
# Note: this overwrites the raw text, so re-running the cell operates on
# already-cleaned text.
text = removePunctuation(text)

In [None]:
# Show the first 200 characters of the cleaned text.
print(text[:200])

Now go back and run the previous cells again, this time on the cleaned text!

## Stemming

In [None]:
# Porter stemmer instance, reused by the stemming cells in this notebook.
ps = PorterStemmer()

In [None]:
# Stem every stop-word-filtered token, keeping the original order.
stems = [ps.stem(token) for token in filtered_word_tokens]

In [None]:
# Number of distinct stems.
print("Total %d after stemming" % len(set(stems)))

In [None]:
# Show tokens 28..38 side by side with their stems.
for idx in range(28, 39):
    print("%s %s" % (filtered_word_tokens[idx], stems[idx]))

## Lemmatization

In [None]:
# WordNet lemmatizer instance, reused by the lemmatization cells in this notebook.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize every filtered token without passing a part-of-speech hint.
lemmas = [lemmatizer.lemmatize(token) for token in filtered_word_tokens]

In [None]:
# Count distinct lemmas. len(lemmas) would always equal the number of filtered
# tokens, which says nothing about the effect of lemmatization; counting
# distinct values makes the figure comparable with the distinct-stem count
# printed in the stemming section.
print("Total %d after lemmatization" % len(set(lemmas)))

In [None]:
# Show tokens 28..38 side by side with their lemmas.
for idx in range(28, 39):
    print("%s %s" % (filtered_word_tokens[idx], lemmas[idx]))

## Part-of-speech tagging

In [None]:
# Tokenize the first sentence to use as the tagging example.
example = word_tokenize(sentences[0])
print(example)

In [None]:
# Tag the example sentence; the cell displays the list of (token, tag) pairs.
pos_tag(example)

## Part-of-speech tagging for lemmatization

In [None]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Return True when `tag` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True when `tag` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True when `tag` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when `tag` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn_to_wn(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    The tag is checked against the adjective, noun, adverb and verb tag
    groups, in that order. A tag that belongs to none of them is mapped to
    wn.NOUN.
    """
    dispatch = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for belongs_to_group, wordnet_pos in dispatch:
        if belongs_to_group(tag):
            return wordnet_pos
    # Unrecognised tags fall back to noun rather than None.
    return wn.NOUN

In [None]:
# Lemmatize each token using a part-of-speech hint derived from its tag.
# Each token is still tagged on its own, as a one-word sentence, but in a
# single pos_tag_sents call instead of one pos_tag call per word; NLTK's
# pos_tag documentation recommends pos_tag_sents when tagging more than one
# sentence.
single_word_tags = nltk.pos_tag_sents([[w] for w in filtered_word_tokens])
lemmas = [
    # tagged is [(word, tag)] for the one-word sentence; take the tag.
    lemmatizer.lemmatize(w, pos=penn_to_wn(tagged[0][1]))
    for w, tagged in zip(filtered_word_tokens, single_word_tags)
]

In [None]:
# Number of distinct lemmas produced with part-of-speech hints.
print("Total %d after lemmatization" % len(set(lemmas)))

In [None]:
# Show the first 11 tokens side by side with their lemmas.
for idx in range(0, 11):
    print("%s %s" % (filtered_word_tokens[idx], lemmas[idx]))

## Word count

In [None]:
# Build a frequency distribution over the lemmas and print the ten most common ones.
all_words = nltk.FreqDist(lemmas)
print(all_words.most_common(10))

In [None]:
# Look up the entry stored for the lemma "system" in the frequency distribution.
print(all_words["system"])

In [None]:
# Size of the frequency distribution.
print(len(all_words))

## Full Cycle

In [None]:
# Full pipeline, line by line: every line of 'text.txt' becomes one document
# (a list of stems) after cleaning, tokenizing and stop-word filtering.
#
# NOTE: this cell rebinds `word_tokens`, `filtered_word_tokens` and `stems`;
# after it runs they hold the values for the last line of the file.

# Build the stop-word lookup once; testing against the `stop_words` list
# would scan it for every token of every line.
stop_words_lookup = frozenset(stop_words)

documents = []
# io.open decodes UTF-8 while reading and works on both Python 2 and 3,
# so no per-line .decode() is needed.
with io.open('text.txt', encoding='utf-8') as f:
    for line in f:
        word_tokens = word_tokenize(removePunctuation(line))
        filtered_word_tokens = [w for w in word_tokens if w not in stop_words_lookup]
        # A real list, so len() below works on Python 3 as well.
        stems = [ps.stem(w) for w in filtered_word_tokens]
        documents.append(stems)

# Drop documents that ended up with no stems at all.
documents = [doc for doc in documents if len(doc) > 0]

## Syntax tree

In [None]:
from bllipparser import RerankingParser
# Load the 'WSJ-PTB3' reranking parser model with verbose output.
# NOTE(review): fetch_and_load presumably downloads the model when it is not
# available locally — confirm before running offline.
rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)

In [None]:
# Parse the first sentence (passed UTF-8 encoded); the cell displays the result.
rrp.simple_parse(sentences[0].encode('utf-8'))

In [None]:
# Parse the first sentence, take the first entry of the result, convert its
# parse to an NLTK tree and draw it.
first_parse = rrp.parse(sentences[0].encode('utf-8'))[0]
first_parse.ptb_parse.as_nltk_tree().draw()