In [0]:
!wget https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/preprocess.py
import preprocess

import string
import nltk
# Tokenizer model, POS-tagger model and treebank corpus used by the cells below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('treebank')
# WordNetLemmatizer looks words up in the WordNet corpus; fetch it explicitly
# so the lemmatization cell does not raise LookupError on a fresh runtime.
nltk.download('wordnet')

from nltk.corpus import treebank
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import spacy
# Load the small English pipeline by its package name. The 'en' shortcut link
# is not supported from spaCy v3 onwards, while the full package name loads on
# both v2 and v3 as long as the en_core_web_sm model package is installed.
ner = spacy.load('en_core_web_sm')

In [0]:
# Sample sentence shared by the tokenization, POS-tagging and NER cells.
sentence = 'Apple was run by Steve Jobs before 2011 and now it is Tim Cook'

# Split the sentence into word-level tokens and display them.
tokens = nltk.tokenize.word_tokenize(sentence)
tokens

In [0]:
# POS tagging: pair every token with its part-of-speech tag.
# https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
tagged = nltk.pos_tag(tokens)
# Display the whole list directly; slicing it from 0 to len() only made a copy.
tagged

In [0]:
# NER (Named Entity Recognition)
# Run the spaCy pipeline over the raw sentence, then print each detected
# entity's text next to its label.
doc = ner(sentence)

for entity in doc.ents:
  print(entity.text, ":", entity.label_)

In [0]:
# Stemming with the Porter stemmer
# https://www.nltk.org/_modules/nltk/stem/porter.html

s = PorterStemmer()

# Words to stem, in display order: forms of "have", forms of "fish", forms of "be".
stem_inputs = (
  'Having', 'Have', 'Had',
  'Fishing', 'Fish', 'Fisher', 'Fishes', 'Fished',
  'am', 'is', 'was',
)

# One stem per output line.
for word in stem_inputs:
  print(s.stem(word))

In [0]:
# Lemmatization with WordNet
# Lemmatization converts a word to its meaningful base form (its lemma),
# whereas stemming just removes the last few characters.
# The same word can have several different lemmas, so a POS tag is passed in
# (pos='v' or pos='n' below) to extract the appropriate one.

s = WordNetLemmatizer()

# (word, POS tag) pairs, in display order.
lemma_inputs = [
  ('having', 'v'), ('have', 'v'), ('had', 'v'),
  ('fishing', 'v'), ('fish', 'v'), ('fisher', 'n'), ('fishes', 'v'), ('fished', 'v'),
  ('am', 'v'), ('is', 'v'), ('was', 'v'),
]

# One lemma per output line.
for word, tag in lemma_inputs:
  print(s.lemmatize(word, pos=tag))

In [0]:
# Tree Bank: inspect NLTK's treebank corpus
words = treebank.words()

# Total token count, followed by the first 17 tokens as a preview.
print("Word Count", len(words))
preview = words[:17]
print(preview)

# Parse tree of the first sentence in the corpus.
parsed = treebank.parsed_sents()[0]
print(parsed)

In [0]:
# Bi-grams
# Count each pair of adjacent tokens in the raw corpus, then plot the
# 20 most frequent pairs (non-cumulative counts).
bigrams = nltk.util.bigrams(words)
biFdist = nltk.probability.FreqDist(bigrams)
biFdist.plot(20, cumulative=False)

In [0]:
# Preprocess the words
# Pass every corpus token through preprocess.process, in order, and keep only
# the results that are non-empty.
processed_iter = (preprocess.process(token) for token in words)
preprocessed = [result for result in processed_iter if len(result) > 0]

In [0]:
# Bi-gram frequencies again, this time over the preprocessed tokens;
# plot the 20 most frequent pairs (non-cumulative counts).
bigrams = nltk.util.bigrams(preprocessed)
biFdist = nltk.probability.FreqDist(bigrams)
biFdist.plot(20, cumulative=False)