In [None]:
# !pip3 install nltk

In [1]:
import nltk

In [2]:
# Fetch only the NLTK resources this notebook needs.
# NB: before calling the no-argument form (nltk.download()) elsewhere -
# better download the data from http://www.nltk.org/data.html,
# the whole package is ~3G.
NLTK_PACKAGES = (
    'averaged_perceptron_tagger',
    'tagsets',
    'universal_tagset',
    'punkt',
    'wordnet',
    'brown',
)

# nltk.download() signals a failed fetch through its boolean return value
# rather than an exception, so collect the results and stop right here if
# anything is missing instead of failing later in an unrelated cell.
download_status = {name: nltk.download(name) for name in NLTK_PACKAGES}
failed_packages = [name for name, ok in download_status.items() if not ok]
if failed_packages:
    raise RuntimeError(
        "NLTK data download failed for: " + ", ".join(failed_packages)
    )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# Split a sample sentence into tokens, then assign a part of speech to each.
text = "This is a sentence that we will use to test the magic tool"

# word_tokenize needs tokenizers/punkt/english.pickle from nltk_data
tokens = nltk.word_tokenize(text=text)
print("Tokens:", tokens)

# request universal tags; tag meanings are explained at
# http://www.nltk.org/book/ch05.html
tagged = nltk.pos_tag(tokens=tokens, tagset='universal')
print("Tagged:", tagged)

# NB: note which part of speech the word "magic" receives

Tokens: ['This', 'is', 'a', 'sentence', 'that', 'we', 'will', 'use', 'to', 'test', 'the', 'magic', 'tool']
Tagged: [('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('sentence', 'NOUN'), ('that', 'ADP'), ('we', 'PRON'), ('will', 'VERB'), ('use', 'VERB'), ('to', 'PRT'), ('test', 'VERB'), ('the', 'DET'), ('magic', 'NOUN'), ('tool', 'NOUN')]


In [4]:
# The Brown corpus provides the tagged sentences used to identify parts of speech
from nltk.corpus import brown    # 500 documents, ~1M words, biggest in NLTK

# Ask for universal tags, because the corpus' default tagset is different
# from the conventional one: https://en.wikipedia.org/wiki/Brown_Corpus
brown_tagged_sents = brown.tagged_sents(tagset='universal')
# alternative training data (kept disabled):
# brown_tagged_sents = brown.tagged_sents(categories='news')

# A unigram tagger looks at each word on its own, without any context,
# which is why it can make mistakes for words like "TEST"
unigram_tagger = nltk.UnigramTagger(train=brown_tagged_sents)
tagged_u = unigram_tagger.tag(tokens)
print("unigram:", tagged_u)

# What do the tags mean?
# NOTE(review): 'JJ' is looked up in the Penn Treebank tag help, while the
# taggers here print universal tags (DET, VERB, ...) - confirm this is the
# intended lookup.
nltk.help.upenn_tagset('JJ')

# Corpora provide a readme!
print(brown.readme())

unigram: [('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('sentence', 'NOUN'), ('that', 'ADP'), ('we', 'PRON'), ('will', 'VERB'), ('use', 'NOUN'), ('to', 'PRT'), ('test', 'NOUN'), ('the', 'DET'), ('magic', 'ADJ'), ('tool', 'NOUN')]
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
BROWN CORPUS

A Standard Corpus of Present-Day Edited American
English, for use with Digital Computers.

by W. N. Francis and H. Kucera (1964)
Department of Linguistics, Brown University
Providence, Rhode Island, USA

Revised 1971, Revised and Amplified 1979

http://www.hit.uib.no/icame/brown/bcm.html

Distributed with the permission of the copyright holder,
redistribution permitted.



In [5]:
# Train a bigram tagger on the Brown sentences and tag the sample tokens.
# No backoff tagger is configured here.
bigram_tagger = nltk.BigramTagger(train=brown_tagged_sents)
tagged_b = bigram_tagger.tag(tokens)
print("bigram:", tagged_b)

# see the tag assigned to the word "test"

bigram: [('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('sentence', 'NOUN'), ('that', 'PRON'), ('we', 'PRON'), ('will', 'VERB'), ('use', 'VERB'), ('to', 'PRT'), ('test', 'VERB'), ('the', 'DET'), ('magic', 'ADJ'), ('tool', 'NOUN')]
