In [1]:
import nltk

# NLTK
NLTK is a platform with many modules for working with natural language. It provides tokenization, stemming, tagging, parsing, and semantic analysis.

Run the command below and download the following data:

**In Corpora**
- mac_morpho

- floresta

- stopwords

**In Models**
- punkt

- rslp

The files will be downloaded to the *nltk_data* folder inside the user's home directory.

In [None]:
# Fetch only the resources this notebook uses, by identifier, so the cell
# runs unattended (Restart & Run All) instead of waiting on NLTK's
# interactive downloader.
# NOTE(review): recent NLTK releases may load the tokenizer models from
# "punkt_tab" instead of "punkt" - add it here if sent_tokenize reports it missing.
for resource in ("mac_morpho", "floresta", "stopwords", "punkt", "rslp"):
    nltk.download(resource)

# Tokenization
## sentence & word segmentation

In [None]:
# Split a short Portuguese text into sentences, then split each sentence
# into word tokens, printing both.
text = (
    "Hoje está um ótimo dia para estudar NLP com python 3.*! "
    "Explore o NLTK, que tem rotinas para tratar a linguagem natural."
)
sentences = nltk.tokenize.sent_tokenize(text, language="portuguese")
for sentence in sentences:
    print(sentence)
    tokens = nltk.word_tokenize(sentence, language="portuguese")
    print(tokens)

# Stemming
## keeping word stem (*tronco*)

In [None]:
# Stem a few isolated words, then every token of a full sentence,
# using the RSLP stemmer.
stemmer = nltk.stem.RSLPStemmer()
for word in ("canto", "cantar", "computador"):
    print(stemmer.stem(word))

sent = "O canto é retângulo!"
tokens = nltk.word_tokenize(sent, language="portuguese")
print(tokens)
stemmed_tokens = list(map(stemmer.stem, tokens))
print(stemmed_tokens)

# Stopwords
## keeping content words

In [None]:
# Load the Portuguese stopword list; the stopword-removal cell reuses `stopwords`.
stopwords_corpus = nltk.corpus.stopwords
stopwords = stopwords_corpus.words("portuguese")
print(stopwords)

In [None]:
sent = "O ônibus parou no centro do campo de futebol"
tokens = nltk.word_tokenize(sent, language="portuguese")

# Variant 1: filter the NLTK tokens; the comparison is case-insensitive but
# the surviving tokens keep their original casing.
tokens_no_stopwords = [token for token in tokens if token.lower() not in stopwords]
new_sent = " ".join(tokens_no_stopwords)
print(new_sent)

# Variant 2: lower-case the sentence, split on whitespace, and filter in one go.
lowered_words = sent.lower().split()
print(" ".join(word for word in lowered_words if word not in stopwords))

# Tagging
## morphosyntax

In [None]:
from nltk.corpus import mac_morpho
# Preview three views of the corpus in turn: flat words, sentences,
# and tagged sentences.
for view_name in ("words", "sents", "tagged_sents"):
    print(getattr(mac_morpho, view_name)())

## training a tagger
### unigram based

In [None]:
# Train a unigram tagger on every tagged sentence of mac_morpho.
tagged_sentences = mac_morpho.tagged_sents()
unigram_tagger = nltk.UnigramTagger(train=tagged_sentences)

# The score is computed on the same sentences the tagger was trained on,
# so read it as an optimistic figure rather than held-out accuracy.
unigram_accuracy = unigram_tagger.evaluate(tagged_sentences)
print(unigram_accuracy)

In [None]:
# Tag two sentences that use the same word, "canto", in different contexts.
for sent in ("O canto é retângulo!", "Eu canto pra você!"):
    tokens = nltk.word_tokenize(sent, language="portuguese")
    print(unigram_tagger.tag(tokens))

In [None]:
# Check the corpus for the exact spelling used in the example sentences
# ("retângulo", with the circumflex). The unaccented form is a different
# string, so testing it says nothing about whether the tagger has seen
# the word that is actually being tagged.
"retângulo" in mac_morpho.words()

### bigram based

In [None]:
# Train a bigram tagger with the unigram tagger as its backoff.
bigram_tagger = nltk.BigramTagger(train=tagged_sentences, backoff=unigram_tagger)

# As with the unigram tagger, the score is measured on the training sentences.
bigram_accuracy = bigram_tagger.evaluate(tagged_sentences)
print(bigram_accuracy)

In [None]:
# Tag the same two example sentences, this time with the bigram tagger.
for sent in ("O canto é retângulo!", "Eu canto pra você!"):
    tokens = nltk.word_tokenize(sent, language="portuguese")
    print(bigram_tagger.tag(tokens))

### trigram based

In [None]:
# Train a trigram tagger with the bigram tagger as its backoff.
trigram_tagger = nltk.TrigramTagger(train=tagged_sentences, backoff=bigram_tagger)

# The score is again measured on the sentences used for training.
trigram_accuracy = trigram_tagger.evaluate(tagged_sentences)
print(trigram_accuracy)

In [None]:
# Print each occurrence of the word "canto" in the corpus together with its tag.
canto_occurrences = (pair for pair in mac_morpho.tagged_words() if pair[0] == 'canto')
for word, tag in canto_occurrences:
    print(word, tag)

In [None]:
# Tag three example sentences with the trigram tagger.
example_sentences = (
    "O canto é retângulo!",
    "Eu canto pra você!",
    "Eu compro pra você!",
)
for sent in example_sentences:
    tokens = nltk.word_tokenize(sent, language="portuguese")
    print(trigram_tagger.tag(tokens))

# Parsing
## Syntax

In [2]:
from nltk.corpus import floresta

In [3]:
# Corpus size: number of words first, then number of sentences.
for view_name in ("words", "sents"):
    print(len(getattr(floresta, view_name)()))

202124
8829


In [9]:
# Show the first tagged sentence of the corpus as a list of (token, tag) pairs.
floresta.tagged_sents()[0]

[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')]

In [13]:
# Print the parse tree of the second sentence (index 1) in bracketed text form.
print(floresta.parsed_sents()[1])

(STA+fcl
  (SUBJ+np (>N+art O) (H+prop 7_e_Meio))
  (P+v-fin é)
  (SC+np
    (>N+art um)
    (H+n ex-libris)
    (N<+pp
      (H+prp de)
      (P<+np (>N+art a) (H+n noite) (N<+adj algarvia))))
  (. .))


In [11]:
# Draw the parse tree of the second sentence (index 1) graphically.
# NOTE(review): presumably opens a separate GUI window, so it needs a
# display and may block an unattended run - confirm.
floresta.parsed_sents()[1].draw()

In [14]:
# Toy context-free grammar over a handful of Portuguese words, used by the
# parser cells. Quoted strings are the terminal words and `|` separates
# alternative expansions of the same left-hand side.
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  V -> "viu" | "comeu" | "andou"
  NP -> Det N | Det N PP | N
  Det -> "o" | "a" | "um" | "uma" | "meu" | "minha"
  N -> "homem" | "cachorro" | "gato" | "telescópio" | "parque" | "João" | "Maria" | "Pedro" 
  P -> "no" | "sobre" | "por" | "com"
  """)

In [15]:
# Parse a sentence with both parsers and print every tree each one finds,
# with a separator line between the two result sets.
sent = "o homem viu o parque".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
sr_parser = nltk.ShiftReduceParser(grammar1)
for position, parser in enumerate((rd_parser, sr_parser)):
    if position:
        print("-------")
    for tree in parser.parse(sent):
        print(tree)

(S (NP (Det o) (N homem)) (VP (V viu) (NP (Det o) (N parque))))
-------
(S (NP (Det o) (N homem)) (VP (V viu) (NP (Det o) (N parque))))


In [16]:
# Same experiment with a sentence that stops after the verb; print whatever
# trees each parser yields, separated by a divider line.
sent = "o homem viu".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
sr_parser = nltk.ShiftReduceParser(grammar1)
for position, parser in enumerate((rd_parser, sr_parser)):
    if position:
        print("-------")
    for tree in parser.parse(sent):
        print(tree)

-------


In [17]:
# Try a sentence containing words that grammar1 has no rules for
# ("foi", "visto", "pelo"). The parser rejects it with a ValueError, which
# is the point of this cell, so catch the error and print its message
# instead of letting it abort a Restart & Run All.
sent = "o parque foi visto pelo homem".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
sr_parser = nltk.ShiftReduceParser(grammar1)
try:
    for tree in rd_parser.parse(sent):
        print(tree)
    print("-------")
    for tree in sr_parser.parse(sent):
        print(tree)
except ValueError as coverage_error:
    # Same text a raw traceback would end with, without stopping execution.
    print("ValueError:", coverage_error)

ValueError: Grammar does not cover some of the input words: "'foi', 'visto', 'pelo'".