# Baixando Dados do NLTK

In [1]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /home/juan-
[nltk_data]    |     burtet/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloadin

True

# Baixando Modelos do spaCy

In [2]:
# !python3 -m spacy download en_core_web_sm
# !python3 -m spacy download pt_core_news_sm
# !python3 -m spacy download es_core_news_sm
# !python3 -m spacy download de_core_news_sm
#!python -m spacy validate

import spacy.cli

modelos = ["en_core_web_sm", "pt_core_news_sm", "es_core_news_sm", "de_core_news_sm"]
for model in modelos:
  spacy.cli.download(model)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


# Lematização


## Lematização usando NLTK

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("bats"))
print(lemmatizer.lemmatize("are"))
print(lemmatizer.lemmatize("feet"))
print(lemmatizer.lemmatize("rocks")) 
print(lemmatizer.lemmatize("corpora")) 

print(lemmatizer.lemmatize("stripes", 'v'))  
print(lemmatizer.lemmatize("stripes", 'n'))  

print("better :", lemmatizer.lemmatize("better", pos ="a"))


bat
are
foot
rock
corpus
strip
stripe
better : good


## Lematização usando spaCy

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

for token in doc:
    print(token, token.lemma_)


Apples apple
and and
oranges orange
are be
similar similar
. .
Boots boot
and and
hippos hippos
are be
n't not
. .


## Comparação

### NLTK

In [5]:
#import library

import nltk 
#print('NLTK Version: %s' % (nltk.__version__))
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

#check

article = """Lemmatisation (or lemmatization) in linguistics is 
the process of grouping together the inflected forms of a word so 
they can be analysed as a single item, identified by the word's lemma, 
or dictionary form."""

tokens = nltk.word_tokenize(article)
print('Original Article: %s' % (article))
print()
for token in tokens:
    lemmatized_token = wordnet_lemmatizer.lemmatize(token)
    
    if token != lemmatized_token:
        print('Original : %s, New: %s' % (token, lemmatized_token))

Original Article: Lemmatisation (or lemmatization) in linguistics is 
the process of grouping together the inflected forms of a word so 
they can be analysed as a single item, identified by the word's lemma, 
or dictionary form.

Original : forms, New: form
Original : as, New: a


### spaCy

In [6]:
#import library

import spacy
#print('spaCy Version: %s' % (spacy.__version__))
spacy_nlp = spacy.load('en_core_web_sm')

#normalize

article = """Lemmatisation (or lemmatization) in linguistics is 
the process of grouping together the inflected forms of a word so 
they can be analysed as a single item, identified by the word's lemma, 
or dictionary form."""

doc = spacy_nlp(article)
tokens = [token.text for token in doc]
print('Original Article: %s' % (article))
print()
for token in doc:
    if token.text != token.lemma_:
        print('Original : %s, New: %s' % (token.text, token.lemma_))

Original Article: Lemmatisation (or lemmatization) in linguistics is 
the process of grouping together the inflected forms of a word so 
they can be analysed as a single item, identified by the word's lemma, 
or dictionary form.

Original : Lemmatisation, New: lemmatisation
Original : is, New: be
Original : grouping, New: group
Original : forms, New: form
Original : they, New: -PRON-
Original : analysed, New: analyse
Original : identified, New: identify


# Stemização

### German

In [7]:
# from nltk.stem import SnowballStemmer

# stemmer = SnowballStemmer("german") 
# stemmer.stem("Autobahnen") 

from nltk.stem.snowball import GermanStemmer

stemmer = GermanStemmer()
stemmer.stem("Autobahnen")

'autobahn'

### Spanish

In [8]:
from nltk.stem.snowball import SpanishStemmer

stemmer = SpanishStemmer()
stemmer.stem("Carreteras")

'carreter'

### Portuguese

In [9]:
import nltk

from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
stemmer.stem("Rodovias")

'rodov'