# Processamento de linguagem natural

- https://www.youtube.com/watch?v=Vr9QXpELdrs
- https://course.spacy.io/pt/

 The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.
 <br><i>- Wikipedia</i>

Aplicações:

- chatbot
- tradução
- classificação
- correção de texto

Bibliotecas:

- OpenNLP
- NLTK
- spaCy
- Gensim
- Stanza

Outras ferramentas relacionadas (fala e OCR): Vosk, Coqui, Tesseract

## spaCy

In [None]:
from spacy import blank

# Build a blank Portuguese ('pt') pipeline and show the basic containers:
# the document, a single token and a span (slice) of the document.
nlp = blank('pt') # model / pipeline
doc = nlp('Hoje é dia de festa! Vamos colocar nossa melhor roupa.') # document
token = doc[0] # token: first element of the document
span = doc[1:3] # span: slice covering the elements at positions 1 and 2

print("Exemplo de token:", token)
print("Exemplo de span:", span)

Exemplo de token: Hoje
Exemplo de span: é dia


In [None]:
# !python -m spacy download pt_core_news_sm

In [None]:
from spacy import load

# Catalogue of available models: https://spacy.io/models


# Load the small Portuguese pipeline; from here on `nlp` refers to it instead of the blank one.
nlp = load('pt_core_news_sm') # alternative: pt_core_news_lg

<generator object at 0x7fcaab562fc0>
<generator object at 0x7fcaab562fc0>


In [None]:
# Process the sentence with the loaded pipeline.
doc = nlp('Hoje é dia de festa! Vamos colocar nossa melhor roupa.')

list(doc) # tokenization: iterating the document yields its tokens

[Hoje, é, dia, de, festa, !, Vamos, colocar, nossa, melhor, roupa, .]

In [None]:
# type(doc[0])
# dir(doc[0])

In [None]:
# Exploring tokens
# https://spacy.io/usage/linguistic-features#pos-tagging

# One row per token; columns in order: shape, text, shape_, is_alpha, lemma_, dep_, is_stop.
# NOTE(review): the first column (`token.shape`, no trailing underscore) prints as a large
# integer, while `token.shape_` is the readable string - confirm the integer column is wanted.
for token in doc:
  print("{:<21}|{:<8}|{:<7}|{:<2}|{:<8}|{:<7}|{:<2}|".format(
      token.shape, token.text, token.shape_, token.is_alpha, token.lemma_, token.dep_, token.is_stop))

10887629174180191697 |Hoje    |Xxxx   |1 |hoje    |advmod |0 |
11123243248953317070 |é       |x      |1 |ser     |cop    |1 |
4088098365541558500  |dia     |xxx    |1 |dia     |ROOT   |0 |
4370460163704169311  |de      |xx     |1 |de      |case   |1 |
13110060611322374290 |festa   |xxxx   |1 |festa   |nmod   |0 |
17494803046312582752 |!       |!      |0 |!       |punct  |0 |
16072095006890171862 |Vamos   |Xxxxx  |1 |ir      |aux    |0 |
13110060611322374290 |colocar |xxxx   |1 |colocar |ROOT   |0 |
13110060611322374290 |nossa   |xxxx   |1 |nosso   |det    |1 |
13110060611322374290 |melhor  |xxxx   |1 |bom     |amod   |0 |
13110060611322374290 |roupa   |xxxx   |1 |roupa   |obj    |0 |
12646065887601541794 |.       |.      |0 |.       |punct  |0 |


In [None]:
# Exploring named entities
doc = nlp("Apple está procurando comprar uma startup do Brasil por 1 bilhão de reais")

# One row per recognised entity: text, start/end character offsets and entity label.
for ent in doc.ents:
    print("{:<21}|{:<8}|{:<7}|{:<8}|".format(
        ent.text, ent.start_char, ent.end_char, ent.label_))

Apple                |0       |5      |ORG     |
Brasil               |45      |51     |LOC     |


In [None]:
from spacy import displacy

doc = nlp("Apple está procurando comprar uma startup do Brasil por 1 bilhão de reais")
# displacy.serve(doc, style="ent") # alternative for a local (non-notebook) run
# Draw the dependency parse ('dep' style) inline in the notebook.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [None]:
from spacy import explain
# Look up the human-readable description of the "PROPN" tag.
explain("PROPN")

'proper noun'

In [None]:
# Look up the human-readable description of the "xcomp" label.
explain("xcomp")

'open clausal complement'

In [None]:
# Dependency parse of a short sentence, drawn inline in the notebook.
doc = nlp("Ele bota a calça.")
# displacy.serve(doc, style="ent") # alternative for a local (non-notebook) run
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

- https://universaldependencies.org/u/pos/

In [None]:
# Exploring morphology
doc = nlp("Eu viajarei para Roma e já comprei passagens.")

# One row per token: text, lemma, coarse part-of-speech tag and morphological features.
for token in doc:
    print('{:<10}{:<10}{:<10}{}'.format(token.text, token.lemma_, token.pos_, token.morph))

Eu        eu        PRON      Case=Nom|Gender=Fem|Number=Sing|Person=1|PronType=Prs
viajarei  viajarei  VERB      Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
para      para      ADP       
Roma      Roma      PROPN     Gender=Fem|Number=Sing
e         e         CCONJ     
já        já        ADV       
comprei   comprar   VERB      Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin
passagens passagem  NOUN      Gender=Fem|Number=Plur
.         .         PUNCT     


In [None]:
# Exploring morphology (variant of the sentence with a third-person subject and a numeral)
doc = nlp("Ele viajará para Roma e já comprei 2 passagens.")

# One row per token: text, lemma, coarse part-of-speech tag and morphological features.
for token in doc:
    print('{:<10}{:<10}{:<10}{}'.format(token.text, token.lemma_, token.pos_, token.morph))

Ele       ele       PRON      Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs
viajará   viajar    VERB      Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin
para      para      ADP       
Roma      Roma      PROPN     Gender=Fem|Number=Sing
e         e         CCONJ     
já        já        ADV       
comprei   compr     VERB      Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin
2         2         NUM       NumType=Card
passagens passagem  NOUN      Gender=Fem|Number=Plur
.         .         PUNCT     


In [None]:
# Exploring named entities: highlight the entity spans recognised in the sentence.
doc = nlp("Roberto viajará para Roma e já comprei 2 passagens.")

# No `options` here: 'distance' is a setting of the dependency ('dep') visualizer and is
# not used by the entity ('ent') visualizer.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
from spacy.matcher import Matcher

# Four short sentences used as the search text.
doc = nlp("""
  Saiu um novo jogo.
  Eduarda colheu flores.
  Allan comeu pastel na feira.
  Juliana fará uma pizza domingo.
""")

# A "grammatical regex": the pattern is a sequence of per-token constraints.
matcher = Matcher(nlp.vocab)
padrao = [
    # proper noun
    {'POS': "PROPN"},
    # verb
    {'POS': 'VERB'},
    # unconstrained token, optional (zero or one occurrence)
    {"OP": "?"}, # optional
    # noun
    {'POS': 'NOUN'}
]

matcher.add('Regra A', [padrao])

# Each match is a (match id, start token index, end token index) triple.
matches = matcher(doc)

for id_, start, stop in matches:
  print(doc[start:stop].text)

Eduarda colheu flores
Allan comeu pastel
Juliana fará uma pizza


In [None]:
# Same text as the previous Matcher example, except that "comeu" is now "comerá".
doc = nlp("""
  Saiu um novo jogo.
  Eduarda colheu flores.
  Allan comerá pastel na feira.
  Juliana fará uma pizza domingo.
""")

# A "grammatical regex" that also constrains morphological features.
# Relies on `Matcher` having been imported by an earlier cell.
matcher = Matcher(nlp.vocab)
padrao = [
    # proper noun whose morphology includes Gender=Masc
    {'POS': "PROPN", "MORPH": {"IS_SUPERSET": ["Gender=Masc"]}},
    {'MORPH': {"IS_SUPERSET": ["Tense=Fut"]}}, # future tense
    {"OP": "?"}, # optional
    # noun
    {'POS': 'NOUN'}
]

matcher.add('Regra A', [padrao])

# Each match is a (match id, start token index, end token index) triple.
matches = matcher(doc)

for id_, start, stop in matches:
  print(doc[start:stop].text)

Allan comerá pastel


In [None]:
# clean text
def is_valid(token):
  """Tell whether ``token`` should be kept when cleaning a text.

  A token is rejected when it is not alphabetic (``is_alpha`` is falsy), when it is
  flagged as a stop word (``is_stop``), or when its dependency label (``dep_``) is
  ``'punct'``. The checks run in that order and stop at the first one that rejects.

  Returns:
    ``True`` if the token passes every check, ``False`` otherwise.
  """
  rejected = (
      not token.is_alpha
      or token.is_stop
      or token.dep_ == 'punct'
  )
  return not rejected


# Parse the sentence explicitly instead of reusing whichever `doc` an earlier cell left
# in the kernel; otherwise the result depends on the order the cells were executed in.
doc = nlp("Ele viajará para Roma e já comprei 2 passagens.")

# Keep only the lemmas of the tokens accepted by is_valid.
clean_text = [token.lemma_ for token in doc if is_valid(token)]
print(clean_text)

['viajar', 'Roma', 'compr', 'passagem']


## Exemplo 2

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
def print_entities(model, text):
    """Process ``text`` with ``model`` and render the result with displaCy's 'ent' style.

    Args:
        model: callable pipeline; it is invoked as ``model(text)``.
        text: the text to process.

    Returns:
        None. The rendering is displayed inline (``jupyter=True``).
    """
    displacy.render(model(text), style='ent', jupyter=True)

long_text = """
Good news for consumers, undoubtedly, and good news also for investors. Apple’s recent results, covering the three months to December 31 2016, saw the company’s chief financial officer Luca Maestri announce: ‘We returned nearly $15 billion to investors through share re-purchases and dividends during the quarter.’ The quarterly dividend itself was 57 cents a share, identical to the dividend for the previous three quarters and up on the 52 cents paid for each of the four quarters before that.

Business is brisk at Apple. On January 31, Tim Cook, Apple’s chief executive, said of the last three months of 2016: ‘We’re thrilled to report that our holiday quarter results generated Apple’s highest quarterly revenue ever, and broke multiple records along the way. We sold more iPhones than ever before and set all-time revenue records for iPhone, Services, Mac and Apple Watch.’
"""

# Rebinds `nlp` to the small English pipeline; cells run after this one no longer see
# the Portuguese pipeline under that name.
nlp = load('en_core_web_sm')

# Render the entities found in the sample article.
print_entities(nlp, long_text)

## Transformers

In [None]:
# !python3 -m spacy download en_core_web_trf

In [None]:
# !pip install spacy-transformers

In [None]:
import spacy_transformers

In [None]:
# Load the spaCy transformer (roberta-base) pipeline under its own name, so it does not
# replace the `nlp` binding used by the other cells.
roberta_nlp = load("en_core_web_trf")

In [None]:
# Same sample article as before, this time processed by the transformer pipeline.
print_entities(roberta_nlp, long_text)

In [None]:
# [text] -> [transform] -> [get entities]
# [text] -> [transform] -> [get sentiment]

# [text] -> [multi-task transform] -> [get entities and sentiment]

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the 'roberta-large-mnli' checkpoint.
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

# The saved output of this cell shows this classification; the call is kept in the cell
# so that re-running the notebook reproduces it.
classifier("Hi, I'm happy to see you! Have a nice day!", ['good', 'bad'])

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'sequence': "Hi, I'm happy to see you! Have a nice day!",
 'labels': ['good', 'bad'],
 'scores': [0.9929781556129456, 0.0070218658074736595]}

In [None]:
# Score one sentence against two candidate labels; the cell displays the returned result.
sequence_to_classify = "I like you!"
candidate_labels = ['hate', 'love']
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'I like you!',
 'labels': ['love', 'hate'],
 'scores': [0.9753616452217102, 0.024638356640934944]}

In [None]:
# Score one sentence against three candidate labels. The classifier is called once:
# a second identical call only repeated the inference and discarded the first result.
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'cooking', 'dancing'],
 'scores': [0.979964017868042, 0.01060498971492052, 0.009431000798940659]}

- https://explosion.ai/blog/spacy-transformers
- https://www.youtube.com/watch?v=HLhFTwi0hDU
- https://github.com/keitazoumana/Named-Entity-Recognition/blob/main/Named%20Entity%20Recognition%20spaCy%20Transformers.ipynb
- https://github.com/huggingface/transformers