In [2]:
import spacy
from spacy.matcher import Matcher

In [3]:
nlp = spacy.load("en_core_web_sm")

In [5]:
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}] #extracai de tudo que se parece um email
matcher.add("EMAIL_ADDRESS", [pattern])

In [6]:
doc = nlp("This is an email address: higormiler55@gmail.com")
matches =  matcher(doc)

In [7]:
print(matches) #lexeme, token inicial e o token final

[(16571425990740197027, 6, 7)]


In [8]:
print(nlp.vocab[matches[0][0]].text) #corresponde a um endereco de email

EMAIL_ADDRESS


In [9]:
with open("wiki_mlk.txt") as f:
    text = f.read()

In [10]:
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 â€“ April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famou

In [11]:
nlp = spacy.load("en_core_web_sm")

In [12]:
matcher = Matcher(nlp.vocab)  # cria uma instancia do Matcher usando o vocabulario do modelo nlp
pattern = [{"POS":"PROPN"}]  # define um padrao para capturar substantivos proprios 
matcher.add("PROPER_NOUN", [pattern])  # adiciona o padrão ao matcher para que ele reconheça substantivos proprios
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 50, 51) King


In [13]:
matcher = Matcher(nlp.vocab)  # cria uma instancia do Matcher usando o vocabulario do modelo nlp
pattern = [{"POS":"PROPN", "OP": "+"}]  # define um padrao para capturar substantivos proprios que aparecem uma ou mais vezes
matcher.add("PROPER_NOUN", [pattern])  # adiciona o padrão ao matcher para que ele reconheça substantivos proprios
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [14]:
matcher = Matcher(nlp.vocab)  # cria uma instancia do Matcher usando o vocabulario do modelo nlp
pattern = [{"POS":"PROPN", "OP": "+"}]  # define um padrao para capturar substantivos proprios que aparecem uma ou mais vezes
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")  # adiciona o padrão ao matcher para que ele reconheça substantivos proprios
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(451313080118390996, 84, 89) Martin Luther King Sr.
(451313080118390996, 470, 475) Martin Luther King Jr. Day
(451313080118390996, 537, 542) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 129, 133) Southern Christian Leadership Conference
(451313080118390996, 248, 252) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 326, 329) Nobel Peace Prize
(451313080118390996, 423, 426) James Earl Ray
(451313080118390996, 464, 467) Congressional Gold Medal


In [15]:
matcher = Matcher(nlp.vocab)  # cria uma instancia do Matcher usando o vocabulario do modelo nlp
pattern = [{"POS":"PROPN", "OP": "+"}]  # define um padrao para capturar substantivos proprios que aparecem uma ou mais vezes
matcher.add("PROPER_NOUN", [pattern])  # adiciona o padrão ao matcher para que ele reconheça substantivos proprios
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
matches.sort(key = lambda x: x[1]) #ordenar a lista de matches baseada no segundo valor de cada tupla
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 2) Luther
(451313080118390996, 1, 3) Luther King
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 3) King
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [16]:
matcher = Matcher(nlp.vocab)  # cria uma instancia do Matcher usando o vocabulario do modelo nlp
pattern = [{"POS":"PROPN", "OP": "+"}, {"POS": "VERB"}]  # define um padrao para capturar substantivos proprios e verbos que aparecem uma ou mais vezes
matcher.add("PROPER_NOUN", [pattern])  # adiciona o padrão ao matcher para que ele reconheça substantivos proprios
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
matches.sort(key = lambda x: x[1]) #ordenar a lista de matches baseada no segundo valor de cada tupla
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(451313080118390996, 50, 52) King advanced
(451313080118390996, 90, 92) King participated
(451313080118390996, 114, 116) King led
(451313080118390996, 168, 170) King helped
(451313080118390996, 248, 253) Director J. Edgar Hoover considered
(451313080118390996, 249, 253) J. Edgar Hoover considered
(451313080118390996, 250, 253) Edgar Hoover considered
(451313080118390996, 251, 253) Hoover considered
(451313080118390996, 323, 325) King won
(451313080118390996, 486, 489) United States beginning


In [17]:
import json #abrindo arquivo json
with open("alice.json", "r") as f:
    data = json.load(f)

In [20]:
text = data[0][2][0] #pegando uma parte expecifica do texto
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


In [21]:
text = text.replace("`","'") #substituindo pontuacao
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [26]:
matcher = Matcher(nlp.vocab) 
pattern = [{"ORTH":"'"},  #qualquer coisa q comece com aspas
           {"IS_ALPHA":True, "OP": "+"}, #serie de caracteres alfabetico
           {"IS_PUNCT":True,"OP":"*"}, #tem uma pontuacao como virgula
           {"ORTH":"'"} #e fecha as aspas
          ] 
matcher.add("PROPER_NOUNS", [pattern])  
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
matches.sort(key = lambda x: x[1]) #ordenar a lista de matches baseada no segundo valor de cada tupla
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(3232560085755078826, 47, 58) 'and what is the use of a book,'
(3232560085755078826, 57, 61) ' thought Alice '
(3232560085755078826, 60, 67) 'without pictures or conversation?'


In [30]:
speak_lemmas = ["think", "say"] #lista lemmas relacionadas a fala
matcher = Matcher(nlp.vocab) 
pattern = [{"ORTH":"'"},  #qualquer coisa q comece com aspas
           {"IS_ALPHA":True, "OP": "+"}, #serie de caracteres alfabetico
           {"IS_PUNCT":True,"OP":"*"}, #tem uma pontuacao como virgula
           {"ORTH":"'"}, #e fecha as aspas
           {"POS": "VERB", "LEMMA":{"IN":speak_lemmas}}, #um verbo cujo lema esta na lista de lemmas
           {"POS": "PROPN","OP":"+"}, #mostra quem e a pessoa por tras da frase
           {"ORTH":"'"},  #copiar o q foi feito anterioemnte para extrair com sucesso toda a citacao
           {"IS_ALPHA":True, "OP": "+"}, 
           {"IS_PUNCT":True,"OP":"*"}, 
           {"ORTH":"'"}
          ] 
matcher.add("PROPER_NOUNS", [pattern])  
doc = nlp(text) 
matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
matches.sort(key = lambda x: x[1]) #ordenar a lista de matches baseada no segundo valor de cada tupla
print(len(matcher))  # exibe o número de padroes no matcher, neste caso o 1
for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
    print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [34]:
for text in data[0][2]:
    text = text.replace("`","'") #substituindo pontuacao
    doc = nlp(text) 
    matches = matcher(doc)  # aplica o matcher no documento doc, encontrando as ocorrencias que correspondem ao padrao
    print(len(matches)) #nao ha nenhum exemplo de aspas depois do primeiro capitulo
    matches.sort(key = lambda x: x[1]) #ordenar a lista de matches baseada no segundo valor de cada tupla
    for match in matches[:10]:  # Itera sobre os primeiros 10 matches encontrados
        print(match, doc[match[1]: match[2]])  # Exibe o match e o texto correspondente no documento (substantivo proprio)


1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [35]:
matcher = Matcher(nlp.vocab) #inicializa o Matcher com o vocabulario do modelo

# define os padroes que o Matcher vai procurar
pattern1 = [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
pattern2 = [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
pattern3 = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB", "LEMMA": {"IN": ["speak", "talk"]}}, {'ORTH': "'"}]

matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST') # adiciona os padroes ao Matcher

data = [
    ["Exemplo de texto", "Outro texto", ["'Hello, world!'", "'This is a test.'", "'John speaks.'"]]
]

# Itera sobre os textos e aplica o Matcher
for text in data[0][2]:
    text = text.replace('"', "'") # substitui aspas duplas por aspas simples 
    doc = nlp(text)# processa o texto com o modelo de linguagem
    matches = matcher(doc)# aplica o Matcher ao documento
    matches.sort(key=lambda x: x[1])# ordena as correspondências pelo inicio do match
    print(len(matches))# exibe o nemero de correspondencias encontradas
    for match in matches[:10]: #exibe as primeiras 10 correspondencias
        print(match, doc[match[1]:match[2]])

0
1
(3232560085755078826, 0, 7) 'This is a test.'
1
(3232560085755078826, 0, 5) 'John speaks.'
