Text Mining - 4. SpaCy

AFI - Máster en Data Science y Big Data

Juan de Dios Romero Palop

Abril 2022


In [None]:
# Run the spaCy command-line module with no sub-command.
# NOTE(review): presumably a quick check that spaCy is installed for the
# `python` found on PATH — confirm that it is the same interpreter as the kernel.
!python -m spacy

Modelos de spaCy disponibles para español (este notebook usa `es_core_news_sm`): https://spacy.io/models/es#es_core_news_md

In [None]:
!python -m spacy download es_core_news_sm

### Tokenización de frases

In [None]:
import spacy

raw_text = 'Hello, world. This is an example.'

# Load the English pipeline by its full package name: the 'en' shortcut link
# is not available in spaCy v3, where spacy.load('en') fails.
nlp_en = spacy.load('en_core_web_sm')
doc = nlp_en(raw_text)

# Span.string was removed in spaCy v3; Span.text is the supported accessor.
# After .strip() the resulting sentences are the same as before.
sentences = [sent.text.strip() for sent in doc.sents]
sentences

In [None]:
import es_core_news_sm

# Load the Spanish pipeline from its installed model package.
nlp = es_core_news_sm.load()
texto = '''Primera parte del Ingenioso hidalgo Don Quijote de la Mancha.
Capitulo Primero. Que trata de la condicion y ejercicio del famoso hidalgo don Quijote de la Mancha.'''

doc = nlp(texto)

# Span.string was removed in spaCy v3; Span.text is the supported accessor.
# Each sentence span is stripped of surrounding whitespace (including the
# newline embedded in `texto`).
frases = [sent.text.strip() for sent in doc.sents]
frases

In [None]:
# Sentence splitting on Spanish text with inverted punctuation and an
# abbreviation ("D.").
texto_esp = "¡Hola! ¿Cómo te llamas? Me llamo D. Tomás."
doc = nlp(texto_esp)

# Span.string was removed in spaCy v3; Span.text is the supported accessor.
frases = [sent.text.strip() for sent in doc.sents]
frases

### Tokenización de palabras

In [None]:
# Token.string was removed in spaCy v3. Token.text yields the bare token;
# use Token.text_with_ws instead if the trailing whitespace is wanted.
palabras = [token.text for token in doc]
palabras

### Part Of Speech

In [None]:
# Fresh Spanish pipeline, applied to the sample sentence.
nlp = es_core_news_sm.load()
texto_esp = "¡Hola! ¿Cómo te llamas? Me llamo D. Tomás."
doc = nlp(texto_esp)

# Print one (token text, pos_) pair per token.
print([
    (token.text, token.pos_)
    for token in doc
])

In [None]:
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook; the 'distance'
# setting is handed to displaCy through its options dict.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [None]:
# Fresh Spanish pipeline, applied to the sample sentence.
nlp = es_core_news_sm.load()
texto_esp = "¡Hola! ¿Cómo te llamas? Me llamo D. Tomás."
doc = nlp(texto_esp)

# Print one (token text, tag_) pair per token.
print([
    (token.text, token.tag_)
    for token in doc
])

### Stemming

SpaCy no implementa stemming.

### Lematización

In [None]:
# Fresh Spanish pipeline, applied to the greeting sentence.
nlp = es_core_news_sm.load()
texto_esp = "¡Hola! ¿Cómo te llamas? Me llamo D. Tomás."
doc = nlp(texto_esp)

# Cell output: the lemma of every token, in document order.
[
    token.lemma_
    for token in doc
]

In [None]:
# Fresh Spanish pipeline, applied to a sentence with several verb tenses.
nlp = es_core_news_sm.load()
texto_esp = "estamos jugando con las palabras. Ayer estuve en el pasado y mañana trabajaremos los futuros."
doc = nlp(texto_esp)

# Cell output: the lemma of every token, in document order.
[
    token.lemma_
    for token in doc
]

In [None]:
# "traje" appears twice in this sentence; compare how each occurrence is tagged
# and lemmatised.
nlp = es_core_news_sm.load()
texto_esp = "yo no traje el traje"
doc = nlp(texto_esp)

# Show POS and lemma side by side for each token. A notebook cell only displays
# its last expression, so a standalone POS list placed before the lemma list
# would be evaluated and silently discarded.
[(word.text, word.pos_, word.lemma_) for word in doc]

### NER

In [None]:
# English pipeline for the named-entity example.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Microsoft Corp. is looking at buying United Kingdom startup for 175 million American Dollars, Peter Parker from NY Times says"
)

# One line per entity: text, start character offset, end character offset, label.
for ent in doc.ents:
    print(
        ent.text,
        ent.start_char,
        ent.end_char,
        ent.label_,
    )

In [None]:
# Highlight the named entities of `doc` inline in the notebook.
# displacy.render(..., jupyter=True) is used rather than displacy.serve, which
# starts a web server and is meant for use outside a notebook.
# Reference: https://spacy.io/usage/visualizers
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Spanish pipeline for the named-entity example.
nlp = es_core_news_sm.load()
doc = nlp(
    '''Iñaki Gabilondo insiste en que Iberia, la mayor aerolinea de España, esta preparando una oferta de 700 millones por la aerolinea mexicana Aeromexico'''
)

# One line per entity: text, start character offset, end character offset, label.
for ent in doc.ents:
    print(
        ent.text,
        ent.start_char,
        ent.end_char,
        ent.label_,
    )

### Stopwords

In [None]:
# Import the English stop-word collection explicitly. Reaching it through
# `spacy.lang.en.stop_words` by attribute access only works if something else
# has already imported that submodule (e.g. loading an English pipeline).
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = STOP_WORDS

In [None]:
# Preview ten stop words. Sorting first makes the sample reproducible: the
# iteration order of a set of strings is not stable across kernel sessions.
sorted(stopwords)[:10]

In [None]:
# Sample English paragraph used for the stop-word filtering cells below.
text = '''In computing, stop words are words which are filtered out before or after processing of natural language data (text).
 Though "stop words" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. 
 Some tools specifically avoid removing these stop words to support phrase search.'''
# Load the English pipeline by its full package name: the 'en' shortcut link
# is not available in spaCy v3, where spacy.load('en') fails.
nlp_en = spacy.load('en_core_web_sm')
doc = nlp_en(text)

In [None]:
# Keep the text of every token that is not flagged as a stop word.
tokens = [
    tok.text
    for tok in doc
    if not tok.is_stop
]

# Show the input paragraph, a blank separator line, then the surviving tokens.
print('Original: %s' % text)
print()
print(tokens)

In [None]:
# Extra words to flag as stop words on the English pipeline's vocabulary.
customize_stop_words = ['computing', 'filtered']
for w in customize_stop_words:
    nlp_en.vocab[w].is_stop = True

# Re-process the paragraph and filter again with the updated flags.
doc = nlp_en(text)
tokens = [
    tok.text
    for tok in doc
    if not tok.is_stop
]

print('Original: %s' % text)
print()
print(tokens)

In [None]:
# Load a fresh Spanish pipeline and process the Quijote excerpt; `nlp`, `texto`
# and `doc` are read by the stop-word filtering cells that follow.
nlp = es_core_news_sm.load()
texto = '''Primera parte del Ingenioso hidalgo Don Quijote de la Mancha.
Capitulo Primero. Que trata de la condicion y ejercicio del famoso hidalgo don Quijote de la Mancha.'''

doc = nlp(texto)

In [None]:
# Keep the text of every token that is not flagged as a stop word.
tokens = [
    tok.text
    for tok in doc
    if not tok.is_stop
]

# Show the input text, a blank separator line, then the surviving tokens.
print('Original: %s' % texto)
print()
print(tokens)

In [None]:
# Extra words to flag as stop words on the Spanish pipeline's vocabulary.
# The flag is set on the vocabulary entry looked up with each exact string.
customize_stop_words = ['Don', 'y']
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

# Re-process the excerpt and filter again with the updated flags.
doc = nlp(texto)
tokens = [
    tok.text
    for tok in doc
    if not tok.is_stop
]

print('Original: %s' % texto)
print()
print(tokens)

### Extra info

In [None]:
# English pipeline: dump a selection of per-token attributes.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Columns: text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop.
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )

In [None]:
# Spanish pipeline: dump a selection of per-token attributes.
nlp = es_core_news_sm.load()
doc = nlp("CaixaBank reclama a Abanca que mejore la oferta por su 15,3% en Nueva Pescanova")

# Columns: text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop.
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )

In [None]:
# English pipeline: list the noun chunks of a sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Columns: chunk text, root token text, root dependency label, text of the root's head.
for chunk in doc.noun_chunks:
    print(
        chunk.text,
        chunk.root.text,
        chunk.root.dep_,
        chunk.root.head.text,
    )

In [None]:
# Spanish pipeline: list the noun chunks of a sentence.
nlp = es_core_news_sm.load()
doc = nlp("El coronavirus traspasa fronteras y aumenta su impacto en la economía")

# Columns: chunk text, root token text, root dependency label, text of the root's head.
for chunk in doc.noun_chunks:
    print(
        chunk.text,
        chunk.root.text,
        chunk.root.dep_,
        chunk.root.head.text,
    )