# Textacy & Spacy
Librerias de procesado de NLP

In [1]:
'''
Versiones compatibles
Despues hay que reiniciar el entorno de ejecución
'''
# !pip install spacy
# !pip install textacy

# # Para siguientes ejecuciones, solo ejecutar esto y reiniciar despues el entorno
# !python -m spacy download en_core_web_lg
# !python -m spacy download es_core_news_lg


'\nVersiones compatibles\nDespues hay que reiniciar el entorno de ejecución\n'

# Spacy
https://spacy.io/

NOTA: Recuerda reiniciar el entorno de ejecucion despues de la instalacion

In [2]:
import spacy

nlp = spacy.load('en_core_web_lg')

## Text basics
Veamos como trabajar cn estos primeros ejemplos con la libreria ´spacy´. Cosas que podemos hacer:
1. Tokenizar en frases
2. Tokenizar en palabras
3. Acceder a los atributos de cada token
4. Acceder a las entidades del texto
5. Visualizar las entidades del texto

In [5]:
# The text we want to examine
text = """
London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

doc = nlp(text)

print(doc)

for num, sentence in enumerate(doc.sents):
  print(num, sentence)



London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.

0 
London is the capital and most populous city of England and 
the United Kingdom.  
1 Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia.
2 It was founded by the Romans, who named it Londinium.



In [6]:
len(list(doc.sents))

3

In [7]:
for word in doc[:20]:
  print(word.text, word.lemma_, word.pos_, word.is_stop)


 
 SPACE False
London London PROPN False
is be AUX True
the the DET True
capital capital NOUN False
and and CCONJ True
most most ADV True
populous populous ADJ False
city city NOUN False
of of ADP True
England England PROPN False
and and CCONJ True

 
 SPACE False
the the DET True
United United PROPN False
Kingdom Kingdom PROPN False
. . PUNCT False
    SPACE False
Standing stand VERB False
on on ADP True


In [3]:
type(doc)

spacy.tokens.doc.Doc

## Syntactic analysis
Doing the school homework

In [9]:
from spacy import displacy

doc2 = nlp("London is the capital and most populous city of England and the United Kingdom")
displacy.render(doc2, jupyter=True, style="dep")

## Entities in text

In [11]:
for entity in doc.ents:
  print(entity.text, entity.label_, spacy.explain(entity.label_))

London GPE Countries, cities, states
England GPE Countries, cities, states
the United Kingdom GPE Countries, cities, states
the River Thames LOC Non-GPE locations, mountain ranges, bodies of water
south east LOC Non-GPE locations, mountain ranges, bodies of water
Great Britain GPE Countries, cities, states
London GPE Countries, cities, states
two millennia DATE Absolute or relative dates or periods
Romans NORP Nationalities or religious or political groups
Londinium GPE Countries, cities, states


In [12]:
# Doubts with labels
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [13]:
displacy.render(doc, style='ent', jupyter=True)

## Replacing names
Hide names for GDPR

In [14]:
# Replace a token with "REDACTED" if it is a name
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "GDPR"
    else:
        return token.text

# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_name_with_placeholder, doc)
    return " ".join(tokens)

s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

print(scrub(s))


 In 1950 , GDPR published his famous article " Computing Machinery and Intelligence " . In 1957 , GDPR ’s 
 Syntactic Structures revolutionized Linguistics with ' universal grammar ' , a rule based system of syntactic structures . 



## Lematize

In [15]:
for w in doc:
  print(w.text, w.lemma_, w.pos_)


 
 SPACE
London London PROPN
is be AUX
the the DET
capital capital NOUN
and and CCONJ
most most ADV
populous populous ADJ
city city NOUN
of of ADP
England England PROPN
and and CCONJ

 
 SPACE
the the DET
United United PROPN
Kingdom Kingdom PROPN
. . PUNCT
    SPACE
Standing stand VERB
on on ADP
the the DET
River River PROPN
Thames Thames PROPN
in in ADP
the the DET
south south PROPN
east east PROPN

 
 SPACE
of of ADP
the the DET
island island NOUN
of of ADP
Great Great PROPN
Britain Britain PROPN
, , PUNCT
London London PROPN
has have AUX
been be AUX
a a DET
major major ADJ
settlement settlement NOUN

 
 SPACE
for for ADP
two two NUM
millennia millennium NOUN
. . PUNCT
It it PRON
was be AUX
founded found VERB
by by ADP
the the DET
Romans Romans PROPN
, , PUNCT
who who PRON
named name VERB
it it PRON
Londinium Londinium PROPN
. . PUNCT

 
 SPACE


## Stopwords

In [16]:
from spacy.lang.en.stop_words import STOP_WORDS

print(list(STOP_WORDS)[:20])

['upon', 'hereupon', 'however', 'top', 'further', 'i', 'quite', 'where', 'well', 'due', 'us', 'put', 'often', 'its', 'not', 'make', 'why', 'even', 'first', 'six']


In [17]:
lista_clean = [palabra for palabra in doc if not palabra.is_stop and not palabra.is_punct]
print(lista_clean)

[
, London, capital, populous, city, England, 
, United, Kingdom,  , Standing, River, Thames, south, east, 
, island, Great, Britain, London, major, settlement, 
, millennia, founded, Romans, named, Londinium, 
]


# Spanish
## Spacy  and entities

In [18]:
nlp_es = spacy.load('es_core_news_lg')

text = '''Londres (en inglés, London, pronunciado /ˈlʌndən/ ( escuchar)) es la capital y mayor ciudad de Inglaterra y del Reino Unido.2​3​ Situada a orillas del río Támesis, Londres es un importante asentamiento humano desde que fue fundada por los romanos con el nombre de Londinium hace casi dos milenios.4​ El núcleo antiguo de la urbe, la City de Londres, conserva básicamente su perímetro medieval de una milla cuadrada. Desde el siglo XIX el nombre «Londres» también hace referencia a toda la metrópolis desarrollada alrededor de este núcleo.5​ El grueso de esta conurbación forma la región de Londres y el área administrativa del Gran Londres,6​ gobernado por el alcalde y la asamblea de Londres.7​
Londres es una ciudad global, uno de los centros neurálgicos en el ámbito de las artes, el comercio, la educación, el entretenimiento, la moda, las finanzas, los medios de comunicación, la investigación, el turismo o el transporte.8​ Es el principal centro financiero del mundo9​10​11​ y una de las áreas metropolitanas con mayor PIB.12​13​ Londres es también una capital cultural mundial,14​15​16​17​ la ciudad más visitada considerando el número de visitas internacionales18​ y tiene el mayor sistema aeroportuario del mundo según el tráfico de pasajeros.19​ Asimismo, las 43 universidades de la ciudad conforman la mayor concentración de centros de estudios superiores de toda Europa.20​ En el año 2012 Londres se convirtió en la única ciudad en albergar la celebración de tres Juegos Olímpicos de Verano.21​
En esta ciudad multirracial convive gente de un gran número de culturas que hablan más de trescientos idiomas distintos.22​ La Autoridad del Gran Londres estima que en 2015 la ciudad tiene 8,63 millones de habitantes,23​ que supone el 12,5 % del total de habitantes del Reino Unido.24​ El área urbana del Gran Londres, con 10 470 00025​ habitantes, es la segunda más grande de Europa, pero su área metropolitana, con una población estimada de entre 12 y 14 millones,26​27​ es la mayor del continente. Desde 1831 a 1925 Londres, como capital del Imperio británico, fue la ciudad más poblada del mundo.'''
doc = nlp_es(text)

# Para ver las entidades que ha detectado
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

Londres (LOC)
London (LOC)
Inglaterra (LOC)
Reino Unido.2​3​ Situada (LOC)
Támesis (LOC)
Londres (LOC)
Londinium (LOC)
City de Londres (LOC)
Londres (LOC)
Londres (LOC)
Gran Londres,6​ (LOC)
Londres (LOC)
mundo9​10​11​ (LOC)
Londres (LOC)
Londres (LOC)
Juegos Olímpicos de Verano.21​
En esta ciudad multirracial (MISC)
La Autoridad del Gran Londres (MISC)
Reino Unido.24​ El área urbana del (LOC)
Gran Londres (LOC)
Europa (LOC)
millones,26​27​ (PER)
Londres (LOC)
Imperio británico (LOC)


In [16]:
# Doubts with labels
spacy.explain('MISC')

'Miscellaneous entities, e.g. events, nationalities, products or works of art'

## Most frequent words
In a Wikipedia page

In [15]:
# !pip install wikipedia

In [19]:
from spacy.lang.es.stop_words import STOP_WORDS

In [20]:
list(STOP_WORDS)[:20]

['eramos',
 'propio',
 'eran',
 'dia',
 'cuántos',
 'alli',
 'hacerlo',
 'consiguen',
 'salvo',
 'hacia',
 'realizar',
 'contra',
 'dicho',
 'tres',
 'adelante',
 'propia',
 'bien',
 'mismo',
 'es',
 'quiza']

In [27]:
import wikipedia
wikipedia.set_lang("en")
wiki = wikipedia.page(title="Cristiano Ronaldo")
text = wiki.content
text[0:1000]

"Cristiano Ronaldo dos Santos Aveiro  (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaldu] ; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won numerous individual accolades throughout his career, such as five Ballon d'Or awards, a record three UEFA Men's Player of the Year Awards, four European Golden Shoes, and was named five times the world's best player by FIFA, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, most appearances (30), assists (8), goals in the European Championship (14), international appearances (217) and international goals (135). He 

In [30]:
nombres = [w.text.lower() for w in nlp(text) if ((not w.is_stop) and (not w.is_punct) and (w.pos_ == 'ADJ'))]
nombres[:10]

['portuguese',
 'kɾiʃˈtjɐnu',
 'portuguese',
 'professional',
 'national',
 'greatest',
 'numerous',
 'individual',
 'best',
 'european']

In [31]:
from collections import Counter
word_freq = Counter(nombres)

word_freq.most_common(10)[1:]

[('second', 19),
 ('consecutive', 13),
 ('portuguese', 12),
 ('final', 12),
 ('best', 11),
 ('highest', 11),
 ('spanish', 9),
 ('free', 8),
 ('personal', 8)]

In [38]:
wikipedia.set_lang("es")
wiki = wikipedia.page(title="Regresión Lineal")
text = wiki.content
text[0:1000]

'En estadística, la regresión lineal o ajuste lineal es un modelo matemático usado para aproximar la relación de dependencia entre una variable dependiente \n  \n    \n      \n        Y\n      \n    \n    {\\displaystyle Y}\n  \n, \n  \n    \n      \n        m\n      \n    \n    {\\displaystyle m}\n  \n variables independientes \n  \n    \n      \n        \n          X\n          \n            i\n          \n        \n      \n    \n    {\\displaystyle X_{i}}\n  \n con \n  \n    \n      \n        m\n        ∈\n        \n          \n            Z\n          \n          \n            +\n          \n        \n      \n    \n    {\\displaystyle m\\in \\mathbb {Z} ^{+}}\n  \n y un término aleatorio \n  \n    \n      \n        ε\n      \n    \n    {\\displaystyle \\varepsilon }\n  \n. Este método es aplicable en muchas situaciones en las que se estudia la relación entre dos o más variables o predecir un comportamiento, algunas incluso sin relación con la tecnología. En caso de que no se pueda 

In [39]:
nombres = [w.lemma_ for w in nlp_es(text) if ((not w.is_stop) and (not w.is_punct) and (w.pos_ == 'ADJ'))]
nombres[:10]

['lineal',
 'lineal',
 'matemático',
 'usado',
 'variable',
 'dependiente',
 'm',
 'independiente',
 '^{+',
 'aleatorio']

In [40]:
from collections import Counter
word_freq = Counter(nombres)

word_freq.most_common(10)[1:]

[('lineal', 48),
 ('variable', 19),
 ('obtenido', 11),
 ('múltiple', 8),
 ('independiente', 7),
 ('simple', 7),
 ('mínimo', 7),
 ('dependiente', 6),
 ('aleatorio', 6)]

## Textacy London text
No we want to know things about London, from our previous text

In [45]:
# import spacy
import textacy.extract

#### CODE ####

# The text we want to examine
text = """London is the capital and most populous city of England and the United Kingdom. Standing on the River Thames in the south east of the island of Great Britain, London has been a major settlement for two millennia. London was founded by the Romans, 
who named it Londinium. London is a huge city. London has a lot of cute restaurants. Jaimito is my best friend.
"""

doc = nlp(text)
statements = textacy.extract.semistructured_statements(doc, entity="London", cue='be')

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")


Here are the things I know about London:
 - [the, capital, and, most, populous, city, of, England, and, the, United, Kingdom]
 - [a, major, settlement, for, two, millennia]
 - [a, huge, city]


In [44]:
# nlp_es = spacy.load('es_core_news_lg')

# doc = nlp(wiki.content)

# statements = textacy.extract.semistructured_statements(doc, entity="Benzema", cue='be')

# # Print the results
# print("Here are the things I know about Karim:")

# for statement in statements:
#     subject, verb, fact = statement
#     print(f" - {fact}")


## Textacy with Wikipedia API
We don't want the most frequent words, we want sentences about London

In [46]:
wikipedia.set_lang("en")
london = wikipedia.page("London")
text = london.content

In [47]:
doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, entity="London Underground", cue='be')

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:
 - [the, world, 's, oldest, rapid, transit, system]
 - [the, oldest, and, third, longest, metro, system, in, the, world]
