In [None]:
# install the requirements
# pip install flair

# `flair`: la librería NLP de Zalando Research

La compañia Zalando tiene necesidades de aplicar NLP en distintos ámbitos y su equipo de investigaación ha liberado recientemente [`flair`](https://github.com/zalandoresearch/flair), su librería de NLP.

`flair` permite acceder a funcionalidades muy interesantes para procesar lenguaje natural, algunas de ellas muy modernas como:

- [etiquetar morfo-sintácticamente](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md)
- extraer entidades
- clasificar automáticamente texto
- entrenar tus propios modelos para [construir otros clasificadores](https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f)
- [cargar vectores de palabras en decenas de lenguas](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md)
- [usar vectores contextuales como BERT, ELMo](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md)

Veamos cómo podemos acceder a algunas de sus funcionalidades.

## Análisis morfo-sintáctico

Para analizar sintácticamente un texto, necesitamos cargar un etiquetador con un modelo concreto de información morfo-sintáctica. Por ejemplo, uno capaz de analizar varias lenguas.


In [2]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 7.7 MB/s 
[?25hCollecting deprecated>=1.2.4
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 49.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 56.0 MB/s 
[?25hCollecting conllu>=4.0
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting wikipedia-api
  Downloading Wikipedia-API-0.5.4.tar.gz (18 kB)
Collecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB

In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger

# cargamos el analizador multi-idioma
tagger = SequenceTagger.load("pos-multi-fast")



Downloading:   0%|          | 0.00/72.1M [00:00<?, ?B/s]

2022-12-13 07:00:37,924 loading file /root/.flair/models/upos-multi-fast/db3a6189888a201a507dd9cae73ef157bf3e8d27cf0c5b6c96e9a175a77d70e3.83bafee89d19e198771b16525069a2df5e0a8cc35c23298f7844c7a636eadc13
2022-12-13 07:00:38,107 SequenceTagger predicts: Dictionary with 21 tags: <unk>, O, PROPN, PUNCT, ADJ, NOUN, VERB, DET, ADP, AUX, PRON, PART, SCONJ, NUM, ADV, CCONJ, X, INTJ, SYM, <START>, <STOP>


In [4]:
sentence1 = Sentence(
    "Facebook nació hace década y media tras una noche de copas de Mark Zuckerberg. "
)
tagger.predict(sentence1)
# imprimimos el análisis
print(sentence1.to_tagged_string())

sentence2 = Sentence(
    "Grand débat national: suivez Emmanuel Macron en direct de Bordeaux. "
)
tagger.predict(sentence2)
# imprimimos el análisis
print(sentence2.to_tagged_string())

sentence3 = Sentence(
    "Hier an der Zufahrt zur Startrampe 39A, wo vor 50 Jahren die gigantischen Saturn-Raketen der Apollo-Mondmissionen im Schneckentempo vorbeigefahren sind, prangen nun die blauen Lettern des Raumfahrtunternehmens von Elon Musk an einem Hangar."
)
tagger.predict(sentence3)
# imprimimos el análisis
print(sentence3.to_tagged_string())

Sentence: "Facebook nació hace década y media tras una noche de copas de Mark Zuckerberg ." → ["Facebook"/PROPN, "nació"/VERB, "hace"/VERB, "década"/NOUN, "y"/CCONJ, "media"/NOUN, "tras"/ADP, "una"/DET, "noche"/NOUN, "de"/ADP, "copas"/NOUN, "de"/ADP, "Mark"/PROPN, "Zuckerberg"/PROPN, "."/PUNCT]
Sentence: "Grand débat national : suivez Emmanuel Macron en direct de Bordeaux ." → ["Grand"/ADJ, "débat"/NOUN, "national"/ADJ, ":"/PUNCT, "suivez"/VERB, "Emmanuel"/PROPN, "Macron"/PROPN, "en"/ADP, "direct"/NOUN, "de"/ADP, "Bordeaux"/PROPN, "."/PUNCT]
Sentence: "Hier an der Zufahrt zur Startrampe 39A , wo vor 50 Jahren die gigantischen Saturn-Raketen der Apollo-Mondmissionen im Schneckentempo vorbeigefahren sind , prangen nun die blauen Lettern des Raumfahrtunternehmens von Elon Musk an einem Hangar ." → ["Hier"/ADV, "an"/ADP, "der"/DET, "Zufahrt"/NOUN, "zur"/ADP, "Startrampe"/NOUN, "39A"/PROPN, ","/PUNCT, "wo"/ADV, "vor"/ADP, "50"/NUM, "Jahren"/NOUN, "die"/DET, "gigantischen"/ADJ, "Saturn-Raket

## Reconocimiento de entidades

Para el reconocimiento de entidades varios modelos en diferentes lenguas. Aquí probamos con uno entrenado solo para inglés.

In [5]:
from flair.data import Sentence
from flair.models import SequenceTagger

# cargamos el reconocedor de entidades
tagger = SequenceTagger.load("ner-fast")



Downloading:   0%|          | 0.00/257M [00:00<?, ?B/s]

2022-12-13 07:00:51,258 loading file /root/.flair/models/ner-english-fast/4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611
2022-12-13 07:00:54,369 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [6]:
# analizamos una oración
sentence = Sentence(
    "Behind closed doors, freshman Rep. Alexandria Ocasio-Cortez threatened to put those voting with Republicans “on a list” for a primary challenge in the 2020 election."
)
tagger.predict(sentence)

# imprimimos el análisis
print(sentence.to_tagged_string())

# iteramos por la entidades
for entity in sentence.get_spans("ner"):
    print(entity)

# o imprimimos la estructura de datos con el análisis completo
print(sentence.to_dict(tag_type="ner"))

Sentence: "Behind closed doors , freshman Rep . Alexandria Ocasio-Cortez threatened to put those voting with Republicans “ on a list ” for a primary challenge in the 2020 election ." → ["Alexandria Ocasio-Cortez"/PER, "Republicans"/MISC]
Span[7:9]: "Alexandria Ocasio-Cortez" → PER (0.9985)
Span[15:16]: "Republicans" → MISC (0.9997)
{'text': 'Behind closed doors, freshman Rep. Alexandria Ocasio-Cortez threatened to put those voting with Republicans “on a list” for a primary challenge in the 2020 election.', 'ner': [{'value': 'PER', 'confidence': 0.9985336065292358}, {'value': 'MISC', 'confidence': 0.9996659755706787}]}


In [7]:
entity.tag

'MISC'

## Análisis de Opinión

También podemos utilizar un clasificador de textos y cargar el modelo entrenado con sentimiento, para poder detectar opiniones positivas y negativas.

In [8]:
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load("es-sentiment")

2022-12-13 07:01:31,229 loading file es-sentiment


FileNotFoundError: ignored

In [None]:
sentence1 = Sentence("I love ice-cream!")
classifier.predict(sentence1)
print("La frase '{}' es {}".format(sentence1.to_plain_string(), sentence1.labels))

sentence2 = Sentence("Don't ever go to this restaurant. The food was horrible :-(")
classifier.predict(sentence2)
print("La frase '{}' es {}".format(sentence2.to_plain_string(), sentence2.labels))

# TextBlob

In [2]:
from textblob import TextBlob

In [3]:
t_es = TextBlob(
    "La deuda pública ha marcado nuevos récords en España en el tercer trimestre"
)

In [4]:
print(t_es.translate(from_lang="es", to="en"))

Public debt has marked new records in Spain in the third quarter


In [11]:
# análisis de opinión
sent_es_str = "A mi no me gusta mucho, es terrible y horrible"
sent_es_str = "A mi me gusta mucho, es linda y bueno"
t_es = TextBlob(sent_es_str)
t_en = t_es.translate(from_lang="es", to="en")
print(f'ESPANOL: {sent_es_str}')
print(f'ENGLISH: {t_en}')
print(t_en.sentiment)

ESPANOL: A mi no me gusta mucho, es terrible y horrible
ENGLISH: I don't like it very much, it's terrible and horrible
Sentiment(polarity=-1.0, subjectivity=1.0)


# VADER


In [23]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 12.0 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
analyzer = SentimentIntensityAnalyzer()

In [25]:
vs = analyzer.polarity_scores("This table is black")
print(vs)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [29]:
# análisis de opinión
sent_es_str = "A mi no me gusta mucho, es terrible y horrible"
sent_es_str = "A mi me gusta mucho, es linda y bueno"
sent_es_str = "Es una cosa"
t_es = TextBlob(sent_es_str)
t_en = t_es.translate(from_lang="es", to="en")
print(f'ESPANOL: {sent_es_str}')
print(f'ENGLISH: {t_en}')

vs = analyzer.polarity_scores(t_en)
print(vs)

if vs['compound'] > 0.2:
  pol_int = 1
elif vs['compound'] < -0.2:
  pol_int = -1 
else:
  pol_int = 0

print(f'pol_int: {pol_int}')
# .format(sentence1.to_plain_string(), sentence1.labels))

ESPANOL: Es una cosa
ENGLISH: It is one thing
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
pol_int: 0


# Flair

In [13]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 14.8 MB/s 
[?25hCollecting conllu>=4.0
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 59.7 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 61.6 MB/s 
[?25hCollecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.8 MB/s 
[?25hCollecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
[?25hCollecting transformers>=4.0.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 54.1 

In [15]:
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load("en-sentiment")

2022-12-13 07:38:27,037 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmp1eghbztq


100%|██████████| 265512723/265512723 [00:07<00:00, 36906944.78B/s]

2022-12-13 07:38:34,298 copying /tmp/tmp1eghbztq to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2022-12-13 07:38:35,055 removing temp file /tmp/tmp1eghbztq
2022-12-13 07:38:35,136 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
# análisis de opinión
sent_es_str = "A mi no me gusta mucho, es terrible y horrible"
# sent_es_str = "A mi me gusta mucho, es linda y bueno"
sent_es_str = "Es malo"
t_es = TextBlob(sent_es_str)
t_en = t_es.translate(from_lang="es", to="en")
print(f'ESPANOL: {sent_es_str}')
print(f'ENGLISH: {t_en}')

sentence1 = Sentence(t_en)
classifier.predict(sentence1)
print(f"La frase '{sentence1.to_plain_string()}' es {sentence1.labels}")

# .format(sentence1.to_plain_string(), sentence1.labels))

ESPANOL: Es malo
ENGLISH: It is bad
La frase 'It is bad' es ['Sentence: "I t   i s   b a d"'/'NEGATIVE' (0.6449)]


In [None]:
# análisis de opinión
opinion1 = TextBlob("This new restaurant is great. I had so much fun!! :-P")
print(opinion1.sentiment)

# sentiment-spanish

In [9]:
!pip install sentiment-analysis-spanish

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentiment-analysis-spanish
  Downloading sentiment_analysis_spanish-0.0.25-py3-none-any.whl (30.0 MB)
[K     |████████████████████████████████| 30.0 MB 98.3 MB/s 
[?25hInstalling collected packages: sentiment-analysis-spanish
Successfully installed sentiment-analysis-spanish-0.0.25


In [10]:
from sentiment_analysis_spanish import sentiment_analysis


In [11]:
sentiment = sentiment_analysis.SentimentAnalysisSpanish()
print(sentiment.sentiment("me gusta la tombola es genial"))


0.9304396176531412


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [12]:
print(sentiment.sentiment("me parece terrible esto que me estás diciendo"))

2.1830853580533075e-06


# John Snow SnowNLP

In [1]:
# This is only to setup PySpark and Spark NLP on Colab
!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash

--2022-12-13 07:14:57--  https://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-12-13 07:14:57--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2022-12-13 07:14:57 (50.3 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.1 and Spark NLP 4.2.4
setup Colab for PySpark 3.2.1 and Spark NLP

In [2]:
import sparknlp

spark = sparknlp.start()

ModuleNotFoundError: ignored

In [None]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

embeddings = BertSentenceEmbeddings\
    .pretrained('labse', 'xx') \
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

sentimentClassifier = ClassifierDLModel.pretrained("classifierdl_bert_sentiment", "es") \
  .setInputCols(["document", "sentence_embeddings"]) \
  .setOutputCol("class")

fr_sentiment_pipeline = Pipeline(stages=[document, embeddings, sentimentClassifier])

light_pipeline = LightPipeline(fr_sentiment_pipeline.fit(spark.createDataFrame([['']]).toDF("text")))

result1 = light_pipeline.annotate("Estoy seguro de que esta vez pasará la entrevista.")

result2 = light_pipeline.annotate("Soy una persona que intenta desayunar todas las mañanas sin falta.")

result3 = light_pipeline.annotate("No estoy seguro de si mi salario mensual es suficiente para vivir.")

print(result1["class"], result2["class"], sep = "\n")

In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

sequenceClassifier = BertForSequenceClassification.pretrained("beto_sentiment", "en")\
  .setInputCols(["document",'token'])\
  .setOutputCol("class")

pipeline = Pipeline(stages=[
    document_assembler, 
    tokenizer,
    sequenceClassifier   
])

# couple of simple examples
example = spark.createDataFrame([["Te quiero. Te amo."]]).toDF("text")

result = pipeline.fit(example).transform(example)

# result is a DataFrame
result.select("text", "class.result").show()