In [1]:
%%shell
pip install transformers[torch] emoji sentencepiece





In [2]:
import torch
from transformers import pipeline
from transformers import (
    BertForMaskedLM,
    BertForMultipleChoice,
    BertForSequenceClassification,
    BertForQuestionAnswering,
    BertForNextSentencePrediction,
    BertForTokenClassification,
    BertGenerationPreTrainedModel,
    BertweetTokenizer,
    RobertaForSequenceClassification,
    )

In [7]:
# Load the finiteautomata/bertweet-base-sentiment-analysis checkpoint as a
# RoBERTa sequence-classification model and render its module tree.
model = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="finiteautomata/bertweet-base-sentiment-analysis",
)
display(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [8]:
# Inspect the concrete class that from_pretrained returned.
type(model)

transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification

In [10]:
# Check whether the loaded model is a regular PyTorch module.
isinstance(model, torch.nn.Module)

True

In [11]:
# Move the model to the GPU when one is available and otherwise keep it on
# the CPU, so this cell also runs on machines without CUDA instead of raising.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
# Example sentence fed to the tokenizer and model in the following cells.
text = "I'm so happy because I won a prize on the festival"

In [19]:
# Load the tokenizer from the same
# finiteautomata/bertweet-base-sentiment-analysis checkpoint.
tokenizer = BertweetTokenizer.from_pretrained(
    pretrained_model_name_or_path="finiteautomata/bertweet-base-sentiment-analysis",
)

In [34]:
# Tokenize into PyTorch tensors, then move every tensor to whatever device
# the model currently lives on. Following `model.device` (rather than a
# hard-coded "cuda") keeps inputs and weights on the same device, including
# on CPU-only machines. The name `inputs_cuda` is kept because later cells
# refer to it.
inputs = tokenizer(text, return_tensors="pt")
inputs_cuda = {key: value.to(model.device) for key, value in inputs.items()}

In [35]:
# Display the tokenizer outputs after they were moved to the target device.
inputs_cuda

{'input_ids': tensor([[   0,  545,   40,   39,  225,  153,    8,  840,   11, 5906,   24,    6,
          5189,    2]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [36]:
# Pure inference: disable autograd so no computation graph is recorded for
# the forward pass (less memory, same logits).
with torch.no_grad():
    pred = model(**inputs_cuda)

In [37]:
# Index of the largest logit (argmax over the flattened logits tensor).
torch.argmax(pred.logits)

tensor(2, device='cuda:0')

In [38]:
# Convert the winning class index to a Python int and look up its label in
# the model config.
model.config.id2label[pred.logits.argmax().item()]

'POS'

In [41]:
# Same checkpoint as the manual tokenizer + model steps, this time wrapped
# in a text-classification pipeline.
pipe = pipeline(
    task="text-classification",
    model="finiteautomata/bertweet-base-sentiment-analysis",
)

In [42]:
# Classify a sentence with the pipeline; the result is a list of
# {'label', 'score'} dicts.
pipe("I'm so happy today, It's friday!")

[{'label': 'POS', 'score': 0.9927896857261658}]

# Token Classification

In [43]:
# Token-classification pipeline backed by the pysentimiento/robertuito-pos
# checkpoint (part-of-speech tags, one prediction per token).
pipe = pipeline(
    task="token-classification",
    model="pysentimiento/robertuito-pos",
)

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/829k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [45]:
# Tag every token of a Spanish sentence; each result carries the tag, its
# score and the character offsets of the token.
pipe("Juan y Maria estan comiendo helado")

[{'entity': 'B-PROPN',
  'score': 0.99989116,
  'index': 1,
  'word': '▁Juan',
  'start': 0,
  'end': 4},
 {'entity': 'B-CONJ',
  'score': 0.9992805,
  'index': 2,
  'word': '▁y',
  'start': 4,
  'end': 6},
 {'entity': 'B-PROPN',
  'score': 0.99989617,
  'index': 3,
  'word': '▁Maria',
  'start': 6,
  'end': 12},
 {'entity': 'B-AUX',
  'score': 0.9104122,
  'index': 4,
  'word': '▁estan',
  'start': 12,
  'end': 18},
 {'entity': 'B-VERB',
  'score': 0.9999285,
  'index': 5,
  'word': '▁comiendo',
  'start': 18,
  'end': 27},
 {'entity': 'B-NOUN',
  'score': 0.99919623,
  'index': 6,
  'word': '▁helado',
  'start': 27,
  'end': 34}]

# Sentence Classification

In [46]:
# Text-classification pipeline backed by the
# papluca/xlm-roberta-base-language-detection checkpoint.
pipe = pipeline(
    task="text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [47]:
# Classify an English sentence.
pipe("Hello, how are you?")

[{'label': 'en', 'score': 0.8399174213409424}]

In [48]:
# Classify a Spanish sentence.
pipe("Hola, mi nombre es Pedro Pablo")

[{'label': 'es', 'score': 0.9936506152153015}]

# Traducción Neural

In [3]:
# Translation pipeline backed by the Helsinki-NLP/opus-mt-es-en checkpoint.
pipe = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-es-en",
)

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



In [4]:
# Translate a Spanish sentence; the result is a list with one
# {'translation_text': ...} dict.
pipe("Hola, mi nombre es Juan y estoy programando")

[{'translation_text': "Hi, my name is Juan and I'm programming."}]

# Llenado de máscaras

In [5]:
# Fill-mask pipeline backed by the xlm-roberta-base checkpoint.
pipe = pipeline(
    task="fill-mask",
    model="xlm-roberta-base",
)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [9]:
# Build the prompt from the tokenizer's own mask token instead of
# hard-coding "<mask>", so the cell keeps working if the checkpoint is
# swapped for one that uses a different mask string (e.g. "[MASK]").
pipe(f"Ayer fui a {pipe.tokenizer.mask_token} y compre pan")

[{'score': 0.1781957894563675,
  'token': 12024,
  'token_str': 'pasar',
  'sequence': 'Ayer fui a pasar y compre pan'},
 {'score': 0.13185276091098785,
  'token': 2349,
  'token_str': 'casa',
  'sequence': 'Ayer fui a casa y compre pan'},
 {'score': 0.06908507645130157,
  'token': 16839,
  'token_str': 'market',
  'sequence': 'Ayer fui a market y compre pan'},
 {'score': 0.023134158924221992,
  'token': 36739,
  'token_str': 'bazar',
  'sequence': 'Ayer fui a bazar y compre pan'},
 {'score': 0.021794216707348824,
  'token': 41885,
  'token_str': 'comer',
  'sequence': 'Ayer fui a comer y compre pan'}]

# Resumen Abstractivo

In [16]:
# Summarization pipeline backed by the
# josmunpen/mt5-small-spanish-summarization checkpoint.
pipe = pipeline(
    task="summarization",
    model="josmunpen/mt5-small-spanish-summarization",
)

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [19]:
from pprint import pprint

# Spanish passage reused by the following cells; it is wrapped in leading
# and trailing newlines because of the triple-quoted literal.
text = """
Surgió mi interés de descifrar su ascendencia, y buscando la suya encontré la mía en los verdes frenéticos de mayo hasta el mar y las lluvias feraces y los vientos eternos de los campos de Galicia. Solo entonces entendí de dónde había sacado la abuela aquella credulidad que le permitía vivir en un mundo sobrenatural donde todo era posible, donde las explicaciones racionales carecían por completo de validez.
"""

# pprint breaks the long string into adjacent quoted chunks for readability.
pprint(text)

('\n'
 'Surgió mi interés de descifrar su ascendencia, y buscando la suya encontré '
 'la mía en los verdes frenéticos de mayo hasta el mar y las lluvias feraces y '
 'los vientos eternos de los campos de Galicia. Solo entonces entendí de dónde '
 'había sacado la abuela aquella credulidad que le permitía vivir en un mundo '
 'sobrenatural donde todo era posible, donde las explicaciones racionales '
 'carecían por completo de validez.\n')


In [18]:
# Run the pipeline on the passage held in `text`; the recorded result is a
# list with one {'summary_text': ...} dict.
pipe(text)



[{'summary_text': 'el camino de la nueva realidad'}]

# Question Answering

In [20]:
# Question-answering pipeline backed by the
# PlanTL-GOB-ES/roberta-large-bne-sqac checkpoint.
pipe = pipeline(
    task="question-answering",
    model="PlanTL-GOB-ES/roberta-large-bne-sqac",
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [21]:
# Question to ask about the passage stored in `text`.
question = "De que ciudad se habla?"

In [22]:
# Ask the question against the passage in `text`; the result reports the
# answer string together with its score and character offsets.
pipe(context=text, question=question)

{'score': 0.9839567542076111, 'start': 190, 'end': 197, 'answer': 'Galicia'}

# Generación de Texto

In [23]:
# Text-generation pipeline backed by the ismaelfaro/gpt2-poems.es checkpoint.
pipe = pipeline(
    task="text-generation",
    model="ismaelfaro/gpt2-poems.es",
)

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [25]:
# Generate up to 100 new tokens from the prompt. pad_token_id is passed
# explicitly (reusing the tokenizer's EOS id) so generation does not have to
# fall back to it implicitly and emit the "Setting `pad_token_id` to
# `eos_token_id`" warning.
res = pipe(
    "Un sueño en un sueño",
    max_new_tokens=100,
    pad_token_id=pipe.tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [28]:
import re

# Replace every literal backslash-n sequence in the generated text with a
# real line break before printing.
clean_text = re.sub(
    pattern=r"\\n",
    repl="\n",
    string=res[0]["generated_text"],
)
print(clean_text)

Un sueño en un sueño para ti 
a mis sueños. 
Por el cuerpo, 
dejando sobre ellas, 
mi mano se agota. 
Luego viene a tu sueño. 
No hay casa que yo fui. 
Estabando mis sueños 
su sueño me duele. 
Mis sueños quedan, 
su sueño me duele. 



In [29]:
# Zero-shot-classification pipeline backed by the
# Recognai/bert-base-spanish-wwm-cased-xnli checkpoint.
pipe = pipeline(
    task="zero-shot-classification",
    model="Recognai/bert-base-spanish-wwm-cased-xnli",
)

config.json:   0%|          | 0.00/834 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [37]:
# Candidate labels offered to the zero-shot classifier.
label = ["pez", "perro", "pajaro"]

In [42]:
# Sentence to classify against the candidate labels (rebinds `text`).
text = "Vive en el mar y tiene branquias"

In [43]:
# Score the sentence against every candidate label; the result lists the
# labels with their scores.
pipe(text, candidate_labels=label)

{'sequence': 'Vive en el mar y tiene branquias',
 'labels': ['pez', 'pajaro', 'perro'],
 'scores': [0.9506666660308838, 0.027289383113384247, 0.022043956443667412]}