# Chapter 17: Pipelines for NLP Tasks

In [2]:
!pip install transformers portalocker

## 17.3 HuggingFace Pipelines

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch15/hf_nlp_tasks.png)

### 17.3.1 Models

In [10]:
from transformers.pipelines import SUPPORTED_TASKS

# One (task name, checkpoint) pair per task of type 'text'. The checkpoint is
# the first element of the default model's 'pt' entry, or None when the
# default has no 'model' / 'pt' entry.
[
    (name, spec['default'].get('model', {}).get('pt', (None,))[0])
    for name, spec in SUPPORTED_TASKS.items()
    if spec['type'] == 'text'
]

[('text-classification', 'distilbert-base-uncased-finetuned-sst-2-english'),
 ('token-classification', 'dbmdz/bert-large-cased-finetuned-conll03-english'),
 ('question-answering', 'distilbert-base-cased-distilled-squad'),
 ('table-question-answering', 'google/tapas-base-finetuned-wtq'),
 ('fill-mask', 'distilroberta-base'),
 ('summarization', 'sshleifer/distilbart-cnn-12-6'),
 ('translation', None),
 ('text2text-generation', 't5-base'),
 ('text-generation', 'gpt2'),
 ('zero-shot-classification', 'facebook/bart-large-mnli'),
 ('conversational', 'microsoft/DialoGPT-medium')]

### 17.3.2 Tokenizers

In [11]:
from transformers import BertTokenizer

# Load the tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
# Show the vocabulary size together with the tokenizer's special tokens.
tokenizer.vocab_size, tokenizer.all_special_tokens

(30522, ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'])

In [13]:
# Two example sentences used throughout the tokenizer examples.
sentences = ("The core of the planet is becoming unexplicably unstable.",
             "The shift in the company's core business markets had impacted their quartely results.")

# tokenize() returns string tokens only; no ids are produced at this point.
tokens = tokenizer.tokenize(sentences[0])
tokens

['the',
 'core',
 'of',
 'the',
 'planet',
 'is',
 'becoming',
 'une',
 '##x',
 '##pl',
 '##ica',
 '##bly',
 'unstable',
 '.']

In [14]:
# Convert the first sentence into a list of integer token ids.
token_ids = tokenizer.encode(sentences[0])
token_ids

[101,
 1996,
 4563,
 1997,
 1996,
 4774,
 2003,
 3352,
 16655,
 2595,
 24759,
 5555,
 6321,
 14480,
 1012,
 102]

In [15]:
# Map the ids back to text; the special tokens show up in the decoded string.
tokenizer.decode(token_ids)

'[CLS] the core of the planet is becoming unexplicably unstable. [SEP]'

In [16]:
# Calling the tokenizer directly returns the input ids together with the token
# type ids and the attention mask; it replaces the deprecated encode_plus().
token_dict = tokenizer(sentences[0])
token_dict

{'input_ids': [101, 1996, 4563, 1997, 1996, 4774, 2003, 3352, 16655, 2595, 24759, 5555, 6321, 14480, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# Unpacking passes the two sentences as separate positional arguments. Per the
# output, they are encoded as ONE sequence (a sentence pair, not a batch): the
# ids of the second sentence carry token type 1.
token_dict_mult = tokenizer(*sentences)
token_dict_mult

{'input_ids': [101, 1996, 4563, 1997, 1996, 4774, 2003, 3352, 16655, 2595, 24759, 5555, 6321, 14480, 1012, 102, 1996, 5670, 1999, 1996, 2194, 1005, 1055, 4563, 2449, 6089, 2018, 19209, 2037, 24209, 24847, 2135, 3463, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
# Decode the pair encoding; each sentence is followed by its own [SEP] token.
tokenizer.decode(token_dict_mult['input_ids'])

"[CLS] the core of the planet is becoming unexplicably unstable. [SEP] the shift in the company's core business markets had impacted their quartely results. [SEP]"

### 17.3.3 Zero-Shot Text Classification

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step1.png)

In [6]:
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/classes.txt

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step2.png)

In [7]:
import html

import numpy as np

# Decimal codes of the numeric character references (" #NNN;") to substitute.
chr_codes = np.array([
     36,   151,    38,  8220,   147,   148,   146,   225,   133,    39,  8221,  8212,   232,   149,   145,   233,
  64257,  8217,   163,   160,    91,    93,  8211,  8482,   234,    37,  8364,   153,   195,   169
])

# html.unescape applies the HTML5 rules for numeric references: codes in the
# 128-159 range (133, 145-149, 151, 153 here) become the Windows-1252
# punctuation they stand for (quotes, dashes, ellipsis, ...), whereas a plain
# chr() would turn them into invisible C1 control characters. Every other code
# maps to the same character chr() would return.
chr_subst = {f' #{c};': html.unescape(f'&#{c};') for c in chr_codes}

# Named entities and escaped tags. The complete tags are listed before the
# bare '&lt;' / '&gt;' so that a consumer walking the keys in insertion order
# strips a whole tag first instead of leaving its name (e.g. 'em') behind.
chr_subst.update({
    ' amp;': '&', ' quot;': "'", ' hellip;': '...', ' nbsp;': ' ',
    '&lt;em&gt;': '', '&lt;/em&gt;': '', '&lt;strong&gt;': '', '&lt;/strong&gt;': '',
    '&lt;': '', '&gt;': '',
})

In [8]:
def replace_chars(sent, subst=None):
    """Replace every known escape sequence found in ``sent``.

    Args:
        sent: Text to clean.
        subst: Optional mapping from escape sequence to replacement text.
            Defaults to the module-level ``chr_subst`` table.

    Returns:
        ``sent`` with each matching key replaced by its mapped value.
    """
    if subst is None:
        subst = chr_subst
    # Only keys present in the incoming text are applied. Longer keys go first
    # so that a key containing another key (e.g. '&lt;em&gt;' vs. '&lt;') is
    # removed as a whole instead of being torn apart and leaving a stray 'em'.
    to_replace = sorted((key for key in subst if key in sent), key=len, reverse=True)
    for key in to_replace:
        sent = sent.replace(key, subst[key])
    return sent

def preproc_description(desc):
    """Clean a raw description.

    Backslashes are turned into spaces, surrounding whitespace is trimmed and
    the result is passed through ``replace_chars``.
    """
    without_backslashes = desc.replace('\\', ' ')
    trimmed = without_backslashes.strip()
    return replace_chars(trimmed)

In [9]:
from torchdata.datapipes.iter import FileLister
from torch.utils.data import DataLoader

def _path_ends_with(path, suffix):
    """Return True when ``path`` ends with ``suffix``."""
    return path.endswith(suffix)


def _parse_row(row):
    """Turn one CSV row into a ``(zero-based label, cleaned description)`` pair."""
    return int(row[0]) - 1, preproc_description(row[2])


def create_raw_datapipe(fname):
    """Build a datapipe over the CSV file(s) whose path ends with ``fname``.

    Args:
        fname: File-name suffix; files listed under the current directory
            whose path ends with it are opened as UTF-8 text and parsed as
            comma-separated values without skipping any line.

    Returns:
        A datapipe yielding ``(label, description)`` tuples, where ``label``
        is the first column minus one and ``description`` is the third column
        after ``preproc_description``.
    """
    from functools import partial

    datapipe = FileLister(root='.')
    # Module-level functions (bound with partial) are used instead of lambdas:
    # lambdas cannot be pickled, which datapipes warn about and which gets in
    # the way of loading the data with worker processes.
    datapipe = datapipe.filter(filter_fn=partial(_path_ends_with, suffix=fname))
    datapipe = datapipe.open_files(mode='rt', encoding="utf-8")
    datapipe = datapipe.parse_csv(delimiter=",", skip_lines=0)
    datapipe = datapipe.map(_parse_row)
    return datapipe

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step4.png)

In [None]:
# Raw datapipes per split; only the training split gets a shuffle buffer.
datapipes = {
    'train': create_raw_datapipe('train.csv').shuffle(buffer_size=125000),
    'test': create_raw_datapipe('test.csv'),
}

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step5.png)

In [None]:
# Mini-batches of 32 for both splits; shuffling is requested for training only.
dataloaders = {
    'train': DataLoader(dataset=datapipes['train'], batch_size=32, shuffle=True),
    'test': DataLoader(dataset=datapipes['test'], batch_size=32),
}

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step5.png)

In [22]:
import torch
from transformers import pipeline

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Zero-shot classification pipeline built on the facebook/bart-large-mnli model.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

In [23]:
import warnings
# NOTE(review): this silences every warning from here on, not only the ones
# raised by the pipeline call below.
warnings.filterwarnings("ignore")

# Candidate labels offered to the classifier; their list positions are later
# compared with the integer labels of the batch (via candidate_labels.index).
candidate_labels = ['world', 'sports', 'business', 'science and technology']

# Take only the first batch of the test dataloader.
# NOTE: this rebinds `sentences`, which previously held the two example
# sentences used in the tokenizer section.
labels, sentences = next(iter(dataloaders['test']))

# The classifier is called with a plain list of texts and the candidate labels.
out = classifier(list(sentences), candidate_labels)

In [24]:
# Inspect the classifier's result for the first text of the batch.
out[0]

{'sequence': "Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.",
 'labels': ['business', 'world', 'sports', 'science and technology'],
 'scores': [0.5680877566337585,
  0.32770952582359314,
  0.05973348021507263,
  0.044469203799963]}

In [25]:
# In the result shown for out[0], 'labels' is ordered by descending score, so
# its first entry is taken as the predicted label.
pred_label = out[0]['labels'][0]
# Position of that label in candidate_labels, used as the predicted class index.
pred_class = candidate_labels.index(pred_label)
pred_label, pred_class

('business', 2)

In [26]:
# Predicted class index (position of the first listed label in
# candidate_labels) for every result in `out`.
pred_labels = torch.as_tensor([candidate_labels.index(s['labels'][0]) for s in out])
# Fraction of predictions equal to `labels`, i.e. accuracy on this single batch.
(pred_labels == labels).float().mean()

tensor(0.4062)