# Summary

`# TODO: summary here`

In [1]:
%load_ext autoreload
%autoreload 3

In [2]:
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, \
    PegasusTokenizerFast, pipeline

from htools import *
from incendio.utils import DEVICE, gpu_setup

In [3]:
gpu_setup(False)

cpu


  and should_run_async(code)


In [5]:
version = 'tuner007/pegasus_paraphrase'
net = PegasusForConditionalGeneration.from_pretrained(version).to(DEVICE)
tok = PegasusTokenizerFast.from_pretrained(version)

  m.ParseFromString(open(filename, "rb").read())


In [6]:
texts = [
    'Educational games and digital learning materials to provide K-12 '
    'students with enriching experiences.',
    'The world\'s largest social network. Helping people build and maintain '
    'relationships in a disconnected world.',
    'I hate school. I wish my teacher would leave me alone. I don\'t think '
    ' he likes me.',
    'Today the president announced new plans to revamp the private '
    'healthcare system. Pundits questioned how he would manage to pass the '
    'bill.'
]

In [6]:
res = tok.prepare_seq2seq_batch(texts, truncation=True, padding='longest')

In [7]:
res

{'input_ids': tensor([[11263,   727,   111,  1016,   761,   917,   112,   319,  1046,  6054,
           392,   122, 26838,  1747,   107,     1,     0,     0,     0,     0,
             0,     0,     0],
        [  139,   278,   131,   116,  1368,   525,   952,   107, 22844,   200,
           736,   111,  1634,  2074,   115,   114, 20402,   278,   107,     1,
             0,     0,     0],
        [  125,  4180,   399,   107,   125,  1216,   161,  2118,   192,   858,
           213,  1600,   107,   125,   272,   131,   144,   311,   178,  5606,
           213,   107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
gen = net.generate(**res)

tensor([[   0, 1046, 6054,  392,  137,  207, 2387,  727,  111, 1016,  761,  917,
          107,    1,    0,    0,    0],
        [   0,  139,  278,  131,  116, 1368,  525,  952,  107,    1,    0,    0,
            0,    0,    0,    0,    0],
        [   0,  125,  272,  131,  144,  172,  399,  111,  161, 2118,  591,  131,
          144,  172,  213,  107,    1]])

In [19]:
tok.batch_decode(gen.tolist(), skip_special_tokens=True)

['K-12 students can use educational games and digital learning materials.',
 "The world's largest social network.",
 "I don't like school and my teacher doesn't like me."]

In [54]:
@add_docstring(net.generate)
def paraphrase(text, n=1, temperature=1.5, **gen_kwargs):
    batch = tok.prepare_seq2seq_batch([text], truncation=True, 
                                      padding='longest').to(DEVICE)
    paraphrased = net.generate(**batch, num_return_sequences=n,
                               temperature=temperature, **gen_kwargs)
    return tok.batch_decode(paraphrased.tolist(), skip_special_tokens=True)

  and should_run_async(code)


In [52]:
paraphrase(texts[3], n=5, num_beams=10)

['The president announced plans to reform the private healthcare system.',
 'Pundits wondered how the president would pass the bill.',
 'Pundits questioned how the president would pass the bill.',
 'Pundits were questioning how the president would pass the bill.',
 'Pundits questioned how he would get the bill passed.']

In [53]:
texts[-1]

'Today the president announced new plans to revamp the private healthcare system. Pundits questioned how he would manage to pass the bill.'

In [48]:
net.generate()

{'input_ids': <Parameter "input_ids: Union[torch.LongTensor, NoneType] = None">,
 'decoder_input_ids': <Parameter "decoder_input_ids: Union[torch.LongTensor, NoneType] = None">,
 'max_length': <Parameter "max_length: Union[int, NoneType] = None">,
 'min_length': <Parameter "min_length: Union[int, NoneType] = None">,
 'do_sample': <Parameter "do_sample: Union[bool, NoneType] = None">,
 'early_stopping': <Parameter "early_stopping: Union[bool, NoneType] = None">,
 'num_beams': <Parameter "num_beams: Union[int, NoneType] = None">,
 'temperature': <Parameter "temperature: Union[float, NoneType] = None">,
 'top_k': <Parameter "top_k: Union[int, NoneType] = None">,
 'top_p': <Parameter "top_p: Union[float, NoneType] = None">,
 'repetition_penalty': <Parameter "repetition_penalty: Union[float, NoneType] = None">,
 'bad_words_ids': <Parameter "bad_words_ids: Union[Iterable[int], NoneType] = None">,
 'bos_token_id': <Parameter "bos_token_id: Union[int, NoneType] = None">,
 'pad_token_id': <Para

## ParaphrasePipeline

In [35]:
class ParaphrasePipeline:
    
    def __init__(self, name='tuner007/pegasus_paraphrase', net=None, 
                 tok=None):
        self.name = name
        self.net = (net 
                    or PegasusForConditionalGeneration.from_pretrained(name))\
                    .to(DEVICE)
        self.tok = tok or PegasusTokenizerFast.from_pretrained(self.name)
        
    @add_docstring(PegasusForConditionalGeneration.generate)
    def __call__(self, text, n=1, temperature=1.5, **kwargs):
        # TODO: not sure how many rows of text can be done in a single batch.
        # Maybe look at other pipelines to see what they do. I'm thinking we
        # could auto-batch longer sequences (e.g. a list w/ 10_000 strings
        # might become 100 batches of 100).
        texts = tolist(text)
        batch = self.tok.prepare_seq2seq_batch(texts, truncation=True, 
                                               padding='longest').to(DEVICE)
        
        # Number of beams must be >= number of sequences to return.
        num_beams = max(n, self.net.config.num_beams,
                        kwargs.pop('num_beams', -1))
        gen_tokens = self.net.generate(**batch, num_return_sequences=n,
                                       temperature=temperature,
                                       num_beams=num_beams,
                                       **kwargs)
        gen = self.tok.batch_decode(gen_tokens.tolist(), 
                                    skip_special_tokens=True)
        if not isinstance(text, str) and len(text) > 1: 
            gen = [gen[i*n:(i+1)*n] for i in range(len(text))]
        return gen
    
# Slightly updated version (if at all) from incendio. Ended up deleting this
# # because I found we can use Text2TextGenerationPipeline.
# class ParaphrasePipeline:
#     """Similar to a transformers Pipeline, this provides a high level 
#     interface for paraphrasing text. It's pretty slow so it's worth using this
#     on a GPU when processing many examples.
#     """
    
#     def __init__(self, name='tuner007/pegasus_paraphrase', net=None, 
#                  tok=None):
#         """
#         Parameters
#         ----------
#         name: str
#             Name of pretrained model to load. This will download weights from
#             Huggingface's model hub (https://huggingface.co/models. In 
#             practice the name should rarely change but we want to give users
#             the option in case you train a better paraphrasing model.
            
#             Name                            Parameters    Download Size
#             tuner007/pegasus_paraphrase     568,822,784   2.28 GB
#             ramsrigouthamg/t5_paraphraser   222,903,936   892 MB
            
#         net: None or nn.Module
#             Pytorch model, usually PegasusForConditionalGeneration. If None,
#             a new model will be instantiated.
#         tok: None or transformers tokenizer
#             A new one will be instantiated by default, but you can also pass
#             one in. This must be the correct tokenizer for the `net` being 
#             used.
#         """
#         self.name = name
#         self.net = (net 
#                     or PegasusForConditionalGeneration.from_pretrained(name))\
#                     .to(DEVICE)
#         self.tok = tok or PegasusTokenizerFast.from_pretrained(self.name)
        
#     @add_docstring(PreTrainedModel.generate)
#     def __call__(self, text, n=1, **kwargs):
#         """Paraphrase one or more pieces of text. We do no auto-batching yet
#         so you may need to split your data up into mini batches when working
#         with many rows.
        
#         Parameters
#         ----------
#         text: str or Iterable[str]
#         n: int
#             Number of variations to generate per sample.
#         kwargs: any
#             Passed on to net's generate function. Its docstring is included
#             below for convenience.
            
#         Returns
#         -------
#         list: If input is a single string, a list of n strings is returned.
#         If input is a lsit of strings, a list of nested lists, each of length
#         n, is returned.
#         """
#         texts = tolist(text)
#         batch = self.tok.prepare_seq2seq_batch(texts, truncation=True, 
#                                                padding='longest').to(DEVICE)
        
#         # Number of beams must be >= number of sequences to return.
#         num_beams = max(n, self.net.config.num_beams,
#                         kwargs.pop('num_beams', -1))
#         gen_tokens = self.net.generate(**batch, num_return_sequences=n,
#                                        num_beams=num_beams,
#                                        **{'temperature': 1.5, **kwargs})
#         gen = self.tok.batch_decode(gen_tokens.tolist(), 
#                                     skip_special_tokens=True)
#         if not isinstance(text, str) and len(text) > 1: 
#             gen = [gen[i*n:(i+1)*n] for i in range(len(text))]
#         return gen

In [36]:
p_pipe = ParaphrasePipeline(net=net, tok=tok)

In [37]:
p_pipe.net.config.num_beams

8

In [38]:
p_pipe('It was a beautiful rainy day.')

['There was a beautiful day.']

In [303]:
p_pipe('It was a beautiful rainy day.', n=3)

['There was a beautiful day.',
 'It was a nice day.',
 'It was raining but it was nice.']

In [54]:
p_pipe(['It was a beautiful rainy day.', 'The duck was yellow and fluffy.'])

[['There was a beautiful day.'], ['The duck was fluffy and yellow.']]

In [55]:
p_pipe(['It was a beautiful rainy day.', 'The duck was yellow and fluffy.'], 
     n=3)

[['There was a beautiful day.',
  'It was a nice day.',
  'It was raining but it was nice.'],
 ['The duck was fluffy and yellow.',
  'The duck was fluffy.',
  'The duck was big and fluffy.']]

In [56]:
p_pipe(['It was a beautiful rainy day.'], n=2)

['There was a beautiful day.', 'It was a nice day.']

In [13]:
# Decided to abandon this. Too many ways children differ: paraphrase pipeline
# can't create pipe with pipeline(name) because it's not part of huggingface,
# paraphrase transform doesn't need to check if listlike because preprocess
# does nothing and __call__ literally just calls pipe. Leaving this here as
# an examle of init_subclass.
class TransformerTransformBase(ABC):
    
    def __init__(self):
        pass
    
    def __call__(self, text, **kwargs):
        if listlike(text):
            return [self.transform(t, **kwargs) for t in text]
        return self.transform(text, **kwargs)
    
    @abstractmethod
    def transform(self, text, **kwargs):
        pass
    
    def __init_subclass__(cls, **kwargs):
        if not hasattr(cls, 'pipe_name'):
            raise RuntimeError(f'{cls} must have class attr "pipe_name".')

In [14]:
with assert_raises(RuntimeError):
    class Tmp(TransformerTransformBase):
        """a"""

As expected, got RuntimeError(<class '__main__.Tmp'> must have class attr "pipe_name".).


## FillMaskTransform

In [90]:
@auto_repr
class FillMaskTransform:    

    MASK = '<mask>'
    name = 'fill-mask'
    
    def __init__(self, pipe=None, n=1, max_n=3):
        """
        Parameters
        ----------
        n: int
            n is intentionally bigger than the default n in __call__. This is
            the number of candidates generated, so if we use strategy='random'
            it makes sense for this to be larger.
        """
        self.pipe = pipe or pipeline(self.name)
        self.n = n
        self.max_n = max_n
        
        assert type(self.pipe).__name__ == 'FillMaskPipeline'
    
    def _preprocess(self, text, min_keep=3, errors='raise'):
        """
        errors: str
            If 'warn', we show a warning when min_keep is violated but allow
            masking to take place.
        """
        if listlike(text):
            return [self._preprocess(row, min_keep, errors) for row in text]
        
        tokens = text.split()
        if len(tokens) < min_keep + 1:
            msg = (f'Text "{text[:25]}..." is too short to mask while '
                   f'enforcing min_keep={min_keep}.')
            if errors == 'warn':
                warnings.warn(msg)
            else:
                raise ValueError(msg)
        
        idx = np.random.choice(range(len(tokens)))
        return ' '.join(self.MASK if i == idx else t 
                        for i, t in enumerate(tokens))
    
    def __call__(self, text, n=None, n_mask=1, min_keep=3, return_all=False, 
                 errors:('warn', 'raise')='raise', 
                 strategy:('random', 'best')='best'):
        """
        n: int or None
            If -1, return all generated examples for the given mask count.
            This can become very large when n_mask is large. Recall pipeline
            can only fill a single mask at a time. e.g. if self.max_n is
            3, n=-1, and n_mask is 4, we first mask once and generate 3
            samples. Then we mask each of those 3 and generate a total of 9 
            samples, then 27, then finally 81 which is what will be returned.
            The intermediate samples can be returned with `return_all=True`.
        """
        # Make sure we generate adequate number of sequences. Model topk must
        # be >= our desired n.
        n = n or self.n
        if n > self.max_n:
            self.max_n = n
            
        # Each item will be a list of strings. Each string in res[i]
        # will have i words changed. If text is a sequence of strings, we must
        # handle each one separately because each is passed through pipeline
        # repeatedly.
        if listlike(text):
            return [self(row, n, n_mask, min_keep, return_all, errors) 
                    for row in text]

        res = [[text]]
        for i in range(n_mask):
            seqs = self.pipe(self._preprocess(res[-1], min_keep=min_keep,
                                              errors=errors))
            # Transformers returns either list of dicts or list of list of 
            # dicts depending on whether input list has 1 item or multiple.
            if isinstance(seqs[0], list): 
                seqs = [seq for group in seqs for seq in group]
            text = [seq['sequence'].replace('<s>', '').replace('</s>', '') 
                    for seq in seqs]
            
            # Keep all generated samples when n is -1.
            if n != -1:
                if strategy == 'random':
                    text = np.random.choice(text, n, replace=False)
                elif strategy == 'best':
                    text = text[:n]
            res.append(text)
        if not return_all: res = res[n_mask]
        return res
    
    @property
    def max_n(self):
        return self.pipe.topk
    
    @max_n.setter
    def max_n(self, max_n):
        if not isinstance(max_n, int):
            raise TypeError('max_n must be an integer.')
        if max_n < self.n:
            raise ValueError(f'max_n must be >= self.n (currently {self.n}.')
        self.pipe.topk = max_n

In [72]:
t = 'I went to the store today to buy eggs.'
ts = [t, 'The bird swooped down onto the picnic table and squawked loudly.']

In [73]:
# m_pipe = pipeline('fill-mask')

In [74]:
m_tfm = FillMaskTransform(m_pipe)
m_tfm

FillMaskTransform(pipe=<transformers.pipelines.FillMaskPipeline object at 0x1b45980438>, n=1)

In [75]:
m_tfm._preprocess(t)

'I went to the store today <mask> buy eggs.'

In [76]:
m_tfm._preprocess(ts)

['I went to <mask> store today to buy eggs.',
 'The bird swooped <mask> onto the picnic table and squawked loudly.']

In [77]:
m_tfm(t, 2)

['I went to the store today to buy eggs.',
 'I went to the store today and buy eggs.']

In [78]:
m_tfm(t, 3, return_all=True)

[['I went to the store today to buy eggs.'],
 ['I went to the store today to buy pizza',
  'I went to the store today to buy...',
  'I went to the store today to buy groceries']]

In [79]:
m_tfm(t, 4, n_mask=2, strategy='best')

['I went to grocery store today to buy eggs.',
 'I went to the store today to buy eggs.',
 'I went to my store today to buy eggs.',
 'I went to a store today to buy eggs.']

In [80]:
m_tfm(t, 4, n_mask=2, strategy='random')

array(['I went to the store wanting to buy eggs.',
       'I went to the store today to buy eggs.',
       'I went through the store today to buy eggs.',
       'I went around the store today to buy eggs.'], dtype='<U46')

In [81]:
m_tfm(t, 3, n_mask=2, return_all=True)

[['I went to the store today to buy eggs.'],
 ['I went to the supermarket today to buy eggs.',
  'I went to the store today to buy eggs.',
  'I went to the grocery today to buy eggs.'],
 ['I went to the supermarket today to buy eggs.',
  'I went to the supermarket today to purchase eggs.',
  'I went to the supermarket today to get eggs.']]

In [82]:
m_tfm(t, 1, return_all=True)

[['I went to the store today to buy eggs.'],
 ['I went to grocery store today to buy eggs.']]

In [83]:
m_tfm(ts)

[['I went to the store today to buy pizza'],
 ['The bird swooped down onto the picnic table and squawked.']]

In [84]:
m_tfm(ts, 2)

[['I went to the store today to buy pizza',
  'I went to the store today to buy...'],
 ['The bird swooped down onto a picnic table and squawked loudly.',
  'The bird swooped down onto the picnic table and squawked loudly.']]

In [85]:
m_tfm(ts, n=None, n_mask=2)

[['I went to the store today to buy pizza'],
 ['The bird swooped down onto the picnic table and sang loudly.']]

In [86]:
m_tfm(ts, n=None, n_mask=2, return_all=True)

[[['I went to the store today to buy eggs.'],
  ['I went to the store today to buy pizza'],
  ['I went to the store today to buy pizza']],
 [['The bird swooped down onto the picnic table and squawked loudly.'],
  ['The bird swooped down onto the picnic table and squawked loudly.'],
  ['The birds swooped down onto the picnic table and squawked loudly.']]]

In [87]:
m_tfm = FillMaskTransform(m_pipe, 2, 5)
m_tfm(t)

  and should_run_async(code)


['I went to the store today to buy eggs.',
 'I headed to the store today to buy eggs.']

In [88]:
m_tfm(ts)

[['I went to the store today to buy eggs.',
  'I went to the store today to purchase eggs.'],
 ['The bird swooped down onto the picnic table and squawked loudly.',
  'The bird swooped down from the picnic table and squawked loudly.']]

In [89]:
m_tfm(ts, n=6)

[['I went to grocery store today to buy eggs.',
  'I went to the store today to buy eggs.',
  'I went to my store today to buy eggs.',
  'I went to a store today to buy eggs.',
  'I went to another store today to buy eggs.',
  'I went to our store today to buy eggs.'],
 ['The bird swooped down onto the picnic table and sang loudly.',
  'The bird swooped down onto the picnic table and cheered loudly.',
  'The bird swooped down onto the picnic table and cried loudly.',
  'The bird swooped down onto the picnic table and complained loudly.',
  'The bird swooped down onto the picnic table and roared loudly.',
  'The bird swooped down onto the picnic table and spoke loudly.']]

## ParaphraseTransform

In [39]:
@auto_repr
class ParaphraseTransform:
    """Not sure how useful this will really be but this basically just 
    wraps ParaphrasePipeline to share a more similar interface with the other
    NLP transforms.
    """
    
    name = 'tuner007/pegasus_paraphrase'

    def __init__(self, pipe=None, n=1):
        # We let user pass in pipe at least for now since re-instantiating the
        # class can be very slow during development. Need to consider whether
        # I want this behavior to remain.
        self.pipe = pipe or ParaphrasePipeline(self.name)
        self.n = n
            
        assert type(self.pipe).__name__ == 'ParaphrasePipeline'
        
    def _preprocess(self, text):
        """Does nothing (just want shared interface with other transforms)."""
        return text
    
    def __call__(self, text, n=None, **kwargs):
        return self.pipe(text, n=n or self.n, **kwargs)

In [304]:
p_tfm = ParaphraseTransform(p_pipe)
p_tfm

ParaphraseTransform(pipe=<__main__.ParaphrasePipeline object at 0x1a47cda198>)

In [305]:
p_tfm('The beach is loud and crowded today.', n=3)

['The beach is crowded.',
 'There is a lot of people on the beach.',
 'There is a lot of people at the beach.']

In [174]:
p_tfm(ts)

[['I went to buy eggs.'], ['The bird swooped down onto the picnic table.']]

In [175]:
p_tfm(ts, n=2)

[['I went to buy eggs.', 'I went to the store to buy eggs.'],
 ['The bird swooped down onto the picnic table.',
  'The bird swooped down on the picnic table.']]

In [227]:
def listlike(x):
    """Checks if an object is a list/tuple/set/array etc. Strings and
    mappings (e.g. dicts) are not considered list-like.
    """
    return isinstance(x, Iterable) and not isinstance(x, (str, Mapping))

In [226]:
for obj in ('a', 6, [], (), {}, set(), [3, 4], ['a', 'b'], ('a',), {'a': 'b'},
            {'a', 'b'}, np.arange(5)):
    print(type(obj), listlike(obj))

<class 'str'> False
<class 'int'> False
<class 'list'> True
<class 'tuple'> True
<class 'dict'> False
<class 'set'> True
<class 'list'> True
<class 'list'> True
<class 'tuple'> True
<class 'dict'> False
<class 'set'> True
<class 'numpy.ndarray'> True


## GenerativeTransform

In [70]:
@auto_repr
class GenerativeTransform:
    
    name = 'text-generation'
    
    def __init__(self, pipe=None, n=1):
        # Allow user to pass in n here to reduce likelihood of needing to 
        # create a partial from __call__. Maybe should add other __call__
        # kwargs here?
        self.pipe = pipe or pipeline(self.name)
        self.n = n
        
        assert type(self.pipe).__name__ == 'TextGenerationPipeline'
    
    def _preprocess(self, text, drop=None, drop_pct=None, rand_low=None, 
                    rand_high=None, min_keep=3, return_tuple=False):
        """Truncate text."""
        if listlike(text):
            return [self._preprocess(row, drop, drop_pct, rand_low, rand_high,
                                     min_keep, return_tuple) for row in text]
        
        tokens = text.split()
        if len(tokens) <= min_keep:
            n_drop = 0
        else:
            # Default is to truncate the last 20% of the sequence.
            if drop:
                n_drop = drop
            elif drop_pct:
                n_drop = int(drop_pct * len(tokens))
            elif rand_low is not None and rand_high is not None:
                n_drop = np.random.randint(rand_low, rand_high)
            else:
                n_drop = int(np.ceil(.2 * len(tokens)))
            n_drop = np.clip(n_drop, 0, len(tokens) - min_keep)
            tokens = tokens[:-n_drop]
        truncated = ' '.join(tokens)
        return (truncated, n_drop) if return_tuple else truncated
    
    def __call__(self, text, n=None, min_length=2, max_length=7, 
                 **generate_kwargs):
        n = n or self.n
        if listlike(text):
            return [self(row, n, min_length, max_length, **generate_kwargs) 
                    for row in text]
    
        # `generate` counts current length as part of min_length. 
        text = self._preprocess(text)
        n_curr = len(self.pipe.tokenizer.tokenize(text))
        res = self.pipe(text, min_length=n_curr + min_length,
                        max_length=n_curr + max_length,
                        num_return_sequences=n, **generate_kwargs)
        return [row['generated_text'] for row in res]

In [535]:
# g_pipe = pipeline('text-generation')
g_tfm = GenerativeTransform(g_pipe)
g_tfm

GenerativeTransform(pipe=<transformers.pipelines.TextGenerationPipeline object at 0x1d081425c0>, n=1)

In [536]:
g_tfm._preprocess(t)

'I went to the store today to'

In [537]:
g_tfm._preprocess(ts, return_tuple=True)

[('I went to the store today to', 2),
 ('The bird swooped down onto the picnic table', 3)]

In [538]:
g_tfm(t)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


['I went to the store today to buy a shirt, but I felt']

In [539]:
g_tfm(t, n=2)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


['I went to the store today to buy some clothes. The first thing',
 'I went to the store today to see where it went and I asked']

In [540]:
g_tfm(ts)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[['I went to the store today to check out everything. I wanted to'],
 ['The bird swooped down onto the picnic table so much that two of them noticed']]

In [541]:
g_tfm(ts, n=2)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[['I went to the store today to pick up my kids, got some',
  "I went to the store today to check out his new book, '"],
 ['The bird swooped down onto the picnic table, then proceeded to lay itself down',
  'The bird swooped down onto the picnic table, where he and the other dogs']]

In [285]:
# I'm now thinking this isn't that useful. It also loses our kwarg names 
# unless I pull some tricks altering signatures. Don't think this offers 
# enough benefit to justify that.
class TransformerTransform:
    
    def __init__(self, mode, pipe=None):
        self.mode = mode
        self._transformer = self._get_transformer(pipe)
        
    def _preprocess(self, text, **kwargs):
        return self._transformer._preprocess(text, **kwargs)
    
    def __call__(self, text, **kwargs):
        return self._transformer(text, **kwargs)
    
    def _get_transformer(self, pipe):
        if self.mode == 'mask':
            return FillMaskTransform(pipe)
        elif self.mode == 'generate':
            return GenerativeTransform(pipe)
        elif self.mode == 'paraphrase':
            return ParaphraseTransform(pipe)
        else:
            raise ValueError('mode must be in (mask, generate, paraphrase).')

In [287]:
mt = TransformerTransform('mask', m_pipe)

  and should_run_async(code)


In [288]:
mt._preprocess(t)

'I went to <mask> store today to buy eggs.'

In [289]:
mt._preprocess(ts, n=2)

['I <mask> to the store today to <mask> eggs.',
 'The <mask> swooped down <mask> the picnic table and squawked loudly.']

In [290]:
mt(t)

['I went to the store today to buy eggs.',
 'I went into the store today to buy eggs.',
 'I went through the store today to buy eggs.',
 'I went around the store today to buy eggs.',
 'I went in the store today to buy eggs.']

In [291]:
mt(ts)

[['I went to the store today to buy eggs.',
  'I went into the store today to buy eggs.',
  'I went through the store today to buy eggs.',
  'I went around the store today to buy eggs.',
  'I went in the store today to buy eggs.'],
 ['The bird swooped down onto the picnic table and squawked loudly.',
  'The bird swooped down onto the picnic table, squawked loudly.',
  'The bird swooped down onto the picnic table then squawked loudly.',
  'The bird swooped down onto the picnic table who squawked loudly.',
  'The bird swooped down onto the picnic table but squawked loudly.']]

## BackTranslationTransform

Realized Huggingface provides no models for translating back to English.

In [7]:
class BackTranslationTransform:
    
    def __init__(self, pipe=None, n=1, from_lang='en', to_lang='fr',
                 pipe_rev=None):
        self.name = f'translation_{from_lang}_to_{to_lang}'
        self.name_rev = f'translation_{to_lang}_to_{from_lang}'
        self.pipe = pipe or pipeline(self.name)
        self.pipe_rev = pipe_rev or pipeline(self.name_rev)
        
    def _preprocess(self, text):
        return text
    
    def __call__(self, text):
        trans = self.pipe(text)
        print(trans)
        return self.pipe_rev(trans)

In [9]:
# b_tfm = BackTranslationTransform()

## ParaphraseTransform v2

Found there actually is a built-in version of the pipeline that I think will work. Try it out.

In [14]:
from transformers import Text2TextGenerationPipeline, PreTrainedModel

  and should_run_async(code)


In [12]:
Text2TextGenerationPipeline.__name__

'Text2TextGenerationPipeline'

In [28]:
# export
@auto_repr
class ParaphraseTransform:
    """Text transform that paraphrases input text as a method of data
    augmentation. This is rather slow so it's recommended to precompute 
    samples and save them, but you could generate samples on the fly if 
    desired. One further downside of that approach is you'll have a huge
    paraphrasing model on the GPU while (presumably) training another model.
    
    Note: This just wraps ParaphrasePipeline to share a more similar interface 
    with the other NLP transforms. Since no preprocessing is required, it's
    basically identical to ParaphrasePipeline.
    """

    def __init__(self, pipe=None, n=1, name='tuner007/pegasus_paraphrase'):
        """
        Parameters
        ----------
        pipe: ParaphrasePipeline or None
        n: int
            Default number of samples to generate. You can override this in
            __call__.
        """
        if pipe:
            self.pipe = pipe
            self.name = pipe.model.config._name_or_path
        else:
            self.pipe = Text2TextGenerationPipeline(
                PegasusForConditionalGeneration.from_pretrained(name),
                PegasusTokenizer.from_pretrained(name),
                device=0 if torch.cuda.is_available() else -1
            )
            self.name = name
        self.n = n
            
        assert type(self.pipe).__name__ == 'Text2TextGenerationPipeline'
        if 'cuda' not in str(self.pipe.device) and torch.cuda.is_available():
            warnings.warn('The pipeline passed in is not using cuda. '
                          'Did you mean to use the available GPU?')
                
    def _preprocess(self, text):
        """Does nothing (just want shared interface with other transforms)."""
        return text
    
    @add_docstring(PreTrainedModel.generate)
    def __call__(self, text, n=None, **kwargs):
        """
        Parameters
        ----------
        text: str or Iterable[str]
            Raw text to transform.
        n: int or None
            If None, use the default self.n.
        kwargs: any
            Additional kwargs are passed to the model's text generation 
            method. Its docstring is included below for convenience.
            
        Returns
        -------
        list: either a list of n strings (if input text is a single string) 
        or a list of lists, each of length n.
        """
        n = n or self.n
        rows = [row['generated_text'] for row in 
                self.pipe(text, num_return_sequences=n, **kwargs)]
        if listlike(text): 
            rows = [rows[i*n:(i+1)*n] for i in range(len(text))]
        return rows

## Test on data

In [351]:
lines = [line.strip() for line 
         in '\n'.join(load(f'/Users/hmamin/data/bbc/tech/{n:03}.txt') 
         for n in range(1, 402)).split('.') if line]
len(lines)

9887

In [544]:
m_res = m_tfm(lines[:100], errors='warn')



In [549]:
len(flatten(m_res))

500

In [545]:
m_res[:2]

[["Ink helps drive democracy in Asia The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and fingerprint readers in the country's elections as part of a drive to prevent multiple voting",
  "Ink helps drive democracy in Asia The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and electronic readers in the country's elections as part of a drive to prevent multiple voting",
  "Ink helps drive democracy in Asia The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and pen readers in the country's elections as part of a drive to prevent multiple voting",
  "Ink helps drive democracy in Asia The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and paper readers in the country's elections as part of a drive to prevent multiple voting",
  "Ink helps drive democracy in Asia The Kyrgyz Republic, a sma

In [543]:
g_res = g_tfm(lines[:100])

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad

In [548]:
len(flatten(g_res))

100

In [550]:
lines[:2]

["Ink helps drive democracy in Asia\n\nThe Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting",
 'This new technology is causing both worries and guarded optimism among different sectors of the population']

In [546]:
g_res[:2]

 ['This new technology is causing both worries and guarded optimism among different groups in Europe about cybersecurity," said']]

In [558]:
p_res = p_tfm(lines[:25])

In [559]:
len(flatten(p_res))

25

In [560]:
lines[:5]

["Ink helps drive democracy in Asia\n\nThe Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting",
 'This new technology is causing both worries and guarded optimism among different sectors of the population',
 'In an effort to live up to its reputation in the 1990s as "an island of democracy", the Kyrgyz President, Askar Akaev, pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections',
 'The US government agreed to fund all expenses associated with this decision',
 'The Kyrgyz Republic is seen by many experts as backsliding from the high point it reached in the mid-1990s with a hastily pushed through referendum in 2003, reducing the legislative branch to one chamber with 75 deputies']

In [561]:
p_res[:5]

[["The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting."],
 ['There are both worries and guarded optimism among different sectors of the population.'],
 ['The law requiring the use of ink during the upcoming Parliamentary and Presidential elections was pushed through by the President in an effort to live up to its reputation as an island of democracy.'],
 ['All expenses associated with this decision will be funded by the US government.'],
 ['The high point of the Kyrgyz Republic was in the mid 1990s when it had a legislative branch with 75 deputies and was seen by many experts as backsliding.']]

Note: with GPU, paraphrase transform takes ~9 seconds to generate variations of 100 input sentences. (Different inputs than used here but I don't think the length differs dramatically.)

## RandomPipeline

Considering the idea of making a pipeline that accepts multiple callables and applies each one in order with a different (or same) probability P. We could construct this with RandomTransform manually but it seems like if everything or most things are random transforms, we might as well obscure this from the user.

Trying to brainstorm what desired interface might look like.

```
pipeline = RandomPipeline(fill_mask, paraphrase, back_translate, p=.5)

pipeline = RandomPipeline(fill_mask, paraphrase, back_translate, 
                          p=[.5, 1., .5])

pipeline = RandomPipeline.from_dict(
    {fill_mask: .5, paraphrase: 1., back_translate: .25}
)
```

Leaning towards no inverse_transform, or at least making it optional. Some
transforms, like ParaphraseTransform, aren't really meant to be reversible.
I suppose I could keep a mapping between original and transformed items but
that might become infeasible as we process more data.

Could also use pipeline.transform(text)

```augmented = pipeline(text)
text = pipeline.inverse_transform(text)
```

In [3]:
from numbers import Real
from incendio.data import RandomTransform

  and should_run_async(code)


In [75]:
def tolist(x, length_like=None, length=None, 
           error_message='x length does not match desired length.'):
    """Helper to let a function accept a single value or a list of values for
    a certain parameter. 
    
    WARNING: if x is a primitive and you specify a length (either via 
    `length_like` or `length`, the resulting list will contain multiple 
    references to the same item). This is mostly intended for use on lists of
    floats or ints so I don't think it's a problem, but keep this in mind when
    considering using this on mutable objects.
    
    Parameters
    ----------
    x: Iterable
        Usually an object that could either be a list/tuple or a primitive,
        depending on what user passed in.
    strict: bool
        If True, returned value will always be a list. If False, we allow
        tuples/sets/etc. to retain their initial type.

    Returns
    -------
    Iterable: list if strict is True or if x is a primitive. If strict is
    False and x is already a tuple/set/something similar, its type will be
    retained.

    Examples
    --------
    def train(lrs):
        lrs = tolist(lrs)
        ...

    >>> train(3e-3)
    >>> train([3e-4, 3e-3])
    """
    if length_like is not None: length = len(length_like)
        
    # Case 1. List-like x
    if listlike(x):
        if length: 
            assert len(x) == length, error_message
        return list(x)
    
    # Case 2. Dict-like x
    if isinstance(x, Mapping):
        raise ValueError('x must not be a mapping. It should probably be a '
                         'primitive (str, int, etc.) or a list-like object '
                         '(tuple, list, set).')
        
    # Case 3. Primitive x
    return [x] * (length or 1)

In [76]:
p = .5
t = [1, 2, 3]
# p = p if listlike(p) and len(p) == len(t) else [p] * len(t)
p

0.5

In [77]:
tolist(.5)

[0.5]

In [78]:
tolist(.5, length=2)

[0.5, 0.5]

In [79]:
tolist(.5, length_like=t)

[0.5, 0.5, 0.5]

In [80]:
tolist({3, 4, -1})

[3, 4, -1]

In [81]:
with assert_raises(AssertionError):
    tolist({3, 4}, length_like=t)

As expected, got AssertionError(x length does not match desired length.).


In [82]:
tolist({3, 4, -1}, length_like=t)

[3, 4, -1]

In [83]:
with assert_raises(ValueError):
    tolist({3: 1})

As expected, got ValueError(x must not be a mapping. It should probably be a primitive (str, int, etc.) or a list-like object (tuple, list, set).).


In [104]:
class RandomPipeline(BasicPipeline):
    """Create a pipeline of callables that are applied in sequence, each with
    some random probability p (this can be the same or different for each 
    step). This is useful for on-the-fly data augmentation (think in the
    __getitem__ method of a torch Dataset).
    """
    
    def __init__(self, *transforms, p=.5):
        """
        Parameters
        ----------
        transforms: callable
            Functions or callable classes that accept a single argument (use
            functools.partial if necessary). They will be applied in the order
            you pass them in.
        p: float or Iterable[float]
            Probability that each transform will be applied. If a single 
            float, each transform will have the same probability. If a list,
            its length msut match the number of transforms passed in: p[0] 
            will be assigned to transforms[0], p[1] to transforms[1], and so 
            on.
        """
        p = tolist(p, transforms, error_message='p must be a float or a list '
                   'with one float for each transform.')
        if any(n <= 0 or n > 1 for n in p):
            raise ValueError('p must be in range (0, 1]. I.E. you can choose '
                             'to always apply a transform, but if you never '
                             'want to apply it there\'s no need to include '
                             'it in the pipeline.')
        
        super().__init__(*[RandomTransform(t, p_) 
                           for t, p_ in zip(transforms, p)])
        
    @classmethod
    def from_dict(cls, t2p):
        """
        Parameters
        ----------
        t2p: dict[callable, float]
            Maps transform to its corresponding probability.
            
        Examples
        --------
        transforms = {times_3: .33,
                      to_string: 1.0,
                      dashed_join: .67,
                      to_upper: .95}
        pipeline = RandomPipeline.from_dict(transforms)
        """
        return cls(*t2p.keys(), p=t2p.values())

In [100]:
def to_upper(t):
    return t.upper()

def times_3(t):
    return t * 3

def join(t, sep='---'):
    return sep.join(t)

In [101]:
t = 'dog'
rp = RandomPipeline(to_upper, times_3, join)
rp

RandomPipeline(
	RandomTransform(to_upper, p=0.5),
	RandomTransform(times_3, p=0.5),
	RandomTransform(join, p=0.5)
)

In [102]:
for i in range(10):
    print(rp(t))

d---o---g---d---o---g---d---o---g
D---O---G---D---O---G---D---O---G
D---O---G
DOGDOGDOG
d---o---g
DOGDOGDOG
dog
dogdogdog
d---o---g
D---O---G---D---O---G---D---O---G


In [103]:
for i in range(10):
    print(rp(t))

D---O---G
dogdogdog
d---o---g
dogdogdog
dogdogdog
D---O---G
dogdogdog
D---O---G---D---O---G---D---O---G
d---o---g---d---o---g---d---o---g
D---O---G---D---O---G---D---O---G


In [88]:
t = 'dog'
rp = RandomPipeline(to_upper, times_3, join, p=[1., .6, 1])
rp

RandomPipeline(
	RandomTransform(to_upper, p=1.0),
	RandomTransform(times_3, p=0.6),
	RandomTransform(join, p=1)
)

In [89]:
for i in range(10):
    print(rp(t))

D---O---G---D---O---G---D---O---G
D---O---G
D---O---G---D---O---G---D---O---G
D---O---G---D---O---G---D---O---G
D---O---G
D---O---G---D---O---G---D---O---G
D---O---G---D---O---G---D---O---G
D---O---G---D---O---G---D---O---G
D---O---G
D---O---G


In [95]:
t = 'dog'
with assert_raises(ValueError):
    rp = RandomPipeline(join, p=0)
    rp

As expected, got ValueError(p must be in range (0, 1]. I.E. you can choose to always apply a transform, but if you never want to apply it there's no need to include it in the pipeline.).


In [97]:
with assert_raises(AssertionError):
    rp = RandomPipeline(to_upper, times_3, join, p=[.2, 1])
    rp

As expected, got AssertionError(p must be a float or a list with one float for each transform.).


In [92]:
t = 'dog'
tfms = {times_3: .33,
        join: .67,
        to_upper: .95}
rp = RandomPipeline.from_dict(tfms)
rp

RandomPipeline(
	RandomTransform(times_3, p=0.33),
	RandomTransform(join, p=0.67),
	RandomTransform(to_upper, p=0.95)
)

In [93]:
rp.funcs

[RandomTransform(times_3, p=0.33),
 RandomTransform(join, p=0.67),
 RandomTransform(to_upper, p=0.95)]

In [94]:
for i in range(10):
    print(rp(t))

DOG
D---O---G---D---O---G---D---O---G
D---O---G---D---O---G---D---O---G
d---o---g
D---O---G
DOG
D---O---G
D---O---G---D---O---G---D---O---G
D---O---G
D---O---G


In [1]:
class BacktranslateTransform:

    names = ['Helsinki-NLP/opus-mt-en-ROMANCE',
             'Helsinki-NLP/opus-mt-ROMANCE-en']
    
    language_codes = {
        'es': 'spanish',
        'it': 'italian',
        'pt': 'portuguese',
        'pt_br': 'portuguese (brazil)',
        'ro': 'romanian',
        'ca': 'catalan',
        'gl': 'galician',
        'pt_BR': 'portuguese (brazil?)',
        'la': 'latin',
        'wa': 'walloon',
        'fur': 'friulian (?)',
        'oc': 'occitan',
        'fr_CA': 'french (canada)',
        'sc': 'sardianian',
        'es_ES': 'spanish',
        'es_MX': 'spanish (mexico)',
        'es_AR': 'spanish (argentina)',
        'es_PR': 'spanish (puerto rico)',
        'es_UY': 'spanish (uruguay)',
        'es_CL': 'spanish (chile)',
        'es_CO': 'spanish (colombia)',
        'es_CR': 'spanish (croatia)',
        'es_GT': 'spanish (guatemala)',
        'es_HN': 'spanish (honduras)',
        'es_NI': 'spanish (nicaragua)',
        'es_PA': 'spanish (panama)',
        'es_PE': 'spanish (peru)',
        'es_VE': 'spanish (venezuela)',
        'es_DO': 'spanish (dominican republic)',
        'es_EC': 'spanish (ecuador)',
        'es_SV': 'spanish (el salvador)',
        'an': 'aragonese',
        'pt_PT': 'portuguese (portugal)',
        'frp': 'franco provencal',
        'lad': 'ladino',
        'vec': 'venetian',
        'fr_FR': 'france (france)',
        'co': 'corsican',
        'it_IT': 'italian (italy)',
        'lld': 'ladin',
        'lij': 'ligurian',
        'lmo': 'lombard',
        'nap': 'neapolitan',
        'rm': 'rhaetian (?)',
        'scn': 'sicilian',
        'mwl': 'mirandese'
    }

    def __init__(self, to_langs, pipes=()):
        if not pipes:
            pipes = [
                TranslationPipeline(
                    model=AutoModelForSeq2SeqLM.from_pretrained(name),
                    tokenizer=AutoTokenizer.from_pretrained(name),
                    device=1 - torch.cuda.is_available()
                ) for name in names
            ]
        self.pipes = pipes
        self.to_langs = tolist(to_langs)

    # def __call__(self, text, to_langs=()):
    #     text = tolist(text)
    #     to_langs = tolist(to_langs) or self.to_langs
    #     for lang in to_langs:
    #         text = [f'>>{lang}<< {t}' for t in text]
    #         text = [row['translation_text'] for row in self.pipes[0](text)]
    #         text = [row['translation_text'] for row in self.pipes[1](text)]
    #     return text

    def __call__(self, text, to_langs=()):
        text = tolist(text)
        to_langs = tolist(to_langs) or self.to_langs
        steps = []
        for lang in to_langs:
            text = [f'>>{lang}<< {t}' for t in text]
            text = [row['translation_text'] for row in self.pipes[0](text)]
            text = [row['translation_text'] for row in self.pipes[1](text)]
            steps.append(text)
        return steps

    def __repr__(self):
        lang_str = ", ".join(repr(lang) for lang in self.to_langs)
        return f'{func_name(self)}(to_langs=[{lang_str}])'