In [2]:
# default_exp nlp

# NLP

> API details.

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
# export
from bs4 import BeautifulSoup
from collections import Counter
from collections.abc import Iterable
from functools import partial
from multipledispatch import dispatch
import multiprocessing
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_is_fitted
import spacy
from textblob import TextBlob
import torch
from tqdm.auto import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, \
    PegasusTokenizerFast, Text2TextGenerationPipeline, \
    AutoModelForSeq2SeqLM, AutoTokenizer, TranslationPipeline, pipeline
from tldextract import extract
from transformers.modeling_utils import PreTrainedModel
import warnings
import wordninja as wn

from htools import save, load, add_docstring, tolist, auto_repr, listlike, \
    flatten, immutify_defaults, ifnone, item, lmap, func_name, valuecheck
from incendio.utils import DEVICE

In [6]:
# Only needed for testing.
from string import ascii_lowercase

from htools import assert_raises

In [None]:
# Nonsense sample text: 25,000 rows that are identical apart from the row
# number. The next cell wraps them in a DataFrame.
text = [
    f"Row {i}: I went, yesterday; she wasn't here after school? Today. --2"
    for i in range(25_000)
]

In [None]:
# Put the sample rows in a single-column DataFrame (column 'a') and preview
# the last five rows.
df = pd.DataFrame(text, columns=['a'])
df.tail()

Unnamed: 0,a
24995,"Row 24995: I went, yesterday; she wasn't here ..."
24996,"Row 24996: I went, yesterday; she wasn't here ..."
24997,"Row 24997: I went, yesterday; she wasn't here ..."
24998,"Row 24998: I went, yesterday; she wasn't here ..."
24999,"Row 24999: I went, yesterday; she wasn't here ..."


In [None]:
# export
# Factory for the default spaCy pipeline: calling `tokenizer()` runs
# spacy.load with name='en_core_web_sm' and the 'ner', 'parser' and 'tagger'
# components disabled. Nothing is loaded until it is called.
tokenizer = partial(spacy.load, name='en_core_web_sm',
                    disable=('ner', 'parser', 'tagger'))

In [None]:
# Load the default pipeline once and keep it in a module-level name.
NLP = tokenizer()

In [None]:
# export
def tokenize(text, nlp):
    """Split one string into word tokens.

    Parameters
    ----------
    text: str
        A piece of text to tokenize.
    nlp: spacy tokenizer, e.g. spacy.lang.en.English
        Called on `text`; each item it yields must expose a `.text`
        attribute. There is no default, so a pipeline must always be passed
        in. Pipelines with extra components enabled (NER, parsing, tagging)
        may slow this function down dramatically.

    Returns
    -------
    list[str]: The text of every token, in the order `nlp` produced them.
    """
    words = []
    for token in nlp(text):
        words.append(token.text)
    return words

In [None]:
# export
def tokenize_many(rows, nlp=None, chunk=1_000):
    """Word tokenize many strings in parallel with a multiprocessing pool
    created with default settings, showing a progress bar while results
    arrive.

    Parameters
    ----------
    rows: sized sequence of str
        The strings to tokenize, e.g. a list or a DataFrame column. `len()`
        is called on it to size the progress bar.
    nlp: spacy tokenizer, e.g. spacy.lang.en.English
        Pipeline handed to `tokenize` for every row. If omitted (or falsy),
        `tokenizer()` is called to build the default one. Any spacy
        tokenizer can be passed in, but pipelines with more components
        enabled may slow this function down dramatically.
    chunk: int
        Number of rows sent to a worker at a time (the `chunksize` of
        `Pool.imap`). The default of 1,000 is usually fine; decrease it for
        extremely long texts when memory is limited. Very small values may
        increase processing time, and larger values generally make the
        progress bar update more choppily.

    Returns
    -------
    list[list[str]]: One list of word tokens per input string, in input
    order.
    """
    if not nlp:
        nlp = tokenizer()
    worker = partial(tokenize, nlp=nlp)
    n_rows = len(rows)
    with multiprocessing.Pool() as pool:
        # imap is lazy and order-preserving; wrapping it in tqdm advances the
        # bar as each result is consumed.
        stream = pool.imap(worker, rows, chunksize=chunk)
        tokenized = list(tqdm(stream, total=n_rows))
    return tokenized

In [None]:
# ~5-6 seconds
# `tokenize` has no default for `nlp`, so pass the loaded pipeline through
# Series.apply's keyword arguments (the bare call raised a TypeError).
x = df.a.apply(tokenize, nlp=NLP)

In [None]:
# ~1-2 seconds
# Multiprocessing version. `nlp` is omitted, so tokenize_many builds the
# default pipeline itself via `tokenizer()`.
x = tokenize_many(df.a)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [3]:
# export
class Vocabulary:

    def __init__(self, w2idx, w2vec=None, idx_misc=None, corpus_counts=None,
                 all_lower=True):
        """Defines a vocabulary object for NLP problems, allowing users to
        encode text with indices or embeddings.

        Parameters
        -----------
        w2idx: dict[str, int]
            Dictionary mapping words to their integer index in a vocabulary.
            The indices must allow for idx_misc to be added to the dictionary,
            so in the default case this should have a minimum index of 2. If
            a longer idx_misc is passed in, the minimum index would be larger.
            An empty dict is accepted and yields a vocabulary containing only
            the idx_misc tokens.
        w2vec: dict[str, np.array]
            Dictionary mapping words to their embedding vectors stored as
            numpy arrays (optional). The dict is shallow-copied, so the
            caller's object is never modified.
        idx_misc: dict
            A dictionary mapping non-word tokens to indices. If none is passed
            in, a default version will be used with keys for unknown tokens
            and padding. A customized version might pass in additional tokens
            for repeated characters or all caps, for example.
        corpus_counts: collections.Counter
            Counter dict mapping words to their number of occurrences in a
            corpus (optional).
        all_lower: bool
            Specifies whether the data you've passed in (w2idx, w2vec, i2w) is
            all lowercase. Note that this will NOT change any of this data. If
            True, it simply lowercases user-input words when looking up their
            index or vector.
        """
        if not idx_misc:
            idx_misc = {'<PAD>': 0,
                        '<UNK>': 1}
        self.idx_misc = idx_misc
        # Check that space has been left for misc keys. An empty w2idx has no
        # minimum index to check (min() would raise on it).
        if w2idx:
            assert len(idx_misc) == min(w2idx.values()), (
                'The smallest index in w2idx must equal len(idx_misc) so the '
                'misc tokens can occupy the lower indices.'
            )

        # Core data structures.
        self.w2idx = {**self.idx_misc, **w2idx}
        self.i2w = [word for word, idx in sorted(self.w2idx.items(),
                                                 key=lambda x: x[1])]
        # Copy so that inserting '<UNK>' below does not mutate the caller's
        # dict.
        self.w2vec = dict(w2vec) if w2vec else dict()

        # Miscellaneous other attributes.
        if w2vec:
            # Embedding size is read off the vector of the highest-index word.
            self.dim = len(w2vec[self[-1]])
        else:
            self.dim = 1
        self.corpus_counts = corpus_counts
        self.embedding_matrix = None
        # Unknown words map to a vector of zeros.
        self.w2vec['<UNK>'] = np.zeros(self.dim)
        self.all_lower = all_lower

    @classmethod
    def from_glove_file(cls, path, max_lines=float('inf'), idx_misc=None):
        """Create a new Vocabulary object by loading GloVe vectors from a text
        file. The embeddings are all lowercase so the user does not have the
        option to set the all_lower parameter.

        Parameters
        -----------
        path: str
            Path to file containing glove vectors. Each line holds a word
            followed by its space-separated vector values. The file is read
            as UTF-8.
        max_lines: int, float (optional)
            Loading the GloVe vectors can be slow, so for testing purposes
            it can be helpful to read in a subset. If no value is provided,
            every line in the file will be read in.
        idx_misc: dict
            Map non-standard tokens to indices. See constructor docstring.

        Returns
        --------
        Vocabulary
        """
        w2idx = dict()
        w2vec = dict()
        misc_len = 2 if not idx_misc else len(idx_misc)

        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                word, *values = line.strip().split(' ')
                # Word indices start right after the misc tokens.
                w2idx[word] = i + misc_len
                # The `np.float` alias was removed from NumPy; the builtin
                # float gives the same float64 dtype.
                w2vec[word] = np.array(values, dtype=float)

        return cls(w2idx, w2vec, idx_misc)

    @classmethod
    def from_tokens(cls, tokens, idx_misc=None, all_lower=True):
        """Construct a Vocabulary object from a list or array of tokens.
        Words are indexed by descending corpus frequency, starting right
        after the idx_misc tokens.

        Parameters
        -----------
        tokens: list[str]
            The word-tokenized corpus.
        idx_misc: dict
            Map non-standard tokens to indices. See constructor docstring.
        all_lower: bool
            Specifies whether your tokens are all lowercase.

        Returns
        --------
        Vocabulary
        """
        misc_len = 2 if not idx_misc else len(idx_misc)
        counts = Counter(tokens)
        w2idx = {word: i for i, (word, freq)
                 in enumerate(counts.most_common(), misc_len)}
        return cls(w2idx, idx_misc=idx_misc, corpus_counts=counts,
                   all_lower=all_lower)

    @staticmethod
    def from_pickle(path):
        """Load a previously saved Vocabulary object.

        Parameters
        -----------
        path: str
            Location of pickled Vocabulary file.

        Returns
        --------
        Vocabulary
        """
        # NOTE(review): presumably this unpickles the file, so only load
        # files from trusted sources - confirm against htools.load.
        return load(path)

    def save(self, path, verbose=True):
        """Pickle Vocabulary object for later use. We can then quickly load
        the object using Vocabulary.from_pickle(path), which can be much
        faster than re-computing everything when the vocab size becomes large.

        Parameters
        -----------
        path: str
            Where to save the output file.
        verbose: bool
            If True, print message showing where the object was saved to.
        """
        save(self, path, verbose)

    def filter_tokens(self, tokens, max_words=None, min_freq=0, inplace=False,
                      recompute=False):
        """Filter your vocabulary by specifying a max number of words or a min
        frequency in the corpus. When done in place, this also sorts vocab by
        frequency with more common words coming first (after idx_misc).

        Parameters
        -----------
        tokens: list[str]
            A tokenized list of words in the corpus (must be all lowercase
            when self.all_lower=True, such as when using GloVe vectors). There
            is no need to hold out test data here since we are not using
            labels.
        max_words: int (optional)
            Provides an upper threshold for the number of words in the
            vocabulary. If no value is passed in, no maximum limit will be
            enforced.
        min_freq: int (optional)
            Provides a lower threshold for the number of times a word must
            appear in the corpus to remain in the vocabulary. If no value is
            passed in, no minimum limit will be enforced.

            Note that we can specify values for both max_words and min_freq
            if desired. If no values are passed in for either, no pruning of
            the vocabulary will be performed.
        inplace: bool
            If True, will change the object's attributes
            (w2idx, w2vec, and i2w) to reflect the newly filtered vocabulary.
            If False, will not change the object, but will simply compute word
            counts and return what the new w2idx would be. This can be helpful
            for experimentation, as we may want to try out multiple values of
            min_freq to decide how many words to keep. After the first call,
            the attribute corpus_counts can also be examined to help determine
            the desired vocab size.
        recompute: bool
            If True, will calculate word counts from the given tokens. If
            False (the default), this will use existing counts if there are
            any.

            The idea is that if we call this method, then realize we want
            to change the corpus, we should calculate new word counts.
            However, if we are simply calling this method multiple times on
            the same corpus while deciding on the exact vocab size we want,
            we should not recompute the word counts.

        Returns
        --------
        dict or None: When called inplace, nothing is returned. When not
        inplace, the filtered word-to-index dict (idx_misc entries first,
        then the surviving words by descending frequency) is returned. Note
        that corpus_counts is stored on the object in both cases whenever
        counts are (re)computed.
        """
        misc_len = len(self.idx_misc)
        if recompute or not self.corpus_counts:
            self.corpus_counts = Counter(tokens)
        # most_common() is sorted by descending frequency, so the words that
        # pass min_freq form a prefix and their indices stay contiguous.
        filtered = {word: i for i, (word, freq)
                    in enumerate(self.corpus_counts.most_common(max_words),
                                 misc_len)
                    if freq >= min_freq}
        filtered = {**self.idx_misc, **filtered}

        if inplace:
            # Relies on python3.7 dicts retaining insertion order.
            self.i2w = list(filtered.keys())
            self.w2idx = filtered
            self.w2vec = {word: self.vector(word) for word in filtered}
        else:
            return filtered

    def build_embedding_matrix(self, inplace=False):
        """Create a 2D numpy array of embedding vectors where row[i]
        corresponds to word i in the vocabulary. This can be used to
        initialize weights in the model's embedding layer.

        Parameters
        -----------
        inplace: bool
            If True, will store the output in the object's embedding_matrix
            attribute. If False (default behavior), will simply return the
            matrix without storing it as part of the object. In the
            recommended case where inplace==False, we can store the output
            in another variable which we can use to initialize the weights in
            Torch, then delete the object and free up memory using
            gc.collect().

        Returns
        --------
        np.array or None: The matrix of shape (len(self), self.dim) when not
        inplace, otherwise None.
        """
        emb = np.zeros((len(self), self.dim))
        for i, word in enumerate(self):
            emb[i] = self.vector(word)

        if inplace:
            self.embedding_matrix = emb
        else:
            return emb

    def idx(self, word):
        """This will map a word (str) to its index (int) in the vocabulary.
        If a string is passed in and the word is not present, the index
        corresponding to the <UNK> token is returned.

        Parameters
        -----------
        word: str
            A word that needs to be mapped to an integer index.

        Returns
        --------
        int: The index of the given word in the vocabulary.

        Examples
        ---------
        >>> vocab.idx('the')
        2
        """
        # Misc tokens such as '<PAD>' are looked up verbatim, never lowercased.
        if self.all_lower and word not in self.idx_misc:
            word = word.lower()
        return self.w2idx.get(word, self.w2idx['<UNK>'])

    def vector(self, word):
        """This maps a word to its corresponding embedding vector. If not
        contained in the vocab, a vector of zeros will be returned.

        Parameters
        -----------
        word: str
            A word that needs to be mapped to a vector.

        Returns
        --------
        np.array
        """
        if self.all_lower and word not in self.idx_misc:
            word = word.lower()
        return self.w2vec.get(word, self.w2vec['<UNK>'])

    def encode(self, text, nlp, max_len, pad_end=True, trim_start=True):
        """Encode text so that each token is replaced by its integer index in
        the vocab.

        Parameters
        -----------
        text: str
            Raw text to be encoded.
        nlp: spacy.lang.en.English
            Spacy tokenizer. Typically want to disable 'parser', 'tagger', and
            'ner' as they aren't used here and slow down the encoding process.
        max_len: int
            Length of output encoding. If text is shorter, it will be padded
            to fit the specified length. If text is longer, it will be
            trimmed.
        pad_end: bool
            If True, add padding to the end of short sentences. If False, pad
            the start of these sentences.
        trim_start: bool
            If True, trim off the start of sentences that are too long. If
            False, trim off the end.

        Returns
        --------
        np.array[int]: Array of length max_len containing integer indices
            corresponding to the words passed in.
        """
        output = np.ones(max_len) * self.idx('<PAD>')
        encoded = [self.idx(tok.text) for tok in nlp(text)]

        # Trim sentence in case it's longer than max_len.
        if len(encoded) > max_len:
            if trim_start:
                encoded = encoded[len(encoded) - max_len:]
            else:
                encoded = encoded[:max_len]

        # Overwrite padding at start or end, depending on choice of pad_end.
        if pad_end:
            output[:len(encoded)] = encoded
        else:
            output[max_len-len(encoded):] = encoded
        return output.astype(int)

    def decode(self, idx, join=True, sep=' '):
        """Convert a list of indices to a string or list of words/tokens.

        Parameters
        -----------
        idx: list[int]
            A list of integers indexing into the vocabulary. This will often
            be the output of the encode() method.
        join: bool
            If True, return a single string. If False, return a list of
            strings.
        sep: str
            If join is True, this determines what character is used to join
            tokens. Word tokens will usually be joined by a space, but some
            tokenization schemes include spaces and can be joined with an
            empty string ('').

        Returns
        --------
        str or list[str]: A string or list of strings (words/tokens)
        reconstructed by indexing into the vocabulary.
        """
        tokens = [self[i] for i in idx]
        if join: return sep.join(tokens)
        return tokens

    def __getitem__(self, i):
        """This will map an index (int) to a word (str).

        Parameters
        -----------
        i: int
            Integer index for a word.

        Returns
        --------
        str: Word corresponding to the given index.

        Examples
        ---------
        >>> vocab = Vocabulary(w2idx, w2vec)
        >>> vocab[1]
        '<UNK>'
        """
        return self.i2w[i]

    def __len__(self):
        """Number of words in vocabulary (including the idx_misc tokens)."""
        return len(self.w2idx)

    def __iter__(self):
        """Iterate over the words in w2idx in insertion order."""
        yield from self.w2idx

    def __contains__(self, word):
        """Exact (case-sensitive) membership test against w2idx."""
        return word in self.w2idx

    def __eq__(self, obj):
        """Two vocabularies are equal when all instance attributes other than
        w2vec and embedding_matrix compare equal. Those two are skipped
        because they hold numpy arrays, which do not reduce to a single
        boolean under ==.
        """
        if not isinstance(obj, Vocabulary):
            return False

        ignore = {'w2vec', 'embedding_matrix'}
        # Sentinel guarantees a mismatch when `obj` lacks one of our
        # attributes instead of raising AttributeError.
        missing = object()
        return all(value == getattr(obj, attr, missing)
                   for attr, value in vars(self).items()
                   if attr not in ignore)

    def __repr__(self):
        msg = f'Vocabulary({len(self)} words'
        if self.dim > 1:
            msg += f', {self.dim}-D embeddings'
        return msg + ')'

In [None]:
# export
def domain(url, strip_www=True, mode='registered_domain'):
    """Pull a domain-like component out of a URL.

    Parameters
    ----------
    url: str
        The URL to parse. It is passed to tldextract's `extract`.
    strip_www: bool
        If True, remove a leading 'www.' from the result. Only a prefix is
        removed, so a 'www.' occurring later in the name is left untouched.
    mode: str
        Name of the attribute to read from the object `extract` returns.
        The default is 'registered_domain'.

    Returns
    -------
    str: The requested component, optionally without a leading 'www.'.
    """
    prefix = 'www.'
    domain_ = getattr(extract(url), mode)
    if strip_www and domain_.startswith(prefix):
        domain_ = domain_[len(prefix):]
    return domain_

In [11]:
# export
def domains_from_google_search(term, drop_probable_defaults=True):
    """Get domains from first page of google search for a given term.

    Parameters
    ----------
    term: str
        Term to search for. Runs of whitespace are collapsed to single
        spaces and the query is URL-encoded by requests.
    drop_probable_defaults: bool
        If True, remove two domains which seem to be present on all google
        search results pages. I didn't see a simple identifier for which links
        on the page are search results so we're left with this clumsy
        filtering method (setting this to True means if 'accounts.google.com'
        is a legitimate search result, we'd still remove it).

    Returns
    -------
    list[str]: Unique domain names (no www) found on the first page of search
    results, in the order they first appear on the page.
    """
    prefix = '/url?q='
    # Let requests build the query string so characters such as '&' or '#'
    # in the term are escaped instead of corrupting the URL.
    r = requests.get('http://www.google.com/search',
                     params={'q': ' '.join(term.split())})
    links = BeautifulSoup(r.content, 'lxml').find_all('a')
    # Anchors without an href are skipped rather than raising KeyError.
    hrefs = (link.get('href') or '' for link in links)
    # Use dict to remove duplicates while maintaining order.
    domains = dict.fromkeys(domain(href[len(prefix):])
                            for href in hrefs
                            if href.startswith(prefix))
    if drop_probable_defaults:
        for d in ('accounts.google.com', 'support.google.com'):
            domains.pop(d, None)
    return list(domains)

In [13]:
# Live request to Google: the output below is a snapshot and may differ when
# the cell is re-run.
domains_from_google_search('protein powder')

['healthline.com',
 'webmd.com',
 'health.harvard.edu',
 'amazon.com',
 'prevention.com',
 'health.clevelandclinic.org',
 'eatthis.com',
 'medicalnewstoday.com']

In [7]:
# export
class Embeddings:
    """Embeddings object. Lets us easily map word to index, index to
    word, and word to vector. We can use this to find similar words,
    build analogies, or get 2D representations for plotting. Generally, 
    user-facing methods let us pass in strings, while internal versions (same 
    name except prefixed with an underscore) allow us to pass in vectors.
    """

    def __init__(self, mat, w2i, pca=None):
        """Validate the word-to-index mapping, lowercase its keys, build the
        index-to-word list, and set up the 2D PCA projection.

        Parameters
        ----------
        mat: np.array
            2D numpy array of embeddings where row i corresponds to ID i
            in w2i.
        w2i: dict[str, int]
            Dictionary mapping word to its index in the vocabulary. Its
            values must be exactly 0 through max(values) with no gaps. Keys
            are stored lowercased.
        pca: sklearn.decomposition.PCA or None
            If provided, this should be a PCA object with 2 components that
            was previously fit on `mat`. If None, a new object will be created
            and fit. This will let us plot embeddings in a way humans can
            visually parse.

        Raises
        ------
        ValueError: If the values of w2i are not exactly 0..max, or if
        lowercasing the keys makes two of them collide (e.g. "Dog" and
        "dog").
        """
        self.mat = mat
        max_id = max(w2i.values())
        expected_ids = list(range(max_id + 1))
        # First check insertion order; if that fails, distinguish "merely out
        # of order" (warn) from "does not cover 0..max exactly" (error).
        if list(w2i.values()) != expected_ids:
            if sorted(w2i.values()) == expected_ids:
                warnings.warn(
                    'Your w2i dict is out of order (where ordered would mean '
                    'the first key has id 0, the second has id 1, etc.). '
                    'This should technically be fine but we recommend fixing '
                    'it to be safe.'
                )
            else:
                raise ValueError('Your w2i dict has missing indices. '
                                 'We do not currently support gaps.')
            
        # Lowercase every key. A shorter dict afterwards means two keys
        # differed only by case and collapsed into one.
        self.w2i = {k.lower(): v for k, v in w2i.items()}
        if self.w2i != w2i:
            if len(self.w2i) == len(w2i):
                warnings.warn(
                    'Your w2i dict contains 1 or more uppercase characters. '
                    'Our current implementation force-lowercases everything. '
                    'You don\'t have any collisions (e.g. "Dog" and "dog") so '
                    'this should be okay, just keep this behavior in mind.'
                )
            else:
                raise ValueError(
                    'Our current implementation force-lowercases your w2i '
                    'dict and yours appears to contain a collision (e.g. '
                    '"Dog" and "dog"). We tentatively plan to allow cased '
                    'keys in the future.'
                )
                
        # Reverse lookup: words ordered by their index.
        self.i2w = [w for w, i in 
                    sorted(self.w2i.items(), key=lambda x: x[1])]
        # NOTE(review): i2w holds one entry per item of self.w2i, so these two
        # lengths look like they always match and this warning appears
        # unreachable; duplicate indices seem to be rejected by the ValueError
        # above instead - confirm the intended behavior.
        if len(self.w2i) != len(self.i2w):
            warnings.warn(
                'Some keys in your w2i share an index. Mapping keys to IDs '
                'should still work (if this is your intent) but reversing the '
                'operation may produce unexpected results (e.g. if "dog" and '
                '"bulldog" both map to index 0, it\'s unclear whether index 0 '
                'should be decoded as "dog" or "bulldog").'
            )
            
        # Unpacking requires `mat` to be 2-dimensional.
        self.n_embeddings, self.dim = self.mat.shape
        # Sets "pca" and "mat_2d" attributes.
        self._validate_or_fit_pca(pca)
        
    def _validate_or_fit_pca(self, pca=None):
        """Set `self.pca` and `self.mat_2d` from the current embedding matrix.

        This lives in its own method because inplace operations on the
        embedding matrix (e.g. self.normed) need to refit PCA afterwards.

        Parameters
        ----------
        pca: None or sklearn.decomposition.PCA
            When given, it must already be fitted (verified with sklearn's
            check_is_fitted). When None, a new 2-component PCA is fit on
            `self.mat`.
        """
        if pca is not None:
            check_is_fitted(pca)
        else:
            pca = PCA(n_components=2).fit(self.mat)
        self.pca = pca
        # Project the full matrix with the (new or supplied) PCA.
        self.mat_2d = pca.transform(self.mat)
        
    @classmethod
    def from_text_file(cls, path, max_words=float('inf'), print_freq=10_000):
        """Create a new Embeddings object from a raw text file using the
        GloVe format (each row contains a word and its embedding as 
        space-separated floats).

        Parameters
        ----------
        path: str
            Location of the text file containing GloVe vectors. It is read
            as UTF-8.
        max_words: int, float
            Set maximum number of words to read in from file. This can be used
            during development to reduce wait times when loading data.
        print_freq: int or None
            Print the line number and word every `print_freq` lines as a
            progress indicator. Pass 0 or None to disable printing.
            
        Returns
        -------
        Embeddings: Newly instantiated object.
        """
        w2i = dict()
        mat = []
        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # Faster testing
                if i >= max_words: break
                word, *nums = line.strip().split()
                # The line number doubles as the word's row index.
                w2i[word] = i
                mat.append(np.array(nums, dtype=float))
                # Guard against a falsy print_freq, which would otherwise
                # raise on the modulo.
                if print_freq and i % print_freq == 0: print(i, word)
        return cls(np.array(mat), w2i)
    
    @classmethod
    def from_word2vec(cls, w2vec, w2i=None):
        """Create an Embeddings object from a word-to-vector mapping.

        Parameters
        ----------
        w2vec: dict[str, np.array]
            Maps each word to its embedding vector.
        w2i: dict[str, int] or None
            Maps each word to its row index. If None, words are indexed in
            the iteration order of `w2vec`. If provided, rows are stacked in
            order of ascending index and every key of `w2i` must be present
            in `w2vec`.

        Returns
        -------
        Embeddings: Newly instantiated object.
        """
        if w2i is None:
            rows = list(w2vec.values())
            mat = np.vstack(rows)
            w2i = {word: idx for idx, word in enumerate(w2vec)}
        else:
            by_index = sorted(w2i.items(), key=lambda pair: pair[1])
            mat = np.vstack([w2vec[word] for word, _ in by_index])
        return cls(mat=mat, w2i=w2i)

    @classmethod
    def from_pickle(cls, path):
        """Rebuild an Embeddings object from data that was previously written
        by `save`, avoiding repeated computation.

        Parameters
        ----------
        path: str
            Location of pickle file.

        Returns
        -------
        Embeddings: Newly instantiated object. The loaded data is passed to
        the constructor as keyword arguments.
        """
        # NOTE(review): presumably this unpickles the file, so only load
        # files from trusted sources - confirm against htools.load.
        stored = load(path)
        return cls(**stored)

    def save(self, path, verbose=True):
        """Write the embedding matrix, word-to-index dict, and fitted PCA to
        a file so the object can be rebuilt later without running PCA and
        building the embedding matrix again.

        Parameters
        ----------
        path: str
            Path that object will be saved to.
        verbose: bool
            Forwarded to the `save` helper.

        Returns
        -------
        None
        """
        # mat_2d is left out on purpose: pca can quickly transform `mat`.
        payload = {'mat': self.mat, 'w2i': self.w2i, 'pca': self.pca}
        save(payload, path, verbose=verbose)
        
    def normed(self, inplace=False):
        """Scale every embedding by its norm, as computed by `self.norm`.

        Parameters
        ----------
        inplace: bool
            If True, overwrite `self.mat` with the scaled matrix and refit
            PCA on it. If False, leave this object untouched.

        Returns
        -------
        Embeddings or None: When inplace is False, a new Embeddings object
        with the same indices and the scaled vectors. When True, None.
        """
        row_norms = self.norm(self.mat)
        unit_mat = self.mat / row_norms[:, None]
        if not inplace:
            return type(self)(unit_mat, self.w2i)
        self.mat = unit_mat
        # The matrix changed, so the 2D projection has to be recomputed.
        self._validate_or_fit_pca()
    
    def subset(self, n, recompute_2d=True):
        """Build a smaller Embeddings object from the first n embeddings.
        Handy for seeing how other methods (e.g. nearest_neighbors) behave
        with a smaller vocabulary.

        Parameters
        ----------
        n: int
            Embeddings with indices 0 through n-1 are kept.
        recompute_2d: bool
            If True, a new PCA is fit using only the subset, so the excluded
            embeddings no longer influence the 2D representation. If False,
            the existing fitted PCA is reused for the subset.

        Returns
        -------
        Embeddings: New object restricted to the first n embeddings.
        """
        by_index = sorted(self.w2i.items(), key=lambda pair: pair[1])
        kept = {word: idx for word, idx in by_index if idx < n}
        pca = self.pca if not recompute_2d else None
        return type(self)(self.mat[:n], kept, pca=pca)

    @dispatch(str)
    def vec(self, word):
        """Fetch the embedding for a single word.

        Parameters
        ----------
        word: str
            Input word to look up embedding for.

        Returns
        -------
        np.array or None: The row of `self.mat` at the index `self.get(word)`
            returns, or None when that lookup yields None.
        """
        idx = self.get(word)
        return None if idx is None else self.mat[idx]
        
    @dispatch(Iterable)
    def vec(self, words):
        """Fetch the embeddings for several words and stack them into one
        numpy array, one row per input word.

        Parameters
        ----------
        words: list[str]
            Input words to look up embeddings for.

        Returns
        -------
        np.array: Embeddings corresponding to the input words.
        Shape (len(words), emb.dim).
        """
        # Index with self[word] instead of delegating to the single-word
        # `vec`: we want to ensure all words are present so the output has
        # one row per input word.
        rows = [self.mat[self[word]] for word in words]
        return np.vstack(rows)

    @dispatch(str)
    def vec_2d(self, word):
        """Fetch the PCA-compressed (2D) embedding for a single word.

        Parameters
        ----------
        word: str
            Input word to look up.

        Returns
        -------
        np.array or None: The row of `self.mat_2d` at the index
            `self.get(word)` returns, or None when that lookup yields None.
        """
        idx = self.get(word)
        return None if idx is None else self.mat_2d[idx]
        
    @dispatch(Iterable)
    def vec_2d(self, words):
        """Fetch the PCA-compressed (2D) embeddings for several words and
        stack them into one numpy array, one row per input word.

        Parameters
        ----------
        words: list[str]
            Input words to look up embeddings for.

        Returns
        -------
        np.array: Shape (len(words), 2), since `mat_2d` comes from a
        2-component PCA. Row i corresponds to words[i].
        """
        # Index with self[word] instead of delegating to the single-word
        # `vec_2d`: we want to ensure all words are present so the output
        # has one row per input word.
        rows = [self.mat_2d[self[word]] for word in words]
        return np.vstack(rows)
        
    @staticmethod
    def distance(vec1, vec2, distance='cosine'):
        """Find distance between two vectors.
        
        Parameters
        ----------
        vec1: np.array
        vec2: np.array
        distance: str
            One of ('cosine', 'euclidean', 'manhattan').

        Returns
        -------
        float or iterable: A plain float when the underlying computation
        yields a non-iterable scalar, otherwise the result as returned.

        Raises
        ------
        ValueError: If `distance` is not one of the supported metrics.
        """
        if distance == 'euclidean':
            dists = Embeddings.norm(vec1 - vec2)
        elif distance == 'cosine':
            dists = Embeddings.cosine_distance(vec1, vec2)
        elif distance == 'manhattan':
            dists = Embeddings.manhattan_distance(vec1, vec2)
        else:
            # Previously an unknown metric left `dists` unbound and surfaced
            # as an UnboundLocalError on the return line.
            raise ValueError(
                f'Unknown distance {distance!r}. Expected one of '
                "('cosine', 'euclidean', 'manhattan')."
            )
        # Let arrays have numpy dtypes, but scalars will just be floats.
        return dists if isinstance(dists, Iterable) else float(dists)
    
    def _distances(self, vec, distance='cosine'):
        """Compute the distance between an input vector and the embedding
        matrix by delegating to `self.distance` with `self.mat` as the first
        argument.

        Parameters
        ----------
        vec: np.array
            Vector for the input word.
        distance: str
            Distance metric to use. One of ('euclidean', 'manhattan',
            'cosine'). In a high dimensional space, cosine is often a good
            choice.

        Returns
        -------
        np.array: The i'th value corresponds to the distance to word i in the
            vocabulary.
        """
        metric = distance
        return self.distance(self.mat, vec, distance=metric)

    def nearest_neighbors(self, word, n=5, distance='cosine', digits=3):
        """Find the most similar words to a given word. This wrapper
        allows the user to pass in a word. To pass in a vector, use
        `_nearest_neighbors`.

        Parameters
        ----------
        word: str
            A word that must be in the vocabulary.
        n: int
            Number of neighbors to return.
        distance: str
            Distance method to use when computing nearest neighbors. One of
            ('euclidean', 'manhattan', 'cosine').
        digits: int
            Digits to round output distances to.

        Returns
        -------
        dict[str, float]: Dictionary mapping word to distance.
        """
        # Error handling for words not in vocab.
        if word not in self:
            return None
        return self._nearest_neighbors(self.vec(word), n, distance, digits)

    def _nearest_neighbors(self, vec, n=5, distance='cosine', digits=3,
                           skip_first=True):
        """Find the most similar words to a given word's vector. 
        This is the internal function behind `nearest_neighbors`, so you pass
        in a vector instead of a word.

        Parameters
        ----------
        vec: np.array
        n: int
        distance: str
            One of ('cosine', 'euclidean', 'manhattan').
        digits: int
        skip_first: bool
            If True, the nearest result will be sliced off (this is desirable
            when searching for a word's nearest neighbors, where we don't want
            to return the word itself). When finding analogies or performing
            embedding arithmetic, however, we likely don't want to slice off
            the first result.

        Returns
        -------
        dict[str, float]: Dictionary mapping word to distance.
        """
        dists = self._distances(vec, distance)
        idx = np.argsort(dists)[slice(skip_first, skip_first+n)]
        # First convert to float, otherwise we get np.float32 or np.float64
        # scalars which can cause annoying bugs in APIs or dash apps.
        return {self.i2w[i]: round(float(dists[i]), digits) for i in idx}

    def analogy(self, a, b, c, n=5, **kwargs):
        """Fill in the analogy: A is to B as C is to ___. Note that we always
        treat A and B as valid candidates to fill in the blank. C is
        only considered as a candidate in the trivial case where A=B, in which
        case C should be the first choice.
        
        Parameters
        ----------
        a: str
            First word in analogy.
        b: str
            Second word in analogy.
        c: str
            Third word in analogy.
        n: int
            Number of candidates to return. Note that we specify this
            separately from kwargs since we need to alter its value before
            passing it to `_nearest_neighbors`. This will allow us to remove
            the word c as a candidate if it is returned.
        kwargs: distance (str), digits (int)
            See _nearest_neighbors for details.
            
        Returns
        -------
        list[str]: Best candidates to complete the analogy in descending order
            of likelihood.
        """
        # If any words missing from vocab, arithmetic w/ None throws error.
        try:
            vec = self.vec(b) - self.vec(a) + self.vec(c)
        except TypeError:
            return None

        # Except for trivial edge case, return 1 extra value in case neighbors
        # includes c, which will be removed in these situations.
        a, b, c = a.lower(), b.lower(), c.lower()
        trivial = (a == b)
        neighbors = self._nearest_neighbors(vec, n=n+1-trivial,
                                            skip_first=False, **kwargs)
        if not trivial and c in neighbors:
            neighbors.pop(c)

        # Relies on dicts being ordered in python >= 3.6.
        return list(neighbors)[:n]
    
    def cbow(self, *args):
        """Wrapper to `_cbow` that allows us to pass in strings instead of
        vectors. Computes bag of words vector by averaging vectors for all 
        input words.

        Parameters
        ----------
        args: str
            Multiple words to average over.

        Returns
        -------
        np.array: Average of all input vectors. This will have the same
            embedding dimension as each input.
        """
        vecs = [arg for arg in map(self.vec, args) if arg is not None]
        if vecs:
            return self._cbow(*vecs)

    def _cbow(self, *args):
        """Internal helper for `cbow` method that lets us pass in vectors 
        instead of words.

        Parameters
        ----------
        args: np.array
            Word vectors to average.

        Returns
        -------
        np.array: Average of all input vectors. This will have the same
            embedding dimension as each input.
        """
        return np.mean(args, axis=0)

    def cbow_neighbors(self, *args, n=5, exclude_args=True, **kwargs):
        """Wrapper to `cbow` method. This lets us pass in words, compute their
        average embedding, then return the words nearest this embedding. The
        input words are not considered to be candidates for neighbors (e.g. if
        you input the words 'happy' and 'cheerful', the neighbors returned 
        will not include those words even if they are the closest to the mean
        embedding) unless you set exclude_args=False. The idea here is to
        find additional words that may be similar to the group you've passed 
        in.

        Parameters
        ----------
        args: str
            Input words to average over.
        n: int
            Number of neighbors to return.
        kwargs: distance (str), digits (int)
            See _nearest_neighbors() for details.

        Returns
        -------
        dict[str, float]: Dictionary mapping word to distance from the average
            of the input words' vectors.
        """
        vec_avg = self.cbow(*args)
        if vec_avg is None:
            return
        w2dist = self._nearest_neighbors(vec_avg, n=len(args)+n, 
                                         skip_first=False, **kwargs)

        # Lowercase to help remove duplicates.
        args = set(arg.lower() for arg in args)
        return {word: w2dist[word] for word in 
                [w for w in w2dist if not exclude_args or w not in args][:n]}
    
    @valuecheck
    def matching_keys(self, *terms,
                      mode:('standard', 'regex', 'ninja')='standard'):
        """Find keys (usually URLs, but this works on words too) that contain
        a given term/prefix/regex. This is useful for building theme vectors:
        e.g. find every site related to "games" or "gaming", average their
        embeddings, and optionally grow the group further via nearest
        neighbors.

        Parameters
        ----------
        terms: str
            One or more strings to search for. This can be a word ('gaming'),
            a partial word ('gam'), a regex ('^gam.*').
        mode: str
            standard - Substring matching: a key matches if it contains the
                term anywhere.
            regex - Terms are treated as regular expressions like '^sports'
                and searched for within each key.
            ninja - Substring matching followed by a wordninja check that
                removes spurious matches violating likely word boundaries.
                Mostly useful for things like URLs. E.g. if you search for
                'math' with mode='standard', the URL 'mathiasmiller.com'
                would match. With mode='ninja', it would not because this
                URL seems to refer to a person, not the word 'math'.

        Returns
        -------
        list[str]: Keys (words, URLs, etc.) matching any of the given terms.
        """
        if mode == 'regex':
            match_fn = re.search
        else:
            def match_fn(term, key):
                return term in key

        keys = []
        for key in self:
            if any(match_fn(term, key) for term in terms):
                keys.append(key)
        if mode != 'ninja':
            return keys

        # Splitting with wordninja is a bit slow, so it only runs as a second
        # pass over keys that already contain one of the terms.
        return [key for key in keys
                if any(term in wn.split(key) for term in terms)]
    
    def compare_distances(self, key, distance='cosine', as_df=True,
                          sort_df=True, **vectors):
        """Compare how far a word/domain is from one or more vectors. Intended
        for use with cbow results: e.g. checking if a word is closer to a
        'liberal' vector or a 'conservative' vector.
        
        Parameters
        ----------
        key: str
            Word/domain to analyze.
        distance: str
            One of ('cosine', 'euclidean', 'manhattan'). Determines distance
            method to use.
        as_df: bool
            If True, output is a dataframe. If False, it's a dict mapping 
            keys (strings) to distances (floats).
        sort_df: bool
            If True and as_df is True, the output df will be sorted by 
            distance from closest to furthest (relatively speaking - all 
            results will be relatively close).
        vectors: np.array
            One or more vectors to compare the input key's vector to. These
            are kwargs rather than args because we need names for the 
            resulting df to show which distance corresponds to which vector.

        Returns
        -------
        pd.DataFrame or dict[str, float]: Type depends on value of `as_df`.
        """
        return self._compare_distances(self.vec(key), distance, as_df, 
                                       sort_df, **vectors)
            
    def _compare_distances(self, src_vec, distance='cosine', as_df=True,
                           sort_df=True, **vectors):
        """Internal version of `compare_distances` that accepts a vector
        rather than a word.
        
        Parameters
        ----------
        src_vec: np.array
            Embedding for the word/domain to analyze.
        distance: str
            One of ('cosine', 'euclidean', 'manhattan'). Determines distance
            method to use.
        as_df: bool
            If True, output is a dataframe. If False, it's a dict mapping 
            keys (strings) to distances (floats).
        sort_df: bool
            If True and as_df is True, the output df will be sorted by 
            distance from closest to furthest (relatively speaking - all 
            results will be relatively close).
        vectors: np.array
            One or more vectors to compare the input key's vector to. These
            are kwargs rather than args because we need names for the 
            resulting df to show which distance corresponds to which vector.

        Returns
        -------
        pd.DataFrame or dict[str, float]: Type depends on value of `as_df`.
        """
        d2dist = {k: self.distance(src_vec, v, distance)
                  for k, v in vectors.items()}
        if as_df:
            d2dist = pd.DataFrame.from_dict(d2dist, orient='index',
                                            columns=['score'])
            if sort_df: d2dist = d2dist.sort_values('score', ascending=True)
        return d2dist
    
    @valuecheck
    def semantic_vector(
            self, *queries, n=25, 
            include_queries:('always', 'never', 'auto')='always',
            mode='standard', google_missing=False
    ):
        """Mostly for domains rather than words: create a vector matching the
        "semantic theme" of 1 or more input queries (usually several). 
        For example, to create a "movie" theme, you could pass in 'imdb.com',
        'rottentomatoes.com', and 'letterboxd.com'.

        Parameters
        ----------
        queries: str
            These will often be keys in the Embeddings object (i.e. domains).
            However, we also support words (e.g. "movie"), regular expressions
            (e.g. "^movie*"), or even phrases (e.g. "scary movies"). Some
            combinations of the above are supported: the only limitation as of
            2/12/21 is you must choose a str matching mode (see 
            `self.matching_keys`). So you can pass in a mix of domains, words,
            and phrases, but you can't use string matching for some words and
            regex matching for others. Might be supported in the future if I
            encounter situations where it seems useful.
        n: int
            Number of neighbors to find in `cbow_neighbors` method. Note that
            this won't necessarily be the final number of keys returned - 
            `include_queries` will affect that too.
        include_queries: str
            Determines whether URLs retrieved from the initial step of query
            string matching should be included in results. 'never' is useful
            if you specifically want URLs that DON'T contain the queries 
            (e.g. music-related sites without "music" in the URL), but 
            I suspect 'always' may give better quality results. 'auto' will
            allow these matches to be retained but won't force them to if they
            aren't close to the final theme vector.
        mode: str
            Determines type of string matching used to find initial "seed"
            domains (see `self.matching_keys`). Options are 
            ('standard', 'regex', 'ninja').
        google_missing: bool
            If True, terms that don't yield any string match results will be
            googled and any domains on the first page of results will be used
            if they're in our vocabulary.

        Returns
        -------
        tuple[np.array or None, list[str]]: First item is the "bag of words"
        array created by averaging over n neighbors of the input queries.
        Second item is a list of the neighbor names sorted by similarity in
        decreasing order. If the queries yield no matches at all, the result
        is (None, []).
        """
        # If a query isn't in our embeddings, we assume it's intended to be a
        # str matching query, not a full domain. If it is a full domain, this
        # is a bit inefficient but it won't affect the final list of queries.
        # Don't use a list comprehension because we need to warn when no
        # matches are found since we remove missing queries before calling
        # `cbow_neighbors` (i.e. no built-in warning). This means our final
        # list won't have missing queries, which is nice.
        all_queries = []
        for q in queries:
            if q in self:
                matches = [q]
            else:
                matches = self.matching_keys(q, mode=mode)
                if not matches and google_missing:
                    # Keep only googled domains we have embeddings for.
                    # Unknown domains can't contribute to the theme vector:
                    # if none are known, `cbow_neighbors` returns None and
                    # the steps below would fail, and with
                    # include_queries='always' unknown names would leak into
                    # the returned list.
                    matches = [domain
                               for domain in domains_from_google_search(q)
                               if domain in self]
                if not matches:
                    warnings.warn(f'No matches found for {q}.')
            all_queries.extend(matches)
        if not all_queries:
            warnings.warn('Queries yielded zero matches.')
            # Return None (not vector) for consistency with other methods.
            return None, []

        # Keep as dict for now for O(1) lookup time in case we use huge n.
        matches = self.cbow_neighbors(*all_queries, n=n,
                                      exclude_args=include_queries=='never')

        if include_queries == 'always':
            # Use dict as ordered set: remove duplicates but maintain order.
            matches = dict.fromkeys(
                list(matches) + [q for q in all_queries if q not in matches]
            )
        vec = self.cbow(*matches)
        return vec, list(matches)

    @staticmethod
    def norm(vec):
        """Compute L2 norm of a vector. Euclidean distance between two vectors
        can be found by the operation norm(vec1 - vec2).

        Parameters
        ----------
        vec: np.array
            Input vector.

        Returns
        -------
        float: L2 norm of input vector.
        """
        return np.sqrt(np.sum(vec ** 2, axis=-1))

    @staticmethod
    def manhattan_distance(vec1, vec2):
        """Compute L1 distance between two vectors.

        Parameters
        ----------
        vec1: np.array
        vec2: np.array

        Returns
        -------
        float or np.array: Manhattan distance between vec1 and vec2. If two
            vectors are passed in, the output will be a single number. When
            computing distances between a vector and a matrix, the output
            will be a vector (np.array).
        """
        return np.sum(abs(vec1 - vec2), axis=-1)

    @staticmethod
    def cosine_distance(vec1, vec2):
        """Compute cosine distance between two vectors.

        Parameters
        ----------
        vec1: np.array
        vec2: np.array

        Returns
        -------
        float or np.array: Cosine distance between vec1 and vec2. If two
            vectors are passed in, the output will be a single number. When
            computing distances between a vector and a matrix, the output
            will be a vector (np.array).
        """
        return 1 - (np.sum(vec1 * vec2, axis=-1) /
                    (Embeddings.norm(vec1) * Embeddings.norm(vec2)))

    @dispatch(str)
    def __getitem__(self, key):
        """String indexing acts as a word->index lookup. The key is
        lowercased before the lookup.

        Examples
        --------
        >>> emb['the']
        1
        """
        lowered = key.lower()
        return self.w2i[lowered]
    
    @dispatch((int, slice))
    def __getitem__(self, i):
        """When indexing with an integer or a slice, this acts as an
        index->word lookup into `self.i2w`.

        Examples
        --------
        >>> emb[1]
        'the'

        >>> emb[:3]
        ['a', 'the', 'is']
        """
        return self.i2w[i]
    
    @dispatch(Iterable)
    def __getitem__(self, keys):
        """Index with a list of keys/indices. Each element is looked up on
        its own via `self[...]`, so strings and integers can be mixed, though
        I can't imagine why that would be necessary.

        Examples
        --------
        >>> emb[[1, 100, 7]]
        ['the', 'frog', 'dog']

        >>> emb[['the', 'dog', 'frog']]
        [1, 7, 100]
        """
        results = []
        for key in keys:
            results.append(self[key])
        return results
            
    def get(self, key, default=None):
        """Returns None if word is not present just like dict.get."""
        try:
            return self.w2i[key.lower()]
        except KeyError:
            warnings.warn(f'{key} not in Embeddings.')
            return default

    def __len__(self):
        """Length of the object is its `n_embeddings` attribute."""
        return self.n_embeddings

    def __contains__(self, word):
        return word.lower() in self.w2i

    def __iter__(self):
        """Yields words in vocabulary in insertion order (may differ from
        index order).
        """
        yield from self.w2i.keys()
        
    def __eq__(self, obj):
        if not isinstance(obj, Embeddings):
            return False

        ignore = {'pca'}
        for k, v in vars(obj).items():
            if k in ignore: continue
            v_self = getattr(self, k)
            if isinstance(v, np.ndarray):
                if not np.allclose(v, v_self): return False
            elif v != v_self: return False
        return True

    def __repr__(self):
        """Short summary string showing len(self) and `self.dim`."""
        return f'Embeddings(len={len(self)}, dim={self.dim})'

In [16]:
# 'a' and 'A' collide once keys are lowercased, so construction should raise.
w2i = {'a': 0, 'A': 1, 'b': 2}
mat = np.arange(6).reshape(3, 2)

with assert_raises(ValueError):
    emb = Embeddings(mat, w2i)

As expected, got ValueError(Our current implementation force-lowercases your w2i dict and yours appears to contain a collision (e.g. "Dog" and "dog"). We tentatively plan to allow cased keys in the future.).


In [17]:
# Indices skip 2 (0, 1, 3): gaps in w2i should raise. Reuses `mat` from the
# previous cell.
w2i = {'B': 0, 'c': 1, 'a': 3}
with assert_raises(ValueError):
    emb = Embeddings(mat, w2i)

As expected, got ValueError(Your w2i dict has missing indices. We do not currently support gaps.).


In [18]:
# Mixed-case keys with no collisions and contiguous indices: should construct
# without error.
w2i = {'a': 0, 'B': 1, 'c': 2}
emb = Embeddings(mat, w2i)



In [19]:
# Same mapping, but dict insertion order differs from index order: should
# still construct without error.
w2i = {'B': 1, 'c': 2, 'a': 0, }
emb = Embeddings(mat, w2i)



In [20]:
# Two objects built from the same matrix and mapping should compare equal.
emb2 = Embeddings(mat, w2i)
assert emb == emb2, 'Should evaluate as equal when w2i and mat are equal.'



In [5]:
# export
def back_translate(text, to, from_lang='en'):
    """Translate a piece of text into another language and then back again,
    for data augmentation purposes. This is rate limited but we now have a
    pure ML version in the form of BacktranslateTransform.

    Note that `from_lang` is only passed as the target of the return
    translation; the forward translation is called with `to` alone.

    Parameters
    ----------
    text: str
        Text to back translate.
    to: str
        Intermediate language to translate to before translating back.
    from_lang: str
        Language of input text (usually 'en' for English). The final output
        is translated into this language.

    Returns
    -------
    str: Same language and basically the same content as the original text,
        but usually with slightly altered grammar, sentence structure, and/or
        vocabulary.
    """
    forward = TextBlob(text).translate(to=to)
    round_trip = forward.translate(from_lang=to, to=from_lang)
    return str(round_trip)

In [None]:
# Example: round trip through Spanish ('es').
text = """
Visit ESPN to get up-to-the-minute sports news coverage, scores, highlights and commentary for NFL, MLB, NBA, College Football, NCAA Basketball and more.
"""
back_translate(text, 'es')

'Visit ESPN to get coverage of sports news, scores, highlights and comments from the NFL, MLB, NBA, college football, NCAA basketball and more.'

In [None]:
# Example: same input, round trip through French ('fr').
text = """
Visit ESPN to get up-to-the-minute sports news coverage, scores, highlights and commentary for NFL, MLB, NBA, College Football, NCAA Basketball and more.
"""
back_translate(text, 'fr')

'Visit ESPN for up-to-date sports information, scores, highlights and commentary for the NFL, MLB, NBA, college football, NCAA basketball and more.'

In [None]:
# export
def postprocess_embeddings(emb, d=None):
    """Implements the algorithm from the paper:

    All-But-The-Top: Simple and Effective Post-Processing
    for Word Representations (https://arxiv.org/pdf/1702.01417.pdf)

    There are three steps:
    1. Compute the mean embedding and subtract this from the
    original embedding matrix.
    2. Perform PCA on the mean-adjusted embeddings and extract the top d
    components.
    3. Subtract the projection onto those components from the mean-adjusted
    embeddings. Note that the projection itself is computed from the
    original (not mean-adjusted) matrix.

    Parameters
    ----------
    emb: np.array
        Embedding matrix of size (vocab_size, embedding_length).
    d: int
        Number of components to use in PCA. Defaults to
        embedding_length // 100 as recommended by the paper. A falsy value
        (None or 0) triggers the default.

    Returns
    -------
    np.array: Post-processed embedding matrix with the same shape as `emb`.
    """
    if not d:
        d = emb.shape[1] // 100
    centered = emb - emb.mean(axis=0)
    components = PCA(d).fit(centered).components_
    projection = emb @ components.T @ components
    return centered - projection

In [None]:
# export
def compress_embeddings(emb, new_dim, d=None):
    """Reduce embedding dimension as described in the paper:

    Simple and Effective Dimensionality Reduction for Word Embeddings
    (https://lld-workshop.github.io/2017/papers/LLD_2017_paper_34.pdf)

    The matrix is post-processed, reduced to `new_dim` dimensions with PCA,
    then post-processed a second time.

    Parameters
    ----------
    emb: np.array
        Embedding matrix of size (vocab_size, embedding_length).
    new_dim: int
        Embedding dimension of the compressed output.
    d: int
        Number of components to use in the post-processing
        method described here: https://arxiv.org/pdf/1702.01417.pdf
        Passed unchanged to both calls to `postprocess_embeddings`.

    Returns
    -------
    np.array: Compressed embedding matrix of shape (vocab_size, new_dim).
    """
    cleaned = postprocess_embeddings(emb, d)
    reduced = PCA(new_dim).fit_transform(cleaned)
    return postprocess_embeddings(reduced, d)

## Data Augmentation

In [9]:
# export
@auto_repr
class ParaphraseTransform:
    """Data augmentation transform that paraphrases input text. It is rather
    slow, so precomputing samples and saving them is recommended, though
    samples can be generated on the fly if desired. A further downside of
    on-the-fly generation is that a huge paraphrasing model will sit on the
    GPU while you are (presumably) training another model.

    Other paraphrasing models exist on Model Hub but as of 11/14/2020, none of
    the results compared favorably to this pegasus model, at least based on
    a rough "eyeball check". While smaller and presumably faster, many of
    these appear to require processing a single example at a time which
    diminishes these gains. If you do attempt to use them, you'll likely need
    to write a new class with a preprocessing method that does something like
    the following:
    _preprocess(text) -> 'paraphrase: {text}</s>'
    I'm recording this here because many are missing documentation and it took
    me some time to discover this.
    """

    name = 'tuner007/pegasus_paraphrase'

    def __init__(self, n=1, pipe=None):
        """
        Parameters
        ----------
        n: int
            Default number of samples to generate. You can override this in
            __call__.
        pipe: transformers Text2TextGenerationPipeline or None
            Existing pipeline to reuse. If None, a pipeline for the default
            model (`self.name`) is created, on GPU when one is available.
        """
        if pipe:
            self.pipe = pipe
            self.name = pipe.model.config._name_or_path
        else:
            model = PegasusForConditionalGeneration.from_pretrained(self.name)
            tokenizer = PegasusTokenizer.from_pretrained(self.name)
            device = 0 if torch.cuda.is_available() else -1
            self.pipe = Text2TextGenerationPipeline(model, tokenizer,
                                                    device=device)
        self.n = n

        # Compare by class name to validate the pipeline type.
        assert type(self.pipe).__name__ == 'Text2TextGenerationPipeline'
        on_gpu = 'cuda' in str(self.pipe.device)
        if not on_gpu and torch.cuda.is_available():
            warnings.warn('The pipeline passed in is not using cuda. '
                          'Did you mean to use the available GPU?')

    def _preprocess(self, text):
        """No-op: exists so all transforms share the same interface."""
        return text

    @add_docstring(PreTrainedModel.generate)
    def __call__(self, text, n=None, flat=True, **kwargs):
        """
        Parameters
        ----------
        text: str or Iterable[str]
            Raw text to transform.
        n: int or None
            Number of paraphrases per input. If None, use the default self.n.
        flat: bool
            If True, return flat list of strings. If False and `text` is
            listlike, return list of nested lists where list i contains n
            augmentations of input i.
        kwargs: any
            Additional kwargs are passed to the model's text generation
            method. Its docstring is included below for convenience.

        Returns
        -------
        list: either a list with n strings per input string, or a list of
        lists, each of length n, if flat=False.
        """
        n = n or self.n
        outputs = self.pipe(text, num_return_sequences=n, **kwargs)
        rows = [row['generated_text'] for row in outputs]
        if not listlike(text) or flat:
            return rows
        # Regroup the flat results into one chunk of n per input.
        return [rows[i*n:(i+1)*n] for i in range(len(text))]

In [7]:
# Sample inputs for the augmentation transforms: one string and a list of
# two strings.
text = 'It was a beautiful sunny day and birds were chirping.'
texts = ['Play fun games online for free! Watch your favorite movies and tv '
         'shows here.', 
         'Bill hated school, especially math. His teacher was losing '
         'patience with him.']

In [16]:
# No pipe passed in, so the transform builds its default pegasus pipeline.
p_tfm = ParaphraseTransform()

  self._sock = None
  self._sock = None


In [17]:
# Request 3 paraphrases; extra kwargs like temperature are forwarded to the
# model's generation method.
p_tfm(text, n=3, temperature=10)

['Birds were singing in the sun.',
 'Birds were singing on a nice sunny day.',
 'Birds were singing on a sunny day.']

In [10]:
# export
@auto_repr
class GenerativeTransform:
    """Text transform that truncates a piece of text and completes it using
    a text generation model for the purposes of data augmentation. We
    recommend precomputing samples and saving them for later use, but you
    could generate samples on the fly if desired. Aside from speed, this
    approach also has the drawback of having a text generation model on the 
    GPU while (presumably) training another model.
    """
    
    # Task name handed to `pipeline` when no pipe is supplied. Replaced on the
    # instance by the model's name when the user passes in their own pipe.
    name = 'text-generation'
    
    def __init__(self, n=1, pipe=None):
        """
        Parameters
        ----------
        n: int
            Default number of samples to generate. You can override this in
            __call__.
        pipe: Transformers TextGenerationPipeline or None
            Existing pipeline to reuse. If None, a default 'text-generation'
            pipeline is created, placed on GPU 0 when cuda is available and
            on the CPU (device=-1) otherwise.
        """
        if pipe:
            self.pipe = pipe
            self.name = pipe.model.config._name_or_path
        else:
            self.pipe = pipeline(self.name, 
                                 device=0 if torch.cuda.is_available() 
                                 else -1)
        self.n = n
        
        # Compare by class name so we don't need to import the pipeline class.
        assert type(self.pipe).__name__ == 'TextGenerationPipeline'
        if 'cuda' not in str(self.pipe.device) and torch.cuda.is_available():
            warnings.warn('The pipeline passed in is not using cuda. '
                          'Did you mean to use the available GPU?')
    
    def _preprocess(self, text, drop=None, drop_pct=None, rand_low=None, 
                    rand_high=None, min_keep=3, return_tuple=False):
        """Truncate text so we can generate the ending.
        
        Parameters
        ----------
        text: str or Iterable[str]
            List-like input is processed row by row and a list is returned.
        drop: None or int
            If provided, specifies the number of words to drop. We use a
            simple "split on spaces" strategy since it's fast and simple.
            Drop strategies occur in the signature in order of priority, so
            if this is non-None it will override any values passed in for
            drop_pct or rand_low/rand_high. An explicit 0 means "drop
            nothing".
        drop_pct: float or None
            If provided, this should be a value between 0.0 and 1.0 specifying
            the proportion of words to drop (rounded down to a whole number
            of words).
        rand_low: int or None
            If provided, specifies the minimum number of words to drop. The
            max will be set by rand_high (which must also be provided with
            rand_low; if only one of the two is given, both are ignored). A
            random integer will be selected for each row of text.
        rand_high: int or None
            See rand_low: helps define bounds when randomly truncating rows
            of text. Passed to np.random.randint as the (exclusive) upper
            bound.
        min_keep: int
            The minimum number of words to keep. Sequences of this length or
            shorter will therefore remain un-transformed. You could set this
            to zero to enforce no minimum.
        return_tuple: bool
            If True, return a tuple where the first item is the truncated
            text and the second item is the number of words masked. This is
            rarely needed but it might be helpful if you want to use this for
            some sort of self-supervised pre-training task.

        Returns
        -------
        str, tuple[str, int], or a list of either (for list-like input).
        """
        if listlike(text):
            return [self._preprocess(row, drop, drop_pct, rand_low, rand_high,
                                     min_keep, return_tuple) for row in text]
        
        tokens = text.split()
        n_tokens = len(tokens)
        if n_tokens <= min_keep:
            n_drop = 0
        else:
            # Use `is not None` so an explicit 0 is honored rather than
            # silently falling through to a lower-priority strategy.
            if drop is not None:
                n_drop = drop
            elif drop_pct is not None:
                n_drop = int(drop_pct * n_tokens)
            elif rand_low is not None and rand_high is not None:
                n_drop = np.random.randint(rand_low, rand_high)
            else:
                # Default is to truncate the last 20% of the sequence.
                n_drop = int(np.ceil(.2 * n_tokens))
            # Never drop a negative count or cut below min_keep words. Cast
            # back to a builtin int so callers don't receive a numpy scalar.
            n_drop = int(np.clip(n_drop, 0, n_tokens - min_keep))
            # Slice with an explicit end index: `tokens[:-n_drop]` would
            # return an empty list when n_drop is 0.
            tokens = tokens[:n_tokens - n_drop]
        truncated = ' '.join(tokens)
        return (truncated, n_drop) if return_tuple else truncated
    
    @add_docstring(PreTrainedModel.generate)
    def __call__(self, text, n=None, flat=True, min_length=2, max_length=7, 
                 drop=None, drop_pct=None, rand_low=None, rand_high=None, 
                 min_keep=3, **generate_kwargs):
        """Truncate each input and let the pipeline generate a new ending.

        Parameters
        ----------
        text: str or Iterable[str]
        n: int or None
            Number of samples to generate for each input. Defaults to self.n
            if None.
        flat: bool
            If True, return flat list of strings. If False, return list of 
            nested lists where list i contains n augmentations of input i.
            Only affects list-like input; a single string always yields a
            flat list.
        min_length: int
            Min number of tokens to generate.
        max_length: int
            Max number of tokens to generate. You could set this equal to 
            min_length to enforce a constant number.
        drop: None or int
            If provided, specifies the number of words to drop. We use a
            simple "split on spaces" strategy since it's fast and simple.
            Drop strategies occur in the signature in order of priority, so
            if this is non-None it will override any values passed in for
            drop_pct or rand_low/rand_high.
        drop_pct: float or None
            If provided, this should be a value between 0.0 and 1.0 specifying
            the proportion of words to drop. 
        rand_low: int or None
            If provided, specifies the minimum number of words to drop. The
            max will be set by rand_high (which must also be provided with
            rand_low). A random integer will be selected for each row of text.
        rand_high: int or None
            See rand_low: helps define bounds when randomly truncating rows
            of text.
        min_keep: int
            The minimum number of words to keep. Sequences of this length or
            shorter will therefore remain un-transformed. You could set this
            to zero to enforce no minimum.
        generate_kwargs: any
            Passed along in the pipeline call. For convenience, the docstring
            of the model's `generate` method is provided below.
            
        Returns
        -------
        list: either a list with n strings per input string, or a list of
        lists, each of length n, if flat=False.
        """
        n = n or self.n
        if listlike(text):
            # Rows are handled one at a time because each one needs its own
            # token count to set the generation length bounds.
            res = [self(row, n, flat=flat, min_length=min_length, 
                        max_length=max_length, drop=drop, drop_pct=drop_pct, 
                        rand_low=rand_low, rand_high=rand_high, 
                        min_keep=min_keep, **generate_kwargs) for row in text]
            return flatten(res) if flat else res
    
        text = self._preprocess(text, drop, drop_pct, rand_low=rand_low, 
                                rand_high=rand_high, min_keep=min_keep)
        # `generate` counts current length as part of min_length, so offset
        # both bounds by the number of tokens in the truncated prompt.
        n_curr = len(self.pipe.tokenizer.tokenize(text))
        res = self.pipe(text, min_length=n_curr + min_length,
                        max_length=n_curr + max_length,
                        num_return_sequences=n, **generate_kwargs)
        return [row['generated_text'] for row in res]

In [42]:
# No pipe is passed, so a default 'text-generation' pipeline is created.
# Each call returns 3 samples per input unless n is overridden.
g_tfm = GenerativeTransform(n=3)

In [43]:
# Look at truncation on its own: request dropping 75% of the words in each
# row. The default min_keep=3 caps how many words can actually be removed.
print(texts)
g_tfm._preprocess(texts, drop_pct=.75)

['Play fun games online for free! Watch your favorite movies and tv shows here.', 'Bill hated school, especially math. His teacher was losing patience with him.']


['Play fun games online', 'Bill hated school,']

In [44]:
# Single string: truncate with drop_pct=.75, then generate self.n (3) endings.
g_tfm(text, drop_pct=.75)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


['It was a disaster. It was a disaster in',
 'It was a really rough thing to say," the',
 'It was a different kind of world. The city']

In [45]:
# List input with the default truncation (last 20% of words, rounded up).
# flat=True by default, so all samples come back in one flat list.
g_tfm(texts)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


['Play fun games online for free! Watch your favorite movies and tv series from Disney in this game',
 'Play fun games online for free! Watch your favorite movies and TV shows on Netflix, Amazon Video',
 'Play fun games online for free! Watch your favorite movies and TV shows online at your mobile device',
 'Bill hated school, especially math. His teacher was losing her eyesight due to a small',
 'Bill hated school, especially math. His teacher was losing sight of the fact that his work',
 'Bill hated school, especially math. His teacher was losing touch with him. Even though she']

In [46]:
# flat=False keeps one nested list of samples per input row.
res = g_tfm(texts, flat=False)
res

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[['Play fun games online for free! Watch your favorite movies and streams live for free! Read,',
  'Play fun games online for free! Watch your favorite movies and TV shows play out! Explore The',
  'Play fun games online for free! Watch your favorite movies and play them yourself.\n\nFree'],
 ["Bill hated school, especially math. His teacher was losing sleep over his student's grades,",
  'Bill hated school, especially math. His teacher was losing her job at the time.\n',
  'Bill hated school, especially math. His teacher was losing control of his mind. He wouldn']]

In [47]:
# Override the default n and bound the generated continuation to 3-5 tokens
# beyond the truncated prompt.
g_tfm(texts, n=2, min_length=3, max_length=5)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


["Play fun games online for free! Watch your favorite movies and sports videos and you'll",
 'Play fun games online for free! Watch your favorite movies and television shows in real time',
 'Bill hated school, especially math. His teacher was losing her temper, so he',
 'Bill hated school, especially math. His teacher was losing her job and he just']

In [48]:
# Drop half of the words while lowering the minimum number kept to 2.
g_tfm(text, n=5, drop_pct=.5, min_keep=2)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


["It was a beautiful sunny day. I'm thankful. My",
 'It was a beautiful sunny day in August but we had to',
 'It was a beautiful sunny day, and I went down to',
 'It was a beautiful sunny spot in our neighbourhood. We had',
 'It was a beautiful sunny Friday evening.\n\n"It']

In [4]:
# export
@auto_repr
class FillMaskTransform:    
    """Text transform that masks one or more words in a piece of text and 
    fills them using RoBERTa for the purposes of data augmentation. We
    recommend precomputing samples and saving them for later use, but you
    could generate samples on the fly if desired. In addition to being slow,
    that approach also entails having a mask filling model on the GPU while 
    (presumably) training another model.
    """

    # Task name handed to `pipeline` when no pipe is supplied. Replaced on the
    # instance by the model's name when the user passes in their own pipe.
    name = 'fill-mask'
    # Placeholder inserted in place of the word to be filled.
    MASK = '<mask>'
    
    def __init__(self, n=1, max_n=None, pipe=None):
        """
        Parameters
        ----------
        n: int
            Default number of variations to return per piece of input text.
            Can be overridden in __call__.
        max_n: int or None
            Number of candidates the pipeline is asked to produce (stored as
            the pipeline's `topk` attribute). This must be >=n. When using
            strategy='random', you probably want this to be strictly >n.
            Defaults to n+2.
        pipe: transformers FillMaskPipeline
            We let users pass in an existing pipeline since instantiation can
            be slow. If None, a default 'fill-mask' pipeline is created,
            placed on GPU 0 when cuda is available and on the CPU otherwise.
        """
        if pipe:
            self.pipe = pipe
            self.name = pipe.model.config._name_or_path
        else:
            self.pipe = pipeline(self.name, 
                                 device=0 if torch.cuda.is_available()
                                 else -1)
        # Set n before max_n: the max_n setter validates against self.n.
        self.n = n
        self.max_n = max_n or n+2
        
        # Compare by class name so we don't need to import the pipeline class.
        assert type(self.pipe).__name__ == 'FillMaskPipeline'
        if 'cuda' not in str(self.pipe.device) and torch.cuda.is_available():
            warnings.warn('The pipeline passed in is not using cuda. '
                          'Did you mean to use the available GPU?')
    
    def _preprocess(self, text, min_keep=3, errors='raise'):
        """Randomly mask one word from an input piece of text to prepare it 
        for RoBERTa to fill. Notice that even if the user chooses to mask
        multiple words, each call to `_preprocess` only masks one since the
        model can only fill one at a time.
        
        Parameters
        ----------
        text: str or Iterable[str]
            One or more pieces of text to process.
        min_keep: int
            Minimum number of words that must remain unmasked. Text with
            fewer than min_keep + 1 space-separated words violates this.
        errors: str
            If 'warn', we show a warning when min_keep is violated but allow
            masking to take place. Any other value will result in an error
            being raised.

        Returns
        -------
        str or list[str]: the input with exactly one word per piece of text
        replaced by self.MASK.

        Raises
        ------
        ValueError: if the text is too short (and errors != 'warn') or if it
        contains no words at all.
        """
        if listlike(text):
            return [self._preprocess(row, min_keep, errors) for row in text]
        
        tokens = text.split()
        if len(tokens) < min_keep + 1:
            msg = (f'Text "{text[:25]}..." is too short to mask while '
                   f'enforcing min_keep={min_keep}.')
            # Err on side of caution: typos raise error too.
            if errors == 'warn':
                warnings.warn(msg)
            else:
                raise ValueError(msg)
        if not tokens:
            # Even in 'warn' mode there must be at least one word to replace.
            raise ValueError('Cannot mask text that contains no words.')
        
        idx = np.random.choice(range(len(tokens)))
        return ' '.join(self.MASK if i == idx else t 
                        for i, t in enumerate(tokens))
    
    @add_docstring(PreTrainedModel.generate)
    def __call__(self, text, n=None, flat=True, n_mask=1, min_keep=3, 
                 return_all=False, errors='raise', strategy='best', **kwargs):
        """Mask and refill words to create variations of the input text.

        Parameters
        ----------
        text: str or Iterable[str]
        n: int or None
            Number of variations to return per piece of input text. Defaults
            to self.n if None. If -1, return all generated examples for the
            given mask count.
            This can become very large when n_mask is large.
            Example: if self.max_n=3, n=-1, and n_mask=4, we first mask once 
            and generate 3 samples. Then we mask each of those 3 and generate
            a total of 9 samples, then 27, then finally 81 which is what will 
            be returned. The intermediate samples can be returned by
            specifying `return_all=True`. 
        flat: bool
            If True, return flat list of strings. If False, return list of
            nested lists where list i contains n augmented versions of input 
            i.
        n_mask: int
            Number of words to mask. Because the model can only fill 1 masked
            word at a time, `n_mask` forward passes will be performed.
        min_keep: int
            Minimum number of words to keep (presumably, you wouldn't want to
            mask every word in an input sentence since that would strip it of
            all existing meaning). This can be strictly enforced or not,
            depending on your choice of `errors`.
        return_all: bool
            If True, return all intermediate generated samples rather than 
            just the final samples (e.g. if n_mask is 3, we first have to 
            generate samples with 1 masked word, then samples with 2 masked 
            words, and finally samples with 3 masked words. This is because
            RoBERTa can only fill one masked word at a time.) The original
            text is included as the first group. See the explanation of the
            `n` parameter for an example.
        errors: str
            One of ('warn', 'raise'). 'raise' will raise a ValueError if we're 
            about to violate `min_keep`. 'warn' will only show a warning if 
            this happens but will not strictly prevent it from occurring.
        strategy: str
            One of ('random', 'best'). The model will generate self.max_n
            samples and if n < self.max_n, this means we need some way of 
            selecting which samples to keep. 'random' selects randomly without
            replacement, while 'best' keeps the first n candidates in the
            order the pipeline returned them.
            Note: when n_mask > 1, you should probably use strategy='random'
            if you want relatively diverse results. If 'best', the benefit of
            additional iterations is diminished because we are likely to end
            up with very similar (or even identical) results.
        kwargs: any
            Accepted for interface consistency with the other transforms.
            They are passed along when recursing over list input but are not
            given to the fill-mask pipeline itself.
            
        Returns
        -------
        list: either a list with n strings per input string, or a list of
        lists, each of length n, if flat=False. This is slightly different if
        return_all=True (see its description for details).
        """
        # Make sure we generate adequate number of sequences. Model topk must
        # be >= our desired n.
        n = n or self.n
        if n > self.max_n:
            self.max_n = n
            
        # Each item will be a list of strings. Each string in res[i]
        # will have i words changed. If text is a sequence of strings, we must
        # handle each one separately because each is passed through pipeline
        # repeatedly.
        if listlike(text):
            res = [self(row, n=n, flat=flat, n_mask=n_mask, min_keep=min_keep,
                        return_all=return_all, errors=errors, 
                        strategy=strategy, **kwargs) for row in text]
            return flatten(res) if flat else res

        res = [[text]]
        for i in range(n_mask):
            seqs = self.pipe(self._preprocess(res[-1], min_keep=min_keep,
                                              errors=errors))
            # Transformers returns either list of dicts or list of list of 
            # dicts depending on whether input list has 1 item or multiple.
            if isinstance(seqs[0], list): 
                seqs = [seq for group in seqs for seq in group]
            # Strip the sentence boundary tokens from the filled sequences.
            text = [seq['sequence'].replace('<s>', '').replace('</s>', '') 
                    for seq in seqs]
            
            # Keep all generated samples when n is -1.
            if n != -1:
                if strategy == 'random':
                    # np.random.choice returns an ndarray of numpy strings.
                    # Convert back to a plain list so every group in `res`
                    # has the same type and flat=False returns real lists.
                    text = np.random.choice(text, n, replace=False).tolist()
                elif strategy == 'best':
                    # NOTE(review): once more than one sequence is fed to the
                    # pipeline (n_mask > 1), this keeps candidates from the
                    # first sequence(s) only rather than ranking all of them.
                    text = text[:n]
                else:
                    raise ValueError('strategy should be "random" or "best".')
            res.append(text)
        if not return_all: res = res[n_mask]
        return flatten(res) if flat else res
    
    @property
    def max_n(self):
        """Number of candidates requested from the pipeline (its `topk`)."""
        return self.pipe.topk
    
    @max_n.setter
    def max_n(self, max_n):
        """Need to ensure the model generates enough options to return the
        desired number of samples.

        Raises
        ------
        TypeError: if max_n is not an int.
        ValueError: if max_n is smaller than self.n.
        """
        if not isinstance(max_n, int):
            raise TypeError('max_n must be an integer.')
        if max_n < self.n:
            raise ValueError(f'max_n must be >= self.n (currently {self.n}).')
        self.pipe.topk = max_n

In [5]:
# Return 4 variations per input by default while asking the pipeline for 8
# candidates (max_n is stored as the pipeline's topk).
fm_tfm = FillMaskTransform(n=4, max_n=8)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# All defaults: one masked word per row, self.n (4) variations each,
# strategy='best', flattened into a single list.
fm_tfm(texts)

['Play fun games online for free! Watch your favorite movies and TV shows here.',
 'Play fun games online for free! Watch your favorite movies and tv shows here.',
 'Play fun games online for free! Watch your favorite movies and television shows here.',
 'Play fun games online for free! Watch your favorite movies and Broadway shows here.',
 'Bill hated school, especially math. His teacher was losing patience with him.',
 'Bill hates school, especially math. His teacher was losing patience with him.',
 'Bill loved school, especially math. His teacher was losing patience with him.',
 'Bill loves school, especially math. His teacher was losing patience with him.']

In [60]:
# flat=False keeps one nested list of variations per input row.
fm_tfm(texts, flat=False)

[['Play fun games online for free! Watch your favorite movies and tv shows here.',
  'Play fun games online for free! Watch your favourite movies and tv shows here.',
  'Play fun games online for free! Watch your own movies and tv shows here.',
  'Play fun games online for free! Watch your favorites movies and tv shows here.'],
 ['Bill hated school, especially math. His teacher was losing patience with him.',
  'Bill hated school, especially math. His teacher was having patience with him.',
  'Bill hated school, especially math. His teacher was no patience with him.',
  'Bill hated school, especially math. His teacher was lacking patience with him.']]

In [61]:
# strategy='best' keeps the first 2 candidates in the order the pipeline
# returned them.
fm_tfm(text, n=2, strategy='best')

['It was a beautiful sunny day and birds were chirping.',
 'It was a beautiful sunny day and birds kept chirping.']

In [62]:
# strategy='random' samples 2 of the generated candidates without replacement.
fm_tfm(text, n=2, strategy='random')

['It was a beautiful sunny day and they were chirping.',
 'It was a beautiful sunny day and monkeys were chirping.']

In [63]:
# Two masking rounds. return_all=True with flat=False exposes the original
# text followed by the samples kept after each round.
fm_tfm(text, n_mask=2, return_all=True, flat=False)

[['It was a beautiful sunny day and birds were chirping.'],
 ['It was a beautiful sunny day and birds were chirping.',
  "It's a beautiful sunny day and birds were chirping.",
  'It is a beautiful sunny day and birds were chirping.',
  'It seemed a beautiful sunny day and birds were chirping.'],
 ['It was a beautiful sunny day and birds were chirping.',
  'It was a beautiful sunny day and birds kept chirping.',
  'It was a beautiful sunny day and birds started chirping.',
  'It was a beautiful sunny day and birds are chirping.']]

Notice how quickly samples pile up when using n=-1, n_mask>1, and return_all=True.

In [64]:
# n=-1 skips the selection step, so every candidate from every round is kept.
fm_tfm(text, n=-1, n_mask=2, return_all=True)

['It was a beautiful sunny day and birds were chirping.',
 'It was a beautiful sunny day and birds were chirping.',
 'This was a beautiful sunny day and birds were chirping.',
 'Today was a beautiful sunny day and birds were chirping.',
 'Yesterday was a beautiful sunny day and birds were chirping.',
 'Sunday was a beautiful sunny day and birds were chirping.',
 ' It was a beautiful sunny day and birds were chirping.',
 ' it was a beautiful sunny day and birds were chirping.',
 'Saturday was a beautiful sunny day and birds were chirping.',
 'It was a beautiful sunny day and birds were chirping.',
 'It was a beautiful sunny day and we were chirping.',
 'It was a beautiful sunny day and kids were chirping.',
 'It was a beautiful sunny day and frogs were chirping.',
 'It was a beautiful sunny day and chickens were chirping.',
 'It was a beautiful sunny day and they were chirping.',
 'It was a beautiful sunny day and children were chirping.',
 'It was a beautiful sunny day and monkeys were

In [65]:
# List input is processed one row at a time, so the un-flattened result is
# nested as [row][round][sample].
fm_tfm(texts, n=2, n_mask=2, return_all=True, flat=False)

[[['Play fun games online for free! Watch your favorite movies and tv shows here.'],
  ['Play fun games online for free! Watch your favorite movies and tv shows here.',
   'Play fun games online for free! Watch your favourite movies and tv shows here.'],
  ['Play fun games online for free! Watch your favorite movies and TV shows here.',
   'Play fun games online for free! Watch your favorite movies and tv shows here.']],
 [['Bill hated school, especially math. His teacher was losing patience with him.'],
  ['Bill hated school, especially math. His teacher was losing patience with him.',
   'Bill hated school, especially math. His teacher was losing patience for him.'],
  ['Bill hated school, especially math. His teacher was losing patience with him.',
   'Bill hated school, especially math. His teacher was having patience with him.']]]

We also provide a convenience function that allows us to easily generate new samples from a source dataframe or csv while preserving any desired metadata so we can map each generated row to the correct label, ID, raw text, etc.

In [12]:
# export
# Registry mapping a short transform name to the class that implements it.
NLP_TRANSFORMS = dict(
    fillmask=FillMaskTransform,
    paraphrase=ParaphraseTransform,
    generative=GenerativeTransform,
)

In [10]:
# export
class BacktranslateTransform:
    """Augment/perturb text inputs by translating them to a different language
    and then back to English (this process can be repeated as many times as 
    you want by specifying multiple target languages). As of 2/10/21, this is
    excluded from incendio's NLP_TRANSFORMS variable since its interface is a
    little different from the other transforms: it has 2 pipelines, not 1, so
    has variables `names` and `pipes` (both lists) instead of `name` and 
    `pipe`. It also has no `_preprocess` method.
    """

    # Model names for the two pipelines: English -> Romance languages first,
    # Romance languages -> English second.
    names = ['Helsinki-NLP/opus-mt-en-ROMANCE',
             'Helsinki-NLP/opus-mt-ROMANCE-en']

    # Accepted target language codes mapped to a human readable label. Only
    # the keys are used for validation; labels are informational.
    language_codes = {
        'es': 'spanish',
        'fr': 'french',
        'it': 'italian',
        'pt': 'portuguese',
        'pt_br': 'portuguese (brazil)',
        'ro': 'romanian',
        'ca': 'catalan',
        'gl': 'galician',
        'pt_BR': 'portuguese (brazil?)',
        'la': 'latin',
        'wa': 'walloon',
        'fur': 'friulian (?)',
        'oc': 'occitan',
        'fr_CA': 'french (canada)',
        'sc': 'sardinian',
        'es_ES': 'spanish',
        'es_MX': 'spanish (mexico)',
        'es_AR': 'spanish (argentina)',
        'es_PR': 'spanish (puerto rico)',
        'es_UY': 'spanish (uruguay)',
        'es_CL': 'spanish (chile)',
        'es_CO': 'spanish (colombia)',
        'es_CR': 'spanish (costa rica)',
        'es_GT': 'spanish (guatemala)',
        'es_HN': 'spanish (honduras)',
        'es_NI': 'spanish (nicaragua)',
        'es_PA': 'spanish (panama)',
        'es_PE': 'spanish (peru)',
        'es_VE': 'spanish (venezuela)',
        'es_DO': 'spanish (dominican republic)',
        'es_EC': 'spanish (ecuador)',
        'es_SV': 'spanish (el salvador)',
        'an': 'aragonese',
        'pt_PT': 'portuguese (portugal)',
        'frp': 'franco provencal',
        'lad': 'ladino',
        'vec': 'venetian',
        'fr_FR': 'french (france)',
        'co': 'corsican',
        'it_IT': 'italian (italy)',
        'lld': 'ladin',
        'lij': 'ligurian',
        'lmo': 'lombard',
        'nap': 'neapolitan',
        'rm': 'rhaetian (?)',
        'scn': 'sicilian',
        'mwl': 'mirandese'
    }

    def __init__(self, to_langs, pipes=()):
        """ 
        Parameters
        ----------
        to_langs: Iterable[str]
            One or more language codes to use for backtranslation (see 
            self.language_codes for all options). They will be applied in
            order: for instance, passing in ['es', 'fr'] will translate input
            from 
            english -> spanish -> english -> french -> english. You can 
            override these later in specific calls but the value(s) you 
            provide here will be defaults.
        pipes: Iterable[Pipeline]
            Huggingface pipelines, the first of which translates English to
            Romance languages and the second of which does the reverse. It's
            usually easiest to let the Transform create these for you, but if
            you already have them passing them in will be faster.
        """
        if not pipes:
            # Same device convention as the other transforms in this module:
            # GPU 0 when cuda is available, otherwise -1 (CPU).
            device = 0 if torch.cuda.is_available() else -1
            # Class attributes are not visible as bare names inside a method,
            # so the model names must be looked up through `self`.
            pipes = [TranslationPipeline(
                        model=AutoModelForSeq2SeqLM.from_pretrained(name),
                        tokenizer=AutoTokenizer.from_pretrained(name),
                        device=device
                     ) for name in self.names]
        self.pipes = pipes
        self.to_langs = tolist(to_langs)

    def __call__(self, text, intermediate=False, flat=True, to_langs=(), 
                 **kwargs):
        """
        Parameters
        ----------
        text: str or Iterable[str]
            The input pieces of text to translate.
        intermediate: bool
            If True, return all intermediate backtranslations if more than one
            target language is provided. Otherwise, only return the final
            backtranslation.
        flat: bool
            If True, return a flat list of strings. If False, return a list of
            lists where item i contains all the intermediate backtranslations
            (n languages in `to_langs` will generate n backtranslations). Note
            that if intermediate=False, results will always be flat.
        to_langs: Iterable[str]
            One or more language codes to use for backtranslation. They will 
            be applied in order: for instance, passing in ['es', 'fr'] will
            translate input from 
            english -> spanish -> english -> french -> english. If not
            specified, this defaults to self.to_langs.
        kwargs: any
            Ignored. Just provided for consistency with other transforms.
            
        Returns
        -------
        list[str] or list[list[str]]: Default is list of strings where item
        i of output corresponds to item i of input. If intermediate=True and
        flat=False, we get a list of lists where each nested list contains
        n backtranslations of input i.
        """
        text = tolist(text)
        to_langs = tolist(to_langs) or self.to_langs
        assert not set(to_langs) - set(self.language_codes), \
            'to_langs codes should all be present in self.language_codes.'

        # steps[j][i] is input i after the first j+1 round trips.
        steps = []
        for lang in to_langs:
            # The target language is selected with a `>>code<<` prefix.
            text = [f'>>{lang}<< {t}' for t in text]
            text = [row['translation_text'] for row in self.pipes[0](text)]
            text = [row['translation_text'] for row in self.pipes[1](text)]
            steps.append(text)
        if not intermediate:
            return text

        # Transpose so that each group holds every backtranslation of a
        # single input, in the order the languages were applied.
        grouped = [list(group) for group in zip(*steps)]
        if flat:
            return [t for group in grouped for t in group]
        return grouped

    def __repr__(self):
        lang_str = ", ".join(repr(lang) for lang in self.to_langs)
        return f'{func_name(self)}(to_langs=[{lang_str}])'

In [17]:
# export
@immutify_defaults
def augment_text_df(source, transform='fillmask', dest=None, n=5, 
                    text_col='text', id_cols=(), nrows=None, tfm_kwargs={},
                    call_kwargs={}):
    """Create augmented versions of a dataframe of text, optionally preserving
    other columns for identification purposes. We recommend precomputing and
    saving variations of your data rather than doing this on the fly in a 
    torch dataset since they can be rather space- and time-intensive. 
    Augmented versions of an input row should generally be kept in the same 
    training split: in order to keep the label the same, we usually want to 
    make relatively limited changes to the raw text (just enough to provide a
    regularizing effect).
    
    Parameters
    ----------
    source: str, Path, or pd.DataFrame
        If str or Path, this is a csv containing our text data. Alternatively,
        you can pass in a dataframe itself.
    transform: str or callable
        If str, this must be one of the keys in `NLP_TRANSFORMS` from this 
        same module - this will be used to create a new transform object.
        Alternatively, you can pass in a previously created object (NOT the
        class). The default is the mask filling transform as it's relatively
        quick and effective. 'paraphrase' may give better (but slower) 
        results. Anecdotally, 'generative' seems to provide lower quality
        results, but perhaps by experimenting with hyperparameters it could
        be more useful.
    dest: str, Path, or None
        If str or Path, this is where the output file will be saved to 
        (directories will be created as needed). If None, nothing will be 
        saved and the function will merely return the output DF for you to do 
        with as you wish.
    n: int
        Number of samples to generate for each raw row. This is only passed
        to the transform's constructor, so it has no effect when `transform`
        is an already-created object rather than a string.
    text_col: str
        Name of column in DF containing the text to augment.
    id_cols: str or Iterable[str]
        Columns containing identifying information such as labels, row_ids, 
        etc. These also help us map the augmented text rows to their
        corresponding raw rows. A single column name may be passed as a
        plain string.
    nrows: int or None
        Max number of rows from the source DF to generate text for. Useful for
        testing (equivalently, you could pass in df.head(nrows) and leave this 
        as None).
    tfm_kwargs: dict
        Arguments to pass to `transform`'s constructor. These are ignored when
        passing in a transform object rather than a string.
    call_kwargs: dict
        Arguments to pass to the __call__ method of `transform` to affect
        the augmentation process. `flat=True` is always passed and overrides
        any `flat` value provided here.
        
    Returns
    -------
    pd.DataFrame: DF of generated text with columns `id_cols` followed by
    `text_col`. With a string `transform` and the default `n`, this will have
    5x the rows of your source DF; this can be adjusted through the `n`
    parameter.

    Raises
    ------
    TypeError: if `source` is not a str, Path, or pd.DataFrame.
    ValueError: if `dest` is neither a str/Path nor None, or if `id_cols` are
        requested and the number of generated rows is not an exact multiple
        of the number of source rows (in which case identifiers could not be
        aligned with the generated text).
    """
    # A bare string would otherwise be split into characters by list().
    id_cols = [id_cols] if isinstance(id_cols, str) else list(id_cols)

    # Load data.
    if isinstance(source, (str, Path)):
        df = pd.read_csv(Path(source), usecols=[text_col] + id_cols, 
                         nrows=nrows)
    elif isinstance(source, pd.DataFrame):
        # head(None) returns every row, so nrows=None means "use everything".
        df = source.head(nrows)
    else:
        raise TypeError('`source` must be a str/Path or pd.DataFrame.')
        
    # Prepare for output file if necessary.
    if isinstance(dest, (str, Path)):
        dest = Path(dest)
        os.makedirs(dest.parent, exist_ok=True)
    elif dest is not None:
        raise ValueError('`dest` must be a str/Path containing the output '
                         'file name to create, or None if you just want to '
                         'return a df.')

    # For simplicity, we stick to one transform at a time. Slow to load so at
    # least for now, let user pass in the transform itself.
    if isinstance(transform, str):
        transform = NLP_TRANSFORMS[transform](n=n, **tfm_kwargs)
    
    # Generate new variations of input text. `flat` is forced to True so the
    # result is a single list of strings that maps directly onto one column.
    res = transform(df[text_col].tolist(), **{**call_kwargs, 'flat': True})
    res = pd.DataFrame(res, columns=[text_col])
    
    # Attach identifier columns to output (e.g. we usually want to store 
    # labels and or sample IDs. Most of our augmentation methods make 
    # relatively minor changes to the input so all variations of 1 input 
    # should remain in the same set, usually training.).
    if id_cols:
        n_in, n_out = df.shape[0], res.shape[0]
        # Work out how many generated rows belong to each source row. An
        # empty source is handled explicitly to avoid dividing by zero.
        if n_in:
            reps, leftover = divmod(n_out, n_in)
        else:
            reps, leftover = 0, n_out
        if leftover:
            raise ValueError(
                f'Transform produced {n_out} rows from {n_in} source rows, '
                'which is not an exact multiple, so id_cols cannot be '
                'aligned with the generated text.'
            )
        # Positional repetition keeps this correct regardless of what index
        # the source dataframe happens to have.
        positions = np.repeat(np.arange(n_in), reps)
        df_id = df[id_cols].iloc[positions].reset_index(drop=True)
        res = pd.concat([df_id, res], axis=1)
        
    # Optionally save output.
    if dest is not None:
        res.to_csv(dest, index=False)
    return res