# Dataset Generation and Augmentation for NLP

We now have a number of ways in NLP to generate datasets from scratch and to augment datasets.

## OpenAI GPT3

We can generate a dataset using OpenAI's GPT3. Register for an account here:

https://beta.openai.com/playground

Once you have your API key, you can set the value below.

In [None]:
api_key = '<your API key goes here>'

In [None]:
import json
import random
import re
from itertools import chain

import requests

In [None]:
random.seed(0)

### Create the samples, in this case positive or negative.

In [None]:
class DaVinci:
  """Minimal client for the OpenAI text-completions endpoint.

  The constructor reads the module-level ``api_key`` variable, so that
  variable must be assigned before an instance is created.
  """

  def __init__(self, timeout=60):
    """Store the endpoint URL, the request headers and the request timeout.

    Args:
      timeout: Seconds to wait for the API before giving up; handed to
        ``requests.post`` unchanged.
    """
    self.url = 'https://api.openai.com/v1/completions'
    self.headers = {'Content-Type': 'application/json',
                    'Authorization': 'Bearer ' + api_key}
    self.timeout = timeout

  def generate(self, prompt, **kwargs):
    """Request completions for ``prompt`` and return their non-empty lines.

    Args:
      prompt: Text sent as the ``prompt`` field of the request.
      **kwargs: Extra or overriding fields for the request payload
        (for example ``n=9`` to request nine completions).

    Returns:
      A flat list of the non-empty lines of every returned choice, in the
      order the choices were returned.

    Raises:
      requests.HTTPError: If the API answers with an error status.
    """
    # Default payload; any keyword argument replaces or adds a field.
    data = {
      "model": "text-davinci-002",
      "prompt": prompt,
      "max_tokens": 256,
      "temperature": 0.7,
      "top_p": 1,
      "n": 1,
      "stream": False,
      "logprobs": None,
      "frequency_penalty": 0,
      "presence_penalty": 0.5
    }
    data.update(kwargs)
    response = requests.post(self.url, headers=self.headers,
                             data=json.dumps(data), timeout=self.timeout)
    # Fail here with the HTTP status instead of with an opaque
    # KeyError on 'choices' when the API returns an error body.
    response.raise_for_status()
    texts = [t['text'].split('\n') for t in response.json()['choices']]
    return [t for t in chain.from_iterable(texts) if t]

In [None]:
davinci = DaVinci()

In [None]:
absence_prompt = '''> Radiologists have a variety of ways of expressing the *presence* of renal masses. Here are 3 examples:

1. A 1.2 cm mass exists on the left kidney.
2. There is a 1.2 cm right renal lesion.
3. A Bosniak cyst appears on the left.

> Radiologists have a variety of ways of expressing the *absence* of renal masses. Here are 10 examples:'''

In [None]:
absence_samples = davinci.generate(absence_prompt, n=9)
absence_samples[:10]

['1. No renal mass is identified.',
 '2. There is no evidence of a renal mass.',
 '3. The kidneys are normal in size and shape without focal lesions.',
 '4. The right kidney measures 10 cm in length and the left kidney measures 9.5 cm in length. There are no masses.',
 '5. The kidneys are unremarkable.',
 '6. The study does not reveal any lesions in the kidneys.',
 '7. There are no lesions in either kidney.',
 '8. Both kidneys are free of lesions.',
 '9. Neither kidney has any lesions.',
 '10. The kidneys are clear.']

In [None]:
# Remove initial enumerations and duplicates, and sort.
# NOTE(review): lines that never had an "N." prefix pass through the regex
# unchanged, so any non-sample text returned by the model is kept as an
# "absence" sample (the recorded output below starts with a
# "> Radiologists often include ..." prompt-style line). Consider dropping
# such lines before training.
absence_samples = [re.sub(r'^\d+\.\s*', '', s) for s in absence_samples]
absence_samples = sorted(set(s for s in absence_samples if s.strip()))
absence_samples[:10]

['> Radiologists often include information about whether a renal mass is solid, cystic, or both solid and cystic. Here are 3 examples:',
 'Both kidneys are free of lesions.',
 'Both kidneys are free of mass lesions.',
 'Mass is not seen.',
 'Masses are not seen in either kidney.',
 'Negative for mass.',
 'Neither kidney has any lesions.',
 'No abnormality is seen.',
 'No evidence of mass.',
 'No evidence of renal mass.']

In [None]:
presence_prompt = '''> Radiologists have a variety of ways of expressing the *absence* of renal masses. Here are 3 examples:

1. No renal masses.
2. Negative study.
3. No evidence of any kidney masses.

> Radiologists have a variety of ways of expressing the *presence* of renal masses{include_measurements}. Here are 10 examples:'''

In [None]:
presence_samples = davinci.generate(presence_prompt.format(include_measurements=', which often includes a measurement'), n=8)
presence_samples += davinci.generate(presence_prompt.format(include_measurements=''), n=1)
presence_samples[:10]

['1. Renal mass measuring 3.2 x 2.6 cm.',
 '2. 3.2 x 2.6 cm renal mass.',
 '3. Right renal mass measuring 3.2 x 2.6 cm.',
 '4. Left renal mass, 3.2 x 2.6 cm.',
 '5. Bilobar renal mass, maximum dimension 6.1 cm.',
 '6. 3.2 cm renal mass in the left kidney.',
 '7. Complex cystic renal mass, 6.1 x 4.5 x 4.2 cm.',
 '8. Renal mass with hemorrhage, 5.0 x 4.6 x 5.1 cm.',
 '9. Renal mass, 3.2 cm, with central necrosis.',
 '10. Renal mass with associated calcification, 3.8 x 3.2 cm.']

In [None]:
# Remove initial enumerations and duplicates, and sort.
# NOTE(review): lines that never had an "N." prefix pass through the regex
# unchanged, so any non-sample text returned by the model is kept as a
# "presence" sample (the recorded output below starts with '## Dictionaries').
# Consider dropping such lines before training.
presence_samples = [re.sub(r'^\d+\.\s*', '', s) for s in presence_samples]
presence_samples = sorted(set(s for s in presence_samples if s.strip()))
presence_samples[:10]

['## Dictionaries',
 '1 cm left renal mass.',
 '10 cm renal mass in the left kidney',
 '11 cm renal mass in the right kidney',
 '3 cm mass in right kidney.',
 '3 cm mass in the left kidney',
 '3 cm mass in the right kidney.',
 '3 cm renal mass.',
 '3 cm right renal mass.',
 '3.2 cm renal mass in the left kidney.']

### Now test these 100% generated samples with a logistic regression model

In [None]:
!pip install -q embeddings

In [None]:
import re

import numpy as np

from embeddings import GloveEmbedding
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
class Embedder:
    """Turn sentences into fixed-size vectors by pooling GloVe token embeddings."""

    # Width of the GloVe vectors; also the width of the zero fallbacks in ``embed``.
    d_emb = 50
    # Created once, when the class body runs, and shared by every instance.
    embeddings = GloveEmbedding('wikipedia_gigaword', d_emb=d_emb, show_progress=True, default='zero')

    def embed(self, sentences, aggregator='max'):
        """Embed each sentence by aggregating the embeddings of its tokens.

        Sentences are lower-cased and split into ``\\w+`` tokens. The token
        vectors are stacked and reduced over the token axis with the ndarray
        method named by ``aggregator``.

        Args:
            sentences: Iterable of strings.
            aggregator: Name of a numpy ndarray reduction method accepting
                ``axis`` (e.g. ``'max'``, ``'mean'``, ``'sum'``).

        Returns:
            A 2-D array with one row per sentence and ``d_emb`` columns. A
            sentence without any token yields a zero row, and an empty
            ``sentences`` yields an array of shape ``(0, d_emb)``.
        """
        sentence_embeddings = []
        for sentence in sentences:
            tokens = re.findall(r'\w+', sentence.lower())
            if not tokens:
                # Reducing a zero-size array would raise (or give a scalar that
                # cannot be stacked), so fall back to an all-zero vector.
                sentence_embeddings.append(np.zeros(self.d_emb))
                continue
            token_embeddings = np.array([self.embeddings.emb(t, default='zero') for t in tokens])
            sentence_embeddings.append(getattr(token_embeddings, aggregator)(axis=0))
        if not sentence_embeddings:
            # np.stack refuses an empty list; keep the 2-D contract instead.
            return np.empty((0, self.d_emb))
        return np.stack(sentence_embeddings)

In [None]:
train_X = absence_samples + presence_samples
train_y = [0] * len(absence_samples) + [1] * len(presence_samples)
len(train_X), len(train_y)

(175, 175)

In [None]:
eval_X = [
    'There is a left renal cyst, probably a Bosniak 2, measuring 3.1 cm.',
    'Both kidneys are free of cysts.',
    'There are bilateral renal cysts, the largest of which is 1.1 cm.',
    'The kidneys are normal.',
    'A 1.7 x 1.9 x 3.2 cm mass is seen on the right kidney.',
    'Negative for renal cysts.',
    'Multiple bilateral tiny renal cysts.',
    'Kidneys contain no masses.',
    'A previously seen 8.3 cm left renal cyst is demonstrated.',
    'Kidneys: healthy.',
    'Too small to characterize left renal cyst.',
    'The kidneys are without abnormality.',
]
eval_y = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

In [None]:
embedder = Embedder()
vectorized_X = embedder.embed(train_X, aggregator='max')

clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(vectorized_X, train_y)

LogisticRegression(penalty='l1', random_state=0, solver='liblinear')

In [None]:
vectorized_X_eval = embedder.embed(eval_X, aggregator='max')
predictions = clf.predict(vectorized_X_eval)
predictions

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])

In [None]:
print(classification_report(eval_y, predictions))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      0.83      0.91         6

    accuracy                           0.92        12
   macro avg       0.93      0.92      0.92        12
weighted avg       0.93      0.92      0.92        12



## Data Augmentation with Synonyms

There are a variety of ways to augment NLP data, though NLP data augmentation has historically been practiced less than the augmentation of computer vision data. Some of you may have experience with torchvision transforms, PIL transformations, or OpenCV transformations. We use these packages to augment datasets for computer vision tasks (such as reading radiology scans) with operations such as cropping, jittering, flipping, resizing, etc. Such augmentation can increase the size of our dataset substantially.

NLP data augmentation is similar. We can add or remove words that probably do not affect the meaning of the sentence, replace words in the sentence with synonyms, try to negate the sentence to get its opposite meaning, etc.



In [None]:
!pip install -q nlpaug

In [None]:
from random import choice

from nlpaug.augmenter.word import SynonymAug

In [None]:
mass_words = ['cyst', 'nodule', 'lesion', 'mass', 'aml', 'angiomyolipoma', 'Bosniak']
stopwords = ['the', 'a', 'an', 'to', 'cm', 'there', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', 'is', 'are', 'has', 'have'] + mass_words

In [None]:
augmenter = SynonymAug(stopwords=stopwords)

In [None]:
augmenter.augment(['There is a 2.4 cm renal mass on the right.', 'A 1.2 cm cyst exists on the left.'])

['On that point is a 2. 4 cm nephritic mass on the rightfulness.',
 'A 1. 2 cm cyst exists on the left hand.']

In [None]:
def randomly_replace_mass_terms(samples, terms=None):
    """
    Replace the mass terms of each sample, in place, with a random mass term.

    We want to do this in a more controlled fashion than the synonym
    augmenter: only whole words from ``terms`` are touched, and every
    occurrence within one sample is replaced by the same randomly chosen
    term. Matching ignores case.

    samples: mutable list of str; modified in place.
    terms: sequence of str to look for and to pick replacements from.
        Defaults to the module-level ``mass_words``.

    Returns None.
    """
    terms = list(mass_words if terms is None else terms)
    if not terms:
        return
    # The terms must be joined as an alternation; joining them with '' builds
    # one long literal that never matches. Word boundaries keep words that
    # merely contain a term (e.g. "cystic" or "masses") untouched.
    alternation = '|'.join(re.escape(term) for term in terms)
    mass_terms = re.compile(r'\b(?:' + alternation + r')\b', flags=re.I)
    for i, sentence in enumerate(samples):
        replacement = choice(terms)
        # A callable replacement is inserted literally, so backslashes in a
        # term are never interpreted as group references.
        samples[i] = mass_terms.sub(lambda _match: replacement, sentence)

In [None]:
augmenting_absence_samples = augmenter.augment(absence_samples)
randomly_replace_mass_terms(augmenting_absence_samples)

augmenting_presence_samples = augmenter.augment(presence_samples)
randomly_replace_mass_terms(augmenting_presence_samples)

In [None]:
augmented_absence_samples = absence_samples + augmenting_absence_samples
augmented_presence_samples = presence_samples + augmenting_presence_samples
len(augmented_absence_samples), len(augmented_presence_samples)

(148, 202)

In [None]:
augmented_absence_samples[-20:]

['The mass is cystic.',
 'The mass is satisfying.',
 'The good kidney measure 10 cm in length and the left kidney measure 9. 5 cm in distance. There are no multitude.',
 'The written report does non certify a mass.',
 'The study get along non bring out a mass.',
 'The cogitation does not discover any lesion in the kidney.',
 'The report exercise non reveal any mass.',
 'The written report does not expose any nephritic mass.',
 'The bailiwick is minus for a mass.',
 'There are no lesion in either kidney.',
 'Thither are no wound on the kidney.',
 'In that location are no nephritic abnormalities.',
 'There are no nephritic plenty.',
 'Thither is no grounds of a mass.',
 'In that location is no grounds of a nephritic mass.',
 'At that place is no evidence of nephritic mass.',
 'In that respect is no mass in either kidney.',
 'Thither is no mass.',
 'In that respect is no nephritic mass identify.',
 'Thither were no renal lesion.']

In [None]:
augmented_presence_samples[-20:]

['Nephritic mass, 3 cm.',
 'Nephritic mass, 3. 2 cm, with cardinal gangrene.',
 'Right broken pole mass.',
 'Correct nephritic mass mensurate 3. 2 cm.',
 'Right hand nephritic mass quantify 3. 2 10 2. 6 cm.',
 'Proper nephritic mass valuate 4. 5 cm.',
 'Good nephritic mass, 3 cm.',
 'Correct nephritic mass, 3. 0 cm.',
 'The remaining kidney hold a 3 cm mass.',
 'The mass in the veracious kidney measure 6. 5 10 5. 2 cm.',
 'The mass is site in the lower pole of the right kidney.',
 'The mass valuate 3 cm and is in the left hand kidney.',
 'The nephritic mass mensurate 8. 0 cm.',
 'Thither is a 3 cm mass in the unexpended kidney.',
 'In that respect is a 4. 3 cm mass in the odd kidney.',
 'There is a mass in the left hand kidney that measure 7. 0 10 6. 0 cm.',
 'On that point is a mass in the right hand kidney measure out 6. 0 cm in its longest proportion.',
 'Ii modest left renal masses.',
 'multiple low renal cysts',
 'nephritic mass appraise 3. 2 ten 2. 1 10 1. 9 cm in the right kidne

### Now test these augmented samples with a logistic regression model

In [None]:
train_X = augmented_absence_samples + augmented_presence_samples
train_y = [0] * len(augmented_absence_samples) + [1] * len(augmented_presence_samples)
len(train_X), len(train_y)

(350, 350)

In [None]:
vectorized_X = embedder.embed(train_X, aggregator='max')

clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(vectorized_X, train_y)

LogisticRegression(penalty='l1', random_state=0, solver='liblinear')

In [None]:
vectorized_X_eval = embedder.embed(eval_X, aggregator='max')
predictions = clf.predict(vectorized_X_eval)
predictions

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0])

In [None]:
print(classification_report(eval_y, predictions))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      0.83      0.91         6

    accuracy                           0.92        12
   macro avg       0.93      0.92      0.92        12
weighted avg       0.93      0.92      0.92        12



## Data Augmentation with a Markov Model

I have so far found this sort of data generation/augmentation useful only in particular circumstances. For example, I had a dataset which had very few mentions of any cysts. I built a Markov model and generated several sentences with the word 'cyst', but within contexts different from the samples of cysts that I already had.

In [None]:
!pip install -q markovify

In [None]:
from time import time

import markovify

In [None]:
presence_text = ' '.join(presence_samples)
absence_text = ' '.join(absence_samples + ['No cysts appear in the kidney.', 'No cysts appear in the left kidney.'])

In [None]:
presence_markov_model = markovify.Text(presence_text, state_size=4)
absence_markov_model = markovify.Text(absence_text, state_size=2)  # Our corpus is not huge, so we need to shrink the state size.

In [None]:
def generate_new_sentences(model, n=50, timeout=30):
    """
    Collect up to ``n`` sentences from ``model.make_sentence()``.

    Falsy results from ``make_sentence`` are skipped. The elapsed time is
    checked after every attempt, and collection stops early once ``timeout``
    seconds have passed, so fewer than ``n`` sentences may be returned.
    """
    collected = []
    began = time()
    while len(collected) < n:
        candidate = model.make_sentence()
        if candidate:
            collected.append(candidate)
        elapsed = time() - began
        if elapsed >= timeout:
            break
    return collected

In [None]:
markov_presence_samples = generate_new_sentences(presence_markov_model, n=len(presence_samples))
markov_absence_samples = generate_new_sentences(absence_markov_model, n=len(absence_samples))
len(markov_presence_samples), len(markov_absence_samples)

(101, 74)

In [None]:
markov_augmented_absence_samples = absence_samples + markov_absence_samples
markov_augmented_presence_samples = presence_samples + markov_presence_samples
len(markov_augmented_presence_samples), len(markov_augmented_absence_samples)

(202, 148)

### Now test these augmented samples with a logistic regression model

In [None]:
train_X = markov_augmented_absence_samples + markov_augmented_presence_samples
train_y = [0] * len(markov_augmented_absence_samples) + [1] * len(markov_augmented_presence_samples)
len(train_X), len(train_y)

(350, 350)

In [None]:
vectorized_X = embedder.embed(train_X, aggregator='max')

clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(vectorized_X, train_y)

LogisticRegression(penalty='l1', random_state=0, solver='liblinear')

In [None]:
vectorized_X_eval = embedder.embed(eval_X, aggregator='max')
predictions = clf.predict(vectorized_X_eval)
predictions

array([1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0])

In [None]:
print(classification_report(eval_y, predictions))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       1.00      0.67      0.80         6

    accuracy                           0.83        12
   macro avg       0.88      0.83      0.83        12
weighted avg       0.88      0.83      0.83        12



## Templated Data Generation

Another way NLP practitioners generate datasets is to use templates. This can sometimes lead to substantial gains in model quality.

In [None]:
positive_templates = [
    'There is a {measure} {mass_type} in the {laterality} {area}.',
    'A {measure} {mass_type} in the {laterality} {area}.',
    'Bilateral {mass_type_plural} in the {area}.',
    '{mass_type_plural} {seen_terms}on both {area_plural}.',
    'There is {seen_terms}on the {laterality} a {mass_type} {measure}.',
    '{mass_type} is {seen_terms}on the {laterality} {measure}.',
]
negative_templates = [
    'There is no {mass_type} in the {laterality} {area}.',
    'No {mass_type} in the {laterality} {area}.',
    'Bilateral {mass_type_plural} in the {area} are absent.',
    '{mass_type_plural} not {seen_terms}on both {area_plural}.',
    'There is not {seen_terms}on the {laterality} a {mass_type}.',
    '{mass_type} is not {seen_terms}on the {laterality}.',
    'The {laterality} kidney is without {mass_type_plural}.'
]

In [None]:
def get_templated_samples(templates, n=100, *, max_attempts=None, **template_kwargs):
    """
    Generate up to ``n`` distinct sentences by filling random templates.

    Each attempt picks one template at random and fills it with one random
    value per keyword. The result is passed through ``str.capitalize``, so
    only its first character is upper-case.

    templates: sequence of str.format templates.
    n: number of distinct samples wanted.
    max_attempts: upper bound on the number of random draws; defaults to
        ``100 * n``. Without a bound the loop would never end when the
        templates cannot produce ``n`` distinct sentences.
    template_kwargs: dict of str: sequence, the possible values for each
        field of the template format function.

    Returns the distinct samples as a sorted list. It holds fewer than ``n``
    items (and a warning is emitted) if ``max_attempts`` was exhausted.
    """
    import warnings

    if max_attempts is None:
        max_attempts = 100 * n
    new_samples = set()
    attempts = 0
    while len(new_samples) < n and attempts < max_attempts:
        attempts += 1
        template = choice(templates)
        sample = template.format(**{k: choice(v) for k, v in template_kwargs.items()})
        new_samples.add(sample.capitalize())
    if len(new_samples) < n:
        warnings.warn(
            f'Only {len(new_samples)} of the {n} requested distinct samples '
            f'were generated in {attempts} attempts.',
            stacklevel=2,
        )
    return sorted(new_samples)

In [None]:
# Candidate values for every field used by the positive/negative templates.
template_kwargs = {
    # Sizes from 0.1 to 9.9 cm (a "0.0 cm" mass would be meaningless), plus
    # qualitative size terms repeated so both kinds are drawn about equally often.
    'measure': [f'measuring {i / 10:.1f} cm' for i in range(1, 100)] + ['small', 'tstc', 'tiny', 'too small to characterize'] * 20,
    'mass_type': ['lesion', 'cyst', 'mass', 'nodule'],
    'laterality': ['left', 'right'],
    'area': ['kidney', 'renal area', 'renal parenchyma'],
    'area_plural': ['kidneys', 'renals'],
    'mass_type_plural': ['lesions', 'cysts', 'masses', 'nodules'],
    # Every verb carries its own trailing space because the templates write
    # "{seen_terms}on". The "no verb" option must therefore be empty; a ' '
    # here yields a double space ("There is not  on the left a cyst.").
    'seen_terms': ['demonstrated ', 'visualized ', 'apparent ', 'proved ', 'proven ', 'seen ', 'viewed ', ''],
}

In [None]:
templated_presence_samples = get_templated_samples(positive_templates, n=100, **template_kwargs)
templated_absence_samples = get_templated_samples(negative_templates, n=100, **template_kwargs)
len(templated_presence_samples), len(templated_absence_samples)

(100, 100)

In [None]:
template_augmented_absence_samples = absence_samples + templated_absence_samples
template_augmented_presence_samples = presence_samples + templated_presence_samples
len(template_augmented_presence_samples), len(template_augmented_absence_samples)

(201, 174)

In [None]:
template_augmented_presence_samples[-20:]

['There is apparent on the left a nodule too small to characterize.',
 'There is demonstrated on the right a cyst measuring 6.6 cm.',
 'There is demonstrated on the right a nodule measuring 9.6 cm.',
 'There is demonstrated on the right a nodule too small to characterize.',
 'There is proved on the left a mass tstc.',
 'There is proved on the right a cyst measuring 4.8 cm.',
 'There is proven on the left a cyst tiny.',
 'There is proven on the right a cyst too small to characterize.',
 'There is seen on the left a cyst measuring 7.6 cm.',
 'There is seen on the left a nodule small.',
 'There is viewed on the left a lesion tiny.',
 'There is viewed on the left a mass measuring 6.9 cm.',
 'There is viewed on the left a nodule measuring 1.8 cm.',
 'There is viewed on the right a cyst too small to characterize.',
 'There is visualized on the left a mass measuring 9.7 cm.',
 'There is visualized on the left a nodule measuring 0.8 cm.',
 'There is visualized on the left a nodule measuring 5.

In [None]:
template_augmented_absence_samples[-20:]

['There is no nodule in the right renal parenchyma.',
 'There is not  on the left a cyst.',
 'There is not  on the right a nodule.',
 'There is not apparent on the right a lesion.',
 'There is not apparent on the right a mass.',
 'There is not demonstrated on the left a nodule.',
 'There is not demonstrated on the right a lesion.',
 'There is not demonstrated on the right a mass.',
 'There is not demonstrated on the right a nodule.',
 'There is not proved on the right a cyst.',
 'There is not proven on the left a lesion.',
 'There is not proven on the left a mass.',
 'There is not seen on the left a cyst.',
 'There is not seen on the left a mass.',
 'There is not seen on the right a cyst.',
 'There is not viewed on the left a cyst.',
 'There is not viewed on the left a lesion.',
 'There is not viewed on the right a cyst.',
 'There is not visualized on the left a mass.',
 'There is not visualized on the right a cyst.']

### Now test these augmented samples with a logistic regression model

In [None]:
train_X = template_augmented_absence_samples + template_augmented_presence_samples
train_y = [0] * len(template_augmented_absence_samples) + [1] * len(template_augmented_presence_samples)
len(train_X), len(train_y)

(375, 375)

In [None]:
vectorized_X = embedder.embed(train_X, aggregator='max')

clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
clf.fit(vectorized_X, train_y)

LogisticRegression(penalty='l1', random_state=0, solver='liblinear')

In [None]:
vectorized_X_eval = embedder.embed(eval_X, aggregator='max')
predictions = clf.predict(vectorized_X_eval)
predictions

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

In [None]:
print(classification_report(eval_y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         6

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12

