# Creating pretraining data for Augmentation Model
As a next step, we will use GPT to generate sentences for data augmentation.

Note: Building this pretraining corpus requires that a significant amount of labeled data is already available.

In [1]:
import os
import random
import sys

import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange

from style_transfer import get_dataset

In [2]:
# Load the Yelp train and dev splits, report their sizes, and preview the train split.
yelp_train, yelp_dev = (get_dataset('yelp', split) for split in ('train', 'dev'))
print(len(yelp_train), len(yelp_dev))
yelp_train.head()

444101 63483


Unnamed: 0,text,attr
0,i was sadly mistaken .,negative
1,"so on to the hoagies , the italian is general ...",negative
2,minimal meat and a ton of shredded lettuce .,negative
3,nothing really special & not worthy of the $ _...,negative
4,"second , the steak hoagie , it is atrocious .",negative


In [3]:
# Control tokens prepended to each sentence, plus the end-of-sequence marker
# appended to it.
POSITIVE_TOKEN, NEGATIVE_TOKEN = "<|positive|>", "<|negative|>"
EOS_TOKEN = "<|endoftext|>"

# Lookup from a sentiment label to the control token that represents it.
token_map = dict(positive=POSITIVE_TOKEN, negative=NEGATIVE_TOKEN)

In [4]:
def get_texts(df, seed=42):
    """Format every row of ``df`` as a control-token-prefixed training line.

    Each row becomes ``"<sentiment token> <text> <EOS token>"``, where the
    sentiment token is looked up in ``token_map`` using the row's ``attr``
    value. The resulting lines are then shuffled deterministically.

    Args:
        df: DataFrame with a ``text`` column and an ``attr`` column whose
            values are keys of ``token_map``.
        seed: Seed for the shuffle. The default (42) reproduces the ordering
            produced by the previous ``random.seed(42)`` implementation.

    Returns:
        list[str]: One formatted line per row of ``df``, in shuffled order.

    Raises:
        KeyError: If a required column is missing or an ``attr`` value is
            not a key of ``token_map``.
    """
    # Walk the two columns directly instead of df.iterrows(): iterrows()
    # materializes a Series for every row, which dominates the runtime on
    # a frame with hundreds of thousands of rows.
    pairs = zip(df['text'], df['attr'])
    texts = [
        token_map[attr] + " " + text + " " + EOS_TOKEN
        for text, attr in tqdm(pairs, total=len(df))
    ]
    # A private generator yields the same permutation as seeding the global
    # one, but does not reset the process-wide `random` state as a hidden
    # side effect of calling this function.
    random.Random(seed).shuffle(texts)
    return texts

In [5]:
# Build the shuffled training lines, then inspect the first and last ten.
texts = get_texts(yelp_train)
(texts[:10], texts[-10:])

HBox(children=(FloatProgress(value=0.0, max=444101.0), HTML(value='')))




(['<|negative|> when we came back my best friends iphone was no longer on the table . <|endoftext|>',
  '<|positive|> absolute perfection every time . <|endoftext|>',
  '<|positive|> glendale is pretty limited with non-chain restaurants so finding this place was amazing ! <|endoftext|>',
  '<|negative|> our regular server was not here . <|endoftext|>',
  '<|positive|> the staff are friendly & helpful . <|endoftext|>',
  "<|negative|> i 'd give this _num_ star in terms of food . <|endoftext|>",
  '<|positive|> our _num_ month old son ! <|endoftext|>',
  '<|positive|> i would highly recommend this nail salon . <|endoftext|>',
  '<|positive|> absolutely awesome ! <|endoftext|>',
  '<|negative|> the filing job on my nails was so incredibly sloppy . <|endoftext|>'],
 ["<|negative|> i gave one star because you ca n't pick lower than one star . <|endoftext|>",
  '<|positive|> first time here and the food is super yummy . <|endoftext|>',
  '<|negative|> first off the rooms smell , the towels s

In [24]:
# Write the training lines to disk, one example per line.
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs('data', exist_ok=True)
lines = [text + '\n' for text in texts]
# Explicit UTF-8: the platform default encoding varies and may be unable to
# represent non-ASCII characters in the review text.
with open('data/train_gpt_yelp_augmentation.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

In [6]:
# Apply the same formatting to the dev split and preview the first ten lines.
dev_texts = get_texts(yelp_dev)
dev_texts[0:10]

HBox(children=(FloatProgress(value=0.0, max=63483.0), HTML(value='')))




['<|positive|> such cute clothes , shoes , and purses . <|endoftext|>',
 '<|positive|> great service , super nice employees . <|endoftext|>',
 '<|positive|> very nice ambiance in the restaraunt overall . <|endoftext|>',
 '<|positive|> this place makes me feel nostalgic . <|endoftext|>',
 '<|positive|> the staff is incredible . <|endoftext|>',
 '<|positive|> perfect done for medium rare . <|endoftext|>',
 '<|negative|> plus , the service sucked . <|endoftext|>',
 '<|negative|> also , chairs are dirty . <|endoftext|>',
 '<|positive|> the salon is always clean and has a positive atmosphere . <|endoftext|>',
 '<|negative|> last saturday i took my van in to get washed . <|endoftext|>']

In [8]:
# Write the dev lines to disk, one example per line.
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs('data', exist_ok=True)
lines = [text + '\n' for text in dev_texts]
# Explicit UTF-8: the platform default encoding varies and may be unable to
# represent non-ASCII characters in the review text.
with open('data/dev_li_gpt_yelp_augmentation.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)