In [3]:
import fire
import json
import os
import numpy as np
import tensorflow as tf

import model
import sample
import encoder

from snorkel.augmentation import transformation_function

ModuleNotFoundError: No module named 'model'

### Data Augmentation by predicting the next sentence with GPT-2

#create a python 3.7 environment
conda create -n GPT2 python=3.7

#activate the environment
conda activate GPT2

Download GPT-2:
git clone https://github.com/openai/gpt-2.git


Run the clone inside the 'ch02_labeling_and_augmentation' directory, so that 
you end up with a 'ch02_labeling_and_augmentation\gpt-2\models' path.


install the requirements, as presented in the repository
There are no conda packages for fire and regex, resorting to pip 
for the installation
pip install -r requirements.txt

'''
If you start the program and see errors about missing packages, you might
want to install them one by one.  
I had to do so for tensorflow  
If you don't have a GPU, you can install just the 'tensorflow' package.
'''
pip install tensorflow-gpu==1.12.0

'''
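Download one of the pretrained models. To strike a balance between speed of
execution and accuracy, we will use the 355M model. The available models are: 124M, 355M, 774M and 1558M.
Note that `interact_model` below defaults to `model_name='774M'`: either download that model as well, or pass `model_name='355M'` when calling it.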
'''
python download_model.py 355M

Substitute "GPT_2_Models" with the path to the gpt-2\models folder (code\companion\ch02_labeling_and_augmentation\gpt-2\models). 

In [4]:
def interact_model(
    raw_text,
    model_name='774M',
    seed=None,
    nsamples=1,
    batch_size=1,
    length= None,
    temperature=1,
    top_k=40,
    top_p=1,
    models_dir=r"GPT_2_Models",
):
    """Generate GPT-2 continuations of ``raw_text`` and return them as one string.

    Every call builds a fresh TensorFlow graph and session and restores the
    latest checkpoint found in ``models_dir/model_name``.

    Parameters
    ----------
    raw_text : str
        Prompt that is encoded and fed to the model as context.
    model_name : str
        Sub-directory of ``models_dir`` holding ``hparams.json`` and the checkpoint.
    seed : int or None
        Seed passed to both NumPy and TensorFlow.
    nsamples : int
        Total number of samples to generate; must be a multiple of ``batch_size``.
    batch_size : int
        Number of samples generated per session run; must be at least 1.
    length : int or None
        Value forwarded to ``sample.sample_sequence``; defaults to half of
        ``hparams.n_ctx`` and may not exceed ``hparams.n_ctx``.
    temperature, top_k, top_p
        Sampling controls forwarded unchanged to ``sample.sample_sequence``.
    models_dir : str
        Directory that contains the downloaded GPT-2 models.

    Returns
    -------
    str
        All decoded samples joined together without a separator. The prompt
        tokens are stripped, so only the generated part is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, ``nsamples`` is not a multiple of
        ``batch_size``, or ``length`` exceeds the model's context window.
    """
    # Validate before touching the model: the sampling loop below runs
    # nsamples // batch_size times, which would silently drop samples (or
    # divide by zero) for incompatible values.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if nsamples % batch_size != 0:
        raise ValueError("nsamples must be a multiple of batch_size")

    enc = encoder.get_encoder(model_name, models_dir)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k, top_p=top_p
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name))
        saver.restore(sess, ckpt)

        context_tokens = enc.encode(raw_text)
        all_text = []
        for _ in range(nsamples // batch_size):
            # Slice off the prompt so only newly generated tokens are decoded.
            out = sess.run(output, feed_dict={
                context: [context_tokens for _ in range(batch_size)]
            })[:, len(context_tokens):]
            for i in range(batch_size):
                text = enc.decode(out[i])
                all_text.append(text)
        return ''.join(all_text)

In [5]:
# Smoke test: generate a continuation of a short prompt with length=50 and display it.
t = interact_model("Something did not feel right, since the first day.", length=50)
t





Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use `tf.random.categorical` instead.
INFO:tensorflow:Restoring parameters from C:\Users\sefilipi\Documents\books\snorkel\code\companion\ch02_labeling_and_augmentation\gpt-2\models\774M\model.ckpt


" So I decided to go with my wife. We left home together just as we went to buy new clothes, my wife was the first on the drive, she didn't go too fast.\n\nAfter that I sat in the car and did everything"

In [166]:
@transformation_function()
def predict_next(x):
    """Replace the review with its first sentence plus a GPT-2 generated continuation.

    The whole review is used as the prompt for ``interact_model``; only the
    complete sentences of the generated text (everything up to its last
    period) are kept and appended to the review's first sentence.

    Parameters
    ----------
    x : data point with a ``"review_text"`` field.

    Returns
    -------
    The same data point with ``"review_text"`` overwritten.
    """
    review = x["review_text"]
    # Extract the first sentence. A review without any period is treated as a
    # single sentence; str.find returns -1 there, which would otherwise slice
    # the original text away entirely.
    period_index = review.find('.')
    first_sentence = review[:period_index+1] if period_index != -1 else review
    # Predict and keep full sentences only. If the prediction contains no
    # period, rfind returns -1 and the slice below is empty.
    predicted = interact_model(review, length=50)
    last_period = predicted.rfind('.')
    sentence = first_sentence+" "+predicted[:last_period+1]
    x["review_text"] = sentence
    return x

In [141]:
# Try the transformation on the first 'Honest Illusions' review.
# NOTE: `df` is built in the "Read the dataset" section further down; run that cell first.
k = predict_next(df[df.product_name == 'Honest Illusions: Books: Nora Roberts'].iloc[0])
k

PREDICT_NEXT
product_name                Honest Illusions: Books: Nora Roberts
review_text     I simply adore this book and it's what made me...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object
INFO:tensorflow:Restoring parameters from C:\Users\sefilipi\Documents\books\snorkel\chapter 2\Data\gpt-2\models\774M\model.ckpt


product_name                Honest Illusions: Books: Nora Roberts
review_text     I simply adore this book and it's what made me...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object

### Data Augmentation through translation

In [10]:
import os, requests, uuid, json
import spacy
# English spaCy pipeline, used below to score the similarity between texts.
# NOTE(review): 'en' looks like a shortcut model name; newer spaCy releases expect
# the full package name (e.g. 'en_core_web_sm') — confirm against the installed version.
nlp = spacy.load('en')

Substitute the following "KEY", "ENDPOINT", "REGION" placeholders with the values found in your deployed instance, as illustrated in Figure 2-4. 

In [7]:
# Translator credentials. Each value is read from an environment variable when
# it is set, so the secret never has to be typed into (or saved with) the
# notebook; otherwise the placeholder is used and must be substituted by hand.
resource_key = os.environ.get('TRANSLATOR_TEXT_SUBSCRIPTION_KEY', '<KEY>')
endpoint = os.environ.get('TRANSLATOR_TEXT_ENDPOINT', '<ENDPOINT>')
region = os.environ.get('TRANSLATOR_TEXT_REGION', '<REGION>')

In [142]:
def translate(text, language):
    """Translate ``text`` into ``language`` through the translation REST endpoint.

    Uses the module-level ``endpoint``, ``resource_key`` and ``region`` values.

    Parameters
    ----------
    text : str
        Text to translate.
    language : str
        Target language code sent as the ``to`` query parameter (e.g. ``"fr"``).

    Returns
    -------
    str
        The first translation of the first (and only) submitted text.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status (bad key, wrong region,
        quota exceeded, ...), instead of failing later on the response shape.
    """
    headers = {
    'Ocp-Apim-Subscription-Key': resource_key,
    "Ocp-Apim-Subscription-Region" : region,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
    }

    body = [{'text' : text}]
    response = requests.post(
        endpoint+"/translate",
        # Let requests build and escape the query string.
        params={"api-version": "3.0", "to": language},
        headers=headers, json=body,
        # Avoid hanging the whole augmentation run on a stalled connection.
        timeout=30)
    response.raise_for_status()
    translation = response.json()[0]["translations"][0]["text"]
    return translation


@transformation_function()
def augment_by_translation(x):
    """Paraphrase the review by translating it to French and back to English.

    The round-trip text replaces ``"review_text"`` when its spaCy similarity
    to the original is below 1.0; otherwise the field is set to the marker
    string ``"DUPLICATE"``.
    """
    original = x["review_text"]
    round_trip = translate(translate(original, "fr"), "en")
    similarity = nlp(original).similarity(nlp(round_trip))
    x["review_text"] = round_trip if similarity < 1.0 else "DUPLICATE"
    return x

In [143]:
# Try the back-translation on the first 'Honest Illusions' review.
# NOTE: `df` is built in the "Read the dataset" section further down; run that cell first.
k = augment_by_translation(df[df.product_name == 'Honest Illusions: Books: Nora Roberts'].iloc[0])
k

AUGMENT_BY_TRANSLATION
product_name                Honest Illusions: Books: Nora Roberts
review_text     I simply adore this book and it's what made me...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object




AUGMENT_BY_TRANSLATION RETURN


product_name                Honest Illusions: Books: Nora Roberts
review_text     I love this book and that's what made me come ...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object

### Augment by removing adverbs 

In [23]:
import nltk
# nltk.word_tokenize (used by remove_adverbs below) needs the 'punkt' tokenizer
# data in addition to the POS tagger model.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sefilipi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [158]:
# POS tags dropped by remove_adverbs.
tags_to_remove = ["RB", "RBR", "RBS"]
@transformation_function()
def remove_adverbs(x):
    """Remove every token whose NLTK POS tag is listed in ``tags_to_remove``.

    The remaining tokens are re-joined with single spaces (and ``" ."`` is
    collapsed to ``"."``). If no token was removed, ``"review_text"`` is set
    to the marker string ``"DUPLICATE"`` instead.
    """
    tokens = nltk.word_tokenize(x["review_text"])
    pos_tags = nltk.pos_tag(tokens)
    kept_words = [word for word, tag in pos_tags if tag not in tags_to_remove]
    # Compare token counts: comparing the character length of the joined text
    # with the number of tokens would (almost) never detect an unchanged review.
    if len(kept_words) != len(tokens):
        x["review_text"] = " ".join(kept_words).replace(" .", ".")
    else:
        x["review_text"] = "DUPLICATE"
    return x

In [147]:
# Try the NLTK-based adverb removal on the first 'Honest Illusions' review.
# NOTE: `df` is built in the "Read the dataset" section further down; run that cell first.
remove_adverbs(df[df.product_name == 'Honest Illusions: Books: Nora Roberts'].iloc[0])

product_name                Honest Illusions: Books: Nora Roberts
review_text     I adore this book and it 's what made me go ou...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object

### Remove Adverbs using SpacyPreprocessor

In [26]:
from snorkel.preprocess.nlp import SpacyPreprocessor

# Preprocessor that reads the "review_text" field and stores its result in a "doc" field.
# NOTE: this rebinds the name `spacy`, shadowing the spacy module imported in an
# earlier cell; `spacy.load(...)` will no longer work after this cell has run.
spacy = SpacyPreprocessor(text_field="review_text", doc_field="doc", language="en")

In [27]:
# Show the full text of the first 'Honest Illusions' review.
df[df.product_name == 'Honest Illusions: Books: Nora Roberts'].iloc[0]["review_text"]

"I simply adore this book and it's what made me go out to buy every other Nora Roberts novel.  It just amazes me the chemistry between Luke and Roxanne.  Everytime I read it I can't get over the excitement and the smile it brings to me as they barb back and forth.  It's just an amazing story over the course of growing up together and how it all develops.  Roxanne is smart and sassy and Luke is just too cool.  This romantic duo shines and no one else compares to them yet for me.  My very favorite romance"

In [165]:
@transformation_function(pre=[spacy])
def spacy_remove_adverbs(x):
    """Remove every token that spaCy tags as an adverb (``pos_ == "ADV"``).

    Relies on ``x.doc``, which is filled in by the ``SpacyPreprocessor``
    passed through ``pre``. If no token was removed, ``"review_text"`` is set
    to the marker string ``"DUPLICATE"`` instead.
    """
    words_no_adverbs = [token for token in x.doc if token.pos_ != "ADV"]
    # Compare token counts: the character length of the review text is not
    # comparable with a number of tokens.
    if len(words_no_adverbs) != len(x.doc):
        new_sentence = " ".join([token.text for token in words_no_adverbs])
        x["review_text"] = new_sentence.replace(" .  ", ". ")
    else:
        x["review_text"] = "DUPLICATE"
    return x

In [149]:
# Try the spaCy-based adverb removal on the first 'Honest Illusions' review.
# NOTE: `df` is built in the "Read the dataset" section further down; run that cell first.
spacy_remove_adverbs(df[df.product_name == 'Honest Illusions: Books: Nora Roberts'].iloc[0])

SPACY_REMOVE_ADVERBS
product_name                Honest Illusions: Books: Nora Roberts
review_text     I simply adore this book and it's what made me...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object
SPACY_REMOVE_ADVERBS RETURN


product_name                Honest Illusions: Books: Nora Roberts
review_text     I adore this book and it 's what made me go ou...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object

### Read the dataset

In [16]:
import pandas as pd
import xml.etree.ElementTree as et 

# Parse the unlabeled book reviews. The path is machine-specific: point it at
# your local copy of `book.unlabeled`.
xtree = et.parse(r"C:\Users\sefilipi\Documents\books\snorkel\chapter 2\Data\book.unlabeled")
xroot = xtree.getroot()
df_cols = ["product_name", "review_text", "reviewer"]


def _field_text(node, tag):
    """Return the text of child ``tag`` without newlines ('' if the child or its text is missing)."""
    # findtext gives None for a missing child and '' for an empty one, so a
    # review lacking e.g. a reviewer no longer aborts the whole load.
    return (node.findtext(tag) or "").replace('\n', '')


# One record per child of the root element.
records = [
    {
        "review_text": _field_text(node, "review_text"),
        "product_name": _field_text(node, "product_name"),
        "reviewer": _field_text(node, "reviewer"),
    }
    for node in xroot
]

df = pd.DataFrame(records, columns = df_cols)
df.head(30)

Unnamed: 0,product_name,review_text,reviewer
0,Manual pedi�trico para los due�os del nuevo be...,This all-Spanish handbook for parents with new...,Midwest Book Review
1,Child of God: Books: Cormac Mccarthy,McCarthy's writing and portrayal of Lester Bal...,Brian Asquith
2,Child of God: Books: Cormac Mccarthy,Do you giggle uncontrollably when poking corps...,Bruce Miller
3,Child of God: Books: Cormac Mccarthy,I was initiated into the world of Cormac McCar...,"Christopher Davis ""Christopher E.D."""
4,Child of God: Books: Cormac Mccarthy,I cannot speak to the literary points in the n...,Alex Johnson
5,Child of God: Books: Cormac Mccarthy,There is no denying the strain of Faulkner tha...,Steve
6,Child of God: Books: Cormac Mccarthy,I have read sevral of McCarthy's other novels....,William J. Fickling
7,Child of God: Books: Cormac Mccarthy,This is a gripping novel that probes the bread...,Robert Ortiz
8,Child of God: Books: Cormac Mccarthy,"If you're considering buying the Peter Smith ""...",Allan MacInnis
9,"Building, Buying and Operating a Boarding Kenn...",What I didn't realise when I ordered this is t...,Jennifer Robinson


In [17]:
# List the distinct product names found in the dataset.
df.product_name.unique()

array(['Manual pedi�trico para los due�os del nuevo beb�: Books: Graciela Esquivel-Aguilar,Horst D. Weinburg',
       'Child of God: Books: Cormac Mccarthy',
       'Building, Buying and Operating a Boarding Kennel: Books: James Krack',
       "The Protector's War: Books: S. M. Stirling",
       'American Colonies: The Settling of North America (The Penguin History of the United States, Volume1) (Hist of the USA): Books: Alan Taylor',
       'Secret Life of Bees, The (Essential Edition): (Penguin Essential Edition): Books: Sue  Kidd',
       "Red Star Over Hollywood: The Film Colony's Long Romance with the Left: Books: Ronald Radosh",
       'The Corporate University Handbook: Designing, Managing, and Growing a Successful Program: Books: Mark Allen',
       'Turn Up the Heat with G. Garvin: Books: Gerry Garvin',
       'Rescue: Books: Nicholas Sparks',
       'Secret Life of Bees: CD: Books: Sue Monk Kidd',
       'The Messenger: Books: Daniel Silva',
       'Making Things Public: Atmo

In [18]:
# Full text of the first review of 'Honest Illusions'.
df[df.product_name == 'Honest Illusions: Books: Nora Roberts']["review_text"].to_list()[0]

"I simply adore this book and it's what made me go out to buy every other Nora Roberts novel.  It just amazes me the chemistry between Luke and Roxanne.  Everytime I read it I can't get over the excitement and the smile it brings to me as they barb back and forth.  It's just an amazing story over the course of growing up together and how it all develops.  Roxanne is smart and sassy and Luke is just too cool.  This romantic duo shines and no one else compares to them yet for me.  My very favorite romance"

In [159]:
# Subset with only the 'Honest Illusions' reviews; later cells run the augmentation on it.
df2 = df[df.product_name == 'Honest Illusions: Books: Nora Roberts']
df2

Unnamed: 0,product_name,review_text,reviewer
2054,Honest Illusions: Books: Nora Roberts,I simply adore this book and it's what made me...,"L. Cortes ""Book Junkie"""
2055,Honest Illusions: Books: Nora Roberts,"Magic, mystery, romance and burglary are all p...","Terry A. Benedict-Devine ""Terry"""
2056,Honest Illusions: Books: Nora Roberts,I read the review and got the book and fell in...,Diana Faraone
2057,Honest Illusions: Books: Nora Roberts,It is difficult to find books in this genre th...,A. Rowley
2058,Honest Illusions: Books: Nora Roberts,This is one of my favorite Nora Roberts book. ...,"avid reader ""A reader"""
2059,Honest Illusions: Books: Nora Roberts,"This book has everything....Love, Greed, Murde...",S. Williams
2060,Honest Illusions: Books: Nora Roberts,"When I began to read Nora Roberts, I really di...",Creekergirl


In [170]:
from snorkel.augmentation import PandasTFApplier, ApplyAllPolicy

tfs = [predict_next, augment_by_translation, spacy_remove_adverbs]
# Derive the number of TFs from the list so the policy stays in sync if a
# transformation function is added or removed.
policy = ApplyAllPolicy(len(tfs), n_per_original=1, keep_original=True)

tf_applier = PandasTFApplier(tfs, policy)
# Show which TF indices the policy would apply to one data point.
policy.generate_for_example()

[[], [0, 1, 2]]

In [188]:
# Import only the names this cell uses instead of a wildcard import, so it is
# clear where each name comes from and nothing else gets shadowed.
from snorkel.augmentation import PandasTFApplier, ApplyEachPolicy

tfs = [ predict_next, augment_by_translation, spacy_remove_adverbs]
policy =ApplyEachPolicy(len(tfs), keep_original=True)

tf_applier = PandasTFApplier(tfs, policy)
# Show which TF indices the policy would apply to one data point.
policy.generate_for_example()

[[], [0], [1], [2]]

In [192]:
from snorkel.augmentation import PandasTFApplier, ApplyOnePolicy

tfs = [
    predict_next,
    augment_by_translation,
    spacy_remove_adverbs,
]
# ApplyOnePolicy takes the number of copies per original (not a TF count),
# so the argument is spelled out by keyword.
policy = ApplyOnePolicy(n_per_original=4, keep_original=True)

tf_applier = PandasTFApplier(tfs, policy)
# Show which TF indices the policy would apply to one data point.
policy.generate_for_example()

[[], [0], [0], [0], [0]]

In [194]:
from snorkel.augmentation import PandasTFApplier, RandomPolicy

tfs = [ predict_next, augment_by_translation, spacy_remove_adverbs]
# Derive the number of TFs from the list so the policy stays in sync if a
# transformation function is added or removed.
policy =RandomPolicy(len(tfs), sequence_length =5, n_per_original = 2, keep_original=True)

tf_applier = PandasTFApplier(tfs, policy)
# Show which TF index sequences the policy would apply to one data point.
policy.generate_for_example()

[[], [1, 2, 1, 2, 2], [2, 1, 2, 0, 1]]

In [197]:
from snorkel.augmentation import PandasTFApplier, MeanFieldPolicy

# Transformation functions the policy chooses from; list positions are the TF indices.
tfs = [ predict_next, augment_by_translation, spacy_remove_adverbs]
policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=1,
    n_per_original=1,
    keep_original=True,
    # One weight per entry of `tfs` — presumably the sampling probability of each TF; see the Snorkel docs.
    p=[0.3, 0.3, 0.4],
)

tf_applier = PandasTFApplier(tfs, policy)
# Show which TF indices the policy would apply to one data point.
policy.generate_for_example()

[[], [0], [0]]

In [163]:
from snorkel.augmentation import PandasTFApplier, MeanFieldPolicy

# Same policy configuration as the previous cell, this time applied to the data.
tfs = [ predict_next, augment_by_translation, spacy_remove_adverbs]
policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=1,
    n_per_original=1,
    keep_original=True,
    # One weight per entry of `tfs` — presumably the sampling probability of each TF; see the Snorkel docs.
    p=[0.3, 0.3, 0.4],
)

tf_applier = PandasTFApplier(tfs, policy)
# Run the augmentation over the 'Honest Illusions' subset. This is slow whenever
# predict_next is picked, because every call restores the GPT-2 checkpoint.
df_train_augmented = tf_applier.apply(df2)


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

SPACY_REMOVE_ADVERBS
product_name                Honest Illusions: Books: Nora Roberts
review_text     I simply adore this book and it's what made me...
reviewer                                  L. Cortes "Book Junkie"
Name: 2054, dtype: object
SPACY_REMOVE_ADVERBS RETURN
PREDICT_NEXT
product_name                Honest Illusions: Books: Nora Roberts
review_text     Magic, mystery, romance and burglary are all p...
reviewer                         Terry A. Benedict-Devine "Terry"
Name: 2055, dtype: object
INFO:tensorflow:Restoring parameters from C:\Users\sefilipi\Documents\books\snorkel\chapter 2\Data\gpt-2\models\774M\model.ckpt



 29%|████████████████████████                                                            | 2/7 [01:07<02:48, 33.77s/it]

AUGMENT_BY_TRANSLATION
product_name                Honest Illusions: Books: Nora Roberts
review_text     I read the review and got the book and fell in...
reviewer                                            Diana Faraone
Name: 2056, dtype: object




AUGMENT_BY_TRANSLATION RETURN



 43%|████████████████████████████████████                                                | 3/7 [01:11<01:24, 21.21s/it]

AUGMENT_BY_TRANSLATION
product_name                Honest Illusions: Books: Nora Roberts
review_text     It is difficult to find books in this genre th...
reviewer                                                A. Rowley
Name: 2057, dtype: object
AUGMENT_BY_TRANSLATION RETURN



 57%|████████████████████████████████████████████████                                    | 4/7 [01:13<00:41, 13.99s/it]

SPACY_REMOVE_ADVERBS
product_name                Honest Illusions: Books: Nora Roberts
review_text     This is one of my favorite Nora Roberts book. ...
reviewer                                   avid reader "A reader"
Name: 2058, dtype: object
SPACY_REMOVE_ADVERBS RETURN
PREDICT_NEXT
product_name                Honest Illusions: Books: Nora Roberts
review_text     This book has everything....Love, Greed, Murde...
reviewer                                              S. Williams
Name: 2059, dtype: object
INFO:tensorflow:Restoring parameters from C:\Users\sefilipi\Documents\books\snorkel\chapter 2\Data\gpt-2\models\774M\model.ckpt



 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [01:59<00:18, 18.78s/it]

PREDICT_NEXT
product_name                Honest Illusions: Books: Nora Roberts
review_text     When I began to read Nora Roberts, I really di...
reviewer                                              Creekergirl
Name: 2060, dtype: object
INFO:tensorflow:Restoring parameters from C:\Users\sefilipi\Documents\books\snorkel\chapter 2\Data\gpt-2\models\774M\model.ckpt


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [02:42<00:00, 23.21s/it]


In [162]:
# Display the input subset again, for comparison with the augmented frame.
df2

Unnamed: 0,product_name,review_text,reviewer
2054,Honest Illusions: Books: Nora Roberts,I simply adore this book and it's what made me...,"L. Cortes ""Book Junkie"""
2055,Honest Illusions: Books: Nora Roberts,"Magic, mystery, romance and burglary are all p...","Terry A. Benedict-Devine ""Terry"""
2056,Honest Illusions: Books: Nora Roberts,I read the review and got the book and fell in...,Diana Faraone
2057,Honest Illusions: Books: Nora Roberts,It is difficult to find books in this genre th...,A. Rowley
2058,Honest Illusions: Books: Nora Roberts,This is one of my favorite Nora Roberts book. ...,"avid reader ""A reader"""
2059,Honest Illusions: Books: Nora Roberts,"This book has everything....Love, Greed, Murde...",S. Williams
2060,Honest Illusions: Books: Nora Roberts,"When I began to read Nora Roberts, I really di...",Creekergirl


In [161]:
# Display the augmented frame returned by tf_applier.apply.
df_train_augmented

Unnamed: 0,product_name,review_text,reviewer
2054,Honest Illusions: Books: Nora Roberts,I simply adore this book and it's what made me...,"L. Cortes ""Book Junkie"""
2054,Honest Illusions: Books: Nora Roberts,I love this book and that's what made me come ...,"L. Cortes ""Book Junkie"""
2054,Honest Illusions: Books: Nora Roberts,I love this book and that's what made me come ...,"L. Cortes ""Book Junkie"""
2055,Honest Illusions: Books: Nora Roberts,"Magic, mystery, romance and burglary are all p...","Terry A. Benedict-Devine ""Terry"""
2055,Honest Illusions: Books: Nora Roberts,"Magic , mystery , romance and burglary are all...","Terry A. Benedict-Devine ""Terry"""
2055,Honest Illusions: Books: Nora Roberts,"Magic , mystery , romance and burglary are all...","Terry A. Benedict-Devine ""Terry"""
2056,Honest Illusions: Books: Nora Roberts,I read the review and got the book and fell in...,Diana Faraone
2056,Honest Illusions: Books: Nora Roberts,I read the review and got the book and fell in...,Diana Faraone
2056,Honest Illusions: Books: Nora Roberts,I read the review and got the book and fell in...,Diana Faraone
2057,Honest Illusions: Books: Nora Roberts,It is difficult to find books in this genre th...,A. Rowley
