## 8.1 GPT for style completion

In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [8]:
!ls data/

Flickr8k.token.txt   latex_tutorial.txt   snips.train.txt
disaster_sample.csv  rocks.jpg            toxic.csv
english_to_latex.csv sample.wav
[1m[36mflicker_images[m[m       skate.jpg


In [9]:
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/PDS2.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=32  # length of each chunk of text to use as a datapoint
)

In [10]:
pds_data[0], pds_data[0].shape  # inspect the first point

(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,
           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,
           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,
         15808,  5479]),
 torch.Size([32]))

In [11]:
print(tokenizer.decode(pds_data[0]))

Principles of Data Science
Second Edition

A beginner's guide to statistical techniques and theory to
build eﬀective data-driven applications


In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [14]:
tokenizer.pad_token = tokenizer.eos_token 

In [15]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [16]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [17]:
collator_example.input_ids  # 50256 is our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]])

In [18]:
tokenizer.pad_token_id

50257

In [19]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [20]:
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Reminder that labels are shifted *inside* the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [21]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [22]:
print('----------')
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
A dataset shows the relationships between these values and the degree of success: a higher degree of success means higher percentage of people have been at this company.

Here are some of the important things to know. The survey data suggest that in the last
----------
A dataset shows the relationships at the top and bottom of multiple variables. I use one variable for each of the variables as close as possible, with the other being the full name that the data came from. If the full details about the correlation between the
----------
A dataset shows the relationships among major political groups like the Green Party and the Democratic Party:

The graph shows the relative influence of the groups that support various policies on issues like immigration, education and health care. Green-leaning groups favor policies designed
----------


In [24]:
training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(pds_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


{'eval_loss': 4.955985069274902,
 'eval_runtime': 44.5679,
 'eval_samples_per_second': 21.091}

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,4.2725,4.096642,57.8959,16.236
2,3.7764,3.862182,67.6388,13.897
3,3.417,3.777707,80.9289,11.615


TrainOutput(global_step=354, training_loss=3.897780248674296, metrics={'train_runtime': 3060.0222, 'train_samples_per_second': 0.116, 'total_flos': 269220049256448, 'epoch': 3.0})

In [26]:
trainer.evaluate()

{'eval_loss': 3.77770733833313,
 'eval_runtime': 70.6418,
 'eval_samples_per_second': 13.307,
 'epoch': 3.0}

In [27]:
trainer.save_model()

In [28]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200,  'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [29]:
print('----------')
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
A dataset shows the relationships with known distribution
of
all possible data points in a data set:
r = Data[0]

# we see our
pw = pd_trees[2]
# we can see
----------
A dataset shows the relationships between different levels of categorical vocabulary from the
following
Let's look at the
class names.
Predictive classifiers were used to model the clustering of the data using the standard
data procedure

----------
A dataset shows the relationships between the variables being measured by X and Y.

In Python we use a categorical variable which is either the data or the variable itself, as shown below:
X = np.sqrt(data = np
----------


## 8.2 GPT for code dictation

In [30]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \
                         GPT2LMHeadModel, pipeline
from datasets import Dataset
import pandas as pd

In [32]:
data = pd.read_csv('data/english_to_latex.csv')

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2\,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2\,dx"


In [33]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

# Add our singular prompt
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

CONVERSION_TOKEN = 'LaTeX:'


In [34]:
# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

print(training_examples[0])


LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2\,dx


In [35]:
task_df = pd.DataFrame({'text': training_examples})

task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [36]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset

def preprocess(examples):  # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation=True)

latex_data = latex_data.map(preprocess, batched=True)

latex_data = latex_data.train_test_split(train_size=.8)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [37]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [38]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [40]:
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    evaluation_strategy='epoch',
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


{'eval_loss': 4.924079418182373,
 'eval_runtime': 0.6375,
 'eval_samples_per_second': 15.686}

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.422,1.148117,0.6639,15.062
2,1.2295,0.919335,0.806,12.407
3,0.517,0.876098,0.8246,12.127
4,0.5482,0.848282,0.8861,11.285
5,0.6344,0.877802,1.033,9.681
6,0.5483,0.90494,1.0332,9.679
7,0.4768,0.868022,0.9739,10.268
8,0.4082,0.912227,0.9537,10.485
9,0.4477,0.877793,0.9841,10.162
10,0.3593,0.869021,0.9717,10.292


TrainOutput(global_step=200, training_loss=0.7420856261253357, metrics={'train_runtime': 320.9031, 'train_samples_per_second': 0.623, 'total_flos': 8816311517184, 'epoch': 10.0})

In [42]:
trainer.evaluate()

{'eval_loss': 0.848281741142273,
 'eval_runtime': 0.9621,
 'eval_samples_per_second': 10.394,
 'epoch': 10.0}

In [43]:
# Let's try fine-tuning it again but first let's let the model read a calculus book

In [47]:
# Calculus Made Easy by Silvanus P. Thompson - https://gutenberg.org/ebooks/33283

calculus_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/calculus made easy.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=32
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="./calculus",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=50,
    eval_steps=50,
    evaluation_strategy='steps',
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=calculus_data.examples[:int(len(calculus_data.examples)*.8)],
    eval_dataset=calculus_data.examples[int(len(calculus_data.examples)*.8):]
)

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [48]:
trainer.evaluate()  # initial loss for the calculus book

{'eval_loss': 2.5129024982452393,
 'eval_runtime': 78.3929,
 'eval_samples_per_second': 20.716}

In [49]:
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
50,1.7818,1.644646,126.954,12.792
100,1.5931,1.595124,128.2563,12.662
150,1.5605,1.569163,123.223,13.179
200,1.468,1.558228,100.5028,16.159


TrainOutput(global_step=203, training_loss=1.5998379561701432, metrics={'train_runtime': 2233.5785, 'train_samples_per_second': 0.091, 'total_flos': 155157525725184, 'epoch': 1.0})

In [50]:
trainer.save_model()

In [52]:
calculus_latex_gpt2 = GPT2LMHeadModel.from_pretrained('./calculus')  # load up our gpt pre-trained on calculus

training_args = TrainingArguments(
    output_dir="./calculus_english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    evaluation_strategy='epoch',
)

trainer = Trainer(
    model=calculus_latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()  # loss is starting slightly lower than before

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


{'eval_loss': 4.608597278594971,
 'eval_runtime': 0.6361,
 'eval_samples_per_second': 15.72}

In [53]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.3851,1.129366,0.7377,13.556
2,1.125,0.882218,0.6511,15.359
3,0.4794,0.908945,0.806,12.408
4,0.5245,0.820663,0.6184,16.17
5,0.6015,0.911463,0.7335,13.634
6,0.5136,0.960205,0.6141,16.284
7,0.4241,0.936979,0.5987,16.704
8,0.3545,0.962718,0.6037,16.563
9,0.4199,1.008138,0.5919,16.895
10,0.3353,1.003042,0.5798,17.247


TrainOutput(global_step=200, training_loss=0.6930449610948562, metrics={'train_runtime': 269.4542, 'train_samples_per_second': 0.742, 'total_flos': 8816311517184, 'epoch': 10.0})

In [54]:
trainer.evaluate()  # pre-training on the calculus book for one epoch led to a minor drop in loss

{'eval_loss': 0.820662796497345,
 'eval_runtime': 0.7309,
 'eval_samples_per_second': 13.681,
 'epoch': 10.0}

In [55]:
trainer.save_model()  # save this model

In [56]:
loaded_model = GPT2LMHeadModel.from_pretrained('./calculus_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

In [57]:
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX:


In [58]:
print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{pi} x^4\,dx^


In [59]:
text_sample = 'f of x is sum from 0 to x of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2\,dx^


In [69]:
# Sanity check that a non-finetuned model could not have done this
non_finetuned_latex_generator = pipeline(
    'text-generation', 
    model=GPT2LMHeadModel.from_pretrained('gpt2'),  # not fine-tuned!
    tokenizer=tokenizer
)

In [71]:
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f of x is sum from 0 to x of x squared
LaTeX: f of x is sum from 0 to x of x squared


In [72]:
few_shot_prompt = """LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx \
###
LCT
English: x squared
LaTeX:"""

In [73]:
print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###


In [74]:
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f of x is sum from 0 to x of x squared
LaTeX: f of x is
