In [1]:
# Shell escape: print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

Tue Dec  1 08:48:59 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    30W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Shell escape: install the Hugging Face `transformers` package into the runtime.
# NOTE(review): the version is not pinned, and later cells rely on TextDataset and
# AutoModelWithLMHead — consider pinning the release this notebook was run with.
!pip install transformers



In [3]:
import re
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
import numpy as np
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel, set_seed


In [4]:
from google.colab import files

# Manually select the Shakespeare train and test files (one upload per call).
# The return values get their own names instead of `train_path` / `test_path`:
# those two names are set to the file-name strings in the Parameters cell, and
# it is those strings that TextDataset later receives as `file_path`.
# NOTE(review): assumes the uploaded files are called train.txt and test.txt and
# end up in the working directory — confirm against the Colab upload behaviour.
uploaded_train = files.upload()
uploaded_test = files.upload()


## Parameters


In [5]:
# --- Data ---------------------------------------------------------------
train_path = "train.txt"  # text file passed to TextDataset for training
test_path = "test.txt"    # text file passed to TextDataset for evaluation
block_size = 128          # block_size argument of both TextDataset instances

# --- Model --------------------------------------------------------------
# NOTE: `model` is the checkpoint *name* here; later cells rebind `model` to the
# loaded model object, so re-run this cell before re-running those cells.
model = "gpt2"
model_path = "./gpt2"     # Trainer output_dir; the sampling section reloads from it

# --- Training (all forwarded to TrainingArguments) ----------------------
num_train_epochs = 3
per_device_train_batch_size = 32
per_device_eval_batch_size = 64
warmup_steps = 10
eval_steps = 20
save_steps = 40

## Preprocess data

In [6]:
# Load the GPT-2 tokenizer for the checkpoint named by `model` (still the string
# set in the Parameters cell at this point).
tokenizer = GPT2Tokenizer.from_pretrained(model)
# Fix the seed via transformers' set_seed helper before the datasets, model and
# training run are created.
set_seed(0)

## Prepare the data for our model

In [7]:
# Collator that batches examples for language modelling; mlm=False means no
# masked-language-model objective is applied.
data_collector = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# One TextDataset per split, both tokenized with the same tokenizer and block_size.
train_dataset = TextDataset(
    file_path=train_path,
    tokenizer=tokenizer,
    block_size=block_size,
)
test_dataset = TextDataset(
    file_path=test_path,
    tokenizer=tokenizer,
    block_size=block_size,
)



## Initialize model

In [8]:
# `model` arrives here as the checkpoint name (a str) and is rebound to the model
# object below. Remember the name so this cell can be re-run after the rebinding
# instead of failing on from_pretrained(<model object>).
if isinstance(model, str):
    model_name = model

# GPT2LMHeadModel replaces the deprecated AutoModelWithLMHead; it matches the
# GPT2Tokenizer used for the data and the GPT2LMHeadModel reload used for sampling.
model = GPT2LMHeadModel.from_pretrained(model_name)

# Training configuration; every value comes from the Parameters cell.
# NOTE(review): eval_steps is only passed through here — whether periodic
# evaluation actually runs depends on the evaluation-strategy setting of the
# installed transformers version; confirm. The recorded trainer.train() output
# shows global_step=15, below both eval_steps (20) and save_steps (40).
arguments = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    eval_steps=eval_steps,
    save_steps=save_steps,
    warmup_steps=warmup_steps,
)

# Trainer ties together the model, the arguments, the collator and both datasets.
trainer = Trainer(
    model=model,
    args=arguments,
    data_collator=data_collector,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



## Train and save model

In [9]:
# Run fine-tuning with the Trainer built above; the value displayed by this cell
# is the TrainOutput returned by train().
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=3.1195770263671876)

In [10]:
# Persist the fine-tuned model through the Trainer.
# NOTE(review): no path is passed — presumably this writes to the TrainingArguments
# output_dir (model_path), which is where the sampling section loads from; confirm.
trainer.save_model()

## Sampling methods


In [11]:
# Prompt that the generation cells below continue (note the trailing space).
# NOTE: the name `input` shadows Python's built-in input() for the rest of the
# session; renaming it requires updating the cell that encodes it as well.
input = "What is the meaning of life "

In [12]:
# Turn the prompt into a PyTorch tensor of token ids.
enc_input = tokenizer.encode(input, return_tensors="pt")

# Reload the model from model_path; from here on `model` refers to this reloaded
# copy. The pad token id is set to the tokenizer's EOS token id.
model = GPT2LMHeadModel.from_pretrained(
    model_path,
    pad_token_id=tokenizer.eos_token_id,
)

In [13]:
# greedy output
greedy_gen = model.generate(enc_input,
                            max_length=100
                            )
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"

"I am a man who lives by his own will. I am a man who has no other choice but to live by my own will. I am a man who has no other choice but to live by my own will. I am a man who has no other choice but to live by my own will. I am a man who has no other choice but to live by my own will. I am a man who has no other choice but


In [14]:
# Beam search with 5 beams and early stopping.
beam_gen = model.generate(
    enc_input,
    max_length=100,
    num_beams=5,
    early_stopping=True,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(beam_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"

"The meaning of life is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in harmony with nature. It is to live in


In [15]:
# Beam search (3 beams) with an n-gram penalty (no_repeat_ngram_size=2).
#
# To look at every beam instead of only the first one, additionally pass
# num_return_sequences=3 and print each returned sequence:
#   for i, beam in enumerate(beam_penalty_gen):
#       print(i, tokenizer.decode(beam, skip_special_tokens=True))
# (the author observed almost no difference between the beams, if any).
beam_penalty_gen = model.generate(
    enc_input,
    max_length=100,
    num_beams=3,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(beam_penalty_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"

"Life is a struggle for survival. It is not a fight for freedom, but for the freedom to live in harmony with nature and with the laws of the land. The struggle is for a better life for all people, and not for one group at a time. We are fighting to preserve the rights of all, not just those who are born into poverty. This is why we call on all to join us in this struggle, because it is


In [16]:
# Sampling conditioned on the prompt (do_sample=True).
# NOTE(review): top_k=0 is presumably there to switch off top-k filtering —
# confirm against the generate() documentation.
sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life??)," of course, men generally tend to shun these beliefs, and often struggle with them in various ways. Migraine naturally high in Migraine often results from our own failure attaining early, post-traumatic strength, or uncontrolled motor activity. In female patients, such as me, the resulting insomnia is often due to fluctuations in our own mental technologies and our inability to pursue the individualistic thoughts and actions we pursue today.24 "[B]ecause of] the


In [17]:
# Sampling conditioned on the prompt, this time with temperature=0.6
# (top_k=0 as in the plain sampling cell).
temperature_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(temperature_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?" he asked. "It is the destruction of the world." The students, he said, had been called upon to take up arms against the regime. "They were called on to do so because they believed that the government had taken over the country and that the people who had been against it had been overthrown by the people who had voted for it. But what was the meaning of life?" I asked. "The meaning of life is that there is no


In [18]:
# Top-k sampling: the probability mass is redistributed among the top-k
# candidates, and the next word is then drawn from that distribution.
top_k_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=40,  # same as standard gpt2
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(top_k_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?" (6)--A. The meaning of life comes from the idea that you are born to live and that every moment you are born is your work. We shall soon show how this concept came to be known as the Law of the Life Project. At least one chapter or two will deal with an important passage in the life of O. J. Simpson: "What would happen if he ever went to prison?" He replied: "I would certainly not be able


In [19]:
# Top-p sampling: similar to top-k, but the candidates are the smallest possible
# set whose cumulative probability is bigger than p; the probability mass is
# then redistributed over that set before sampling.
top_p_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
    top_p=0.9,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(top_p_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"

"About thirteen years ago. Here is a young woman about nineteen years old. She is writing in French about a people which was living before me. Many of them were in the Western districts. As soon as they came to the European countries, they showed no signs of changing. They were young men and they had not a college education. This young man, in many cases, was already a fairly well-known man, so he believed that the


In [20]:
# Top-p and top-k combined: top-k keeps very low ranked words out, while top-p
# still allows a dynamic selection among the remaining candidates.
top_pk_combination_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=40,
    top_p=0.95,
)

print("Output:\n" + "-" * 100)
print(tokenizer.decode(top_pk_combination_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"

"I can say nothing more than that there is no other meaning."

"Well then, then, I beg your pardon."

"And I also beg that you pardon our Lord Chancellor and Lord Marshal!"

"Yes, sir, of course I shall. I shall not go to any court against you. And, if you please, I will send an affidavit to the Governor, who will then give us the documents which will


In [21]:
# Evaluate with the Trainer on the eval_dataset it was constructed with
# (test_dataset). The Trainer still references the model object it was given at
# construction, not the copy reloaded from model_path for the sampling cells.
trainer.evaluate()

{'epoch': 3.0, 'eval_loss': 2.7671456336975098}