In [1]:
# Show which GPU (if any) is attached to this runtime before any heavy work starts.
!nvidia-smi

Mon Nov 30 18:12:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 12.6MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 48.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.2MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
import re
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
import numpy as np

In [None]:
from google.colab import files
# Two Colab upload dialogs: pick the Shakespeare train file first, then the test file.
# files.upload() hands back the uploaded content, not a filesystem path, so the results
# get their own names; train_path / test_path are defined later as plain filename strings.
uploaded_train = files.upload()
uploaded_test = files.upload()


Saving train.txt to train.txt


Saving test.txt to test.txt


## Preprocess data

In [None]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel, set_seed

# Seed the random number generators up front so later stochastic steps start from a known state.
set_seed(0)

# File names of the two text files that were uploaded into the working directory.
train_path, test_path = 'train.txt', 'test.txt'

# Tokenizer of the pre-trained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




## Prepare the data for our model

In [None]:
def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the plain-text training file.
        test_path: Path of the plain-text evaluation file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Block size forwarded to each ``TextDataset``. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    # Both splits are built the same way; only the source file differs.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # mlm=False: the collator is not asked to prepare masked-LM batches.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return train_dataset, test_dataset, data_collator

# Materialise both datasets and the collator from the uploaded text files.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path, test_path, tokenizer
)



## Initialize model

In [None]:
# Start from the pre-trained "gpt2" checkpoint; this is the model handed to the Trainer below.
model = AutoModelWithLMHead.from_pretrained("gpt2")


training_args = TrainingArguments(
    # NOTE: the directory name is a leftover label (the corpus here is Shakespeare);
    # it is kept because the sampling section reloads the model from this exact path.
    output_dir="./gpt2-trump",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    # Without an explicit strategy no evaluation runs during training and eval_steps
    # has no effect (the recorded training log only shows the training loss).
    evaluation_strategy="steps",
    eval_steps=20,  # number of update steps between two evaluations
    save_steps=40,  # a checkpoint is saved every 40 update steps
    warmup_steps=10,  # number of warmup steps for the learning-rate scheduler
)


# Wire the model, the hyper-parameters and both datasets into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    prediction_loss_only=True,  # evaluation is expected to report the loss only
)



## Train and Save model

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args.
trainer.train()

Step,Training Loss


TrainOutput(global_step=460, training_loss=2.750154976222826)

In [None]:
# Persist the fine-tuned model; the sampling section reloads it from "./gpt2-trump",
# i.e. the output_dir configured in training_args.
trainer.save_model()

## Sampling methods


In [None]:
# Prompt that every decoding strategy below continues.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the
# session; it is left unchanged because the encoding cell reads this exact name.
input = "What is the meaning of life "

In [None]:
# Reload the fine-tuned weights from disk; the EOS token id doubles as the padding id.
model = GPT2LMHeadModel.from_pretrained(
    "./gpt2-trump",
    pad_token_id=tokenizer.eos_token_id,
)

# Encode the prompt into a PyTorch tensor of token ids.
enc_input = tokenizer.encode(input, return_tensors='pt')

In [None]:
# greedy output
greedy_gen = model.generate(enc_input,
                            max_length=100
                            )
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
    That you, my lord, have so much to do with me?\
    I have no power to do with you.\
    I have no power to do with you.\
    I have no power to do with you.\
    I have no power to do with you.\
    I have no power to do with you.\
  


In [None]:
# Beam search with 5 beams and early stopping enabled.
beam_gen = model.generate(
    enc_input,
    max_length=100,
    num_beams=5,
    early_stopping=True,
)

# Header and a 100-character rule, then the first returned sequence.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(beam_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
    That you do not know it?\
\
  Rom. I do not know it.\
\
  Jul. I do not know it.\
\
  Rom. I do not know it.\
                                        


In [None]:
# Beam search (3 beams) with an n-gram penalty: no 2-gram may appear twice.
#
# Optional illustration: add num_return_sequences=3 to the call and print every
# returned beam instead of only the first one. The beams show almost no difference,
# if any at all:
#   for i, beam in enumerate(beam_penalty_gen):
#       print(i, tokenizer.decode(beam, skip_special_tokens=True))
beam_penalty_gen = model.generate(
    enc_input,
    max_length=100,
    num_beams=3,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

# Header and a 100-character rule, then the first returned sequence.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(beam_penalty_gen[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life?"\
  Ham. I do not know, my lord, but I think it is a question of the nature of things.   \
Exit. [Exeunt Hamlet and Guildenstern.]  O, what is this, that, when I speak to you, I say, 'I am a man'?  [Aside.] 'A man,' you say? 'Tis not a word, sir.'  Grief,


In [None]:
# Conditional sampling: draw each next token at random, with top_k set to 0.
sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
)

# Header and a 100-character rule, then the sampled continuation.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
    In ten wars and up to five thousand wars? With the stars in the night?\
  MACBETH. They saw us.\
  DUNCAN. For a man, death cannot stand for days.\
  MACBETH. As if Romeo were a beetle-\
    Scouring this beetle's cave with his boar's ear. Go yonder?\
  MACBETH.


In [None]:
# Conditional sampling as before (top_k set to 0), now with a temperature of 0.6.
temperature_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

# Header and a 100-character rule, then the sampled continuation.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(temperature_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
    That you do not know the word\
    With which you speak?\
  DUNCAN. I know the word.\
  MALCOLM. So you know it well.\
  MACBETH. I know the word well.\
  DUNCAN. Gentlemen, I'll tell you the matter.\
    For we have a feeling\
   


In [None]:
# Top-k sampling: the probability mass is redistributed among the k most likely
# tokens, and the next word is then drawn from that distribution.
top_k_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=40,  # same as standard gpt2
)

# Header and a 100-character rule, then the sampled continuation.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(top_k_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
How many sorrows this world shall bear,\
Who will be buried in this earth?\
    O, what joys this very earth!\
    I can not, though it is heaven,\
    Not to hear of the ungainst man,\
    But to say I was in awe of heaven\
    When I saw him come to speak\
 


In [None]:
# Top-p sampling: like top-k, but the candidates are the smallest set of tokens whose
# cumulative probability exceeds p; the mass is then redistributed over that set and
# the next word is sampled from it.
top_p_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=0,
    top_p=0.9,
)

# Header and a 100-character rule, then the sampled continuation.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(top_p_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
That grows stale even within?\
\
COMINIUS.\
Not we.\
\
MARTINIUS.\
Let it not be, but my slave I should die.\
I am rather here-\
So should thou art here!\
Let my mutinous folly be-    To live here I must die.\
'Tis not too late.\
\



In [None]:
# Top-p and top-k combined: top-k keeps very low-ranked words out, while top-p still
# allows the size of the candidate set to vary dynamically.
top_pk_combination_sampling = model.generate(
    enc_input,
    max_length=100,
    do_sample=True,
    top_k=40,
    top_p=0.95,
)

# Header and a 100-character rule, then the sampled continuation.
print("Output:", 100 * '-', sep="\n")
print(tokenizer.decode(top_pk_combination_sampling[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
What is the meaning of life  \
To tell us what our true desires are?\
\
MENENIUS.\
I have heard the good old man talk of 'tis.\
\
MARCIUS.\
What's your pleasure?\
\
MENENIUS.\
Th' use to me at once. I should have thought it in the time of\
The Roman wars;\
And, as we say, in the


In [None]:
# Evaluate on the eval_dataset given to the Trainer (the test split) and show the resulting metrics.
trainer.evaluate()

{'epoch': 10.0, 'eval_loss': 3.206249237060547}