In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# initialize tokenizer and model from pretrained GPT2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [37]:
sentence = 'Generative pre-trained transformers are a family of language models generally trained on a large corpus of text data to generate human-like text. They are built using several blocks of the transformer architecture. They can be fine-tuned for various natural language processing tasks such as text generation, language translation, and text classification.'

In [38]:
inputs = tokenizer.encode(sentence, return_tensors='pt')
inputs

tensor([[ 8645,   876,   662,    12, 35311,  6121,   364,   389,   257,  1641,
           286,  3303,  4981,  4143,  8776,   319,   257,  1588, 35789,   286,
          2420,  1366,   284,  7716,  1692,    12,  2339,  2420,    13,  1119,
           389,  3170,  1262,  1811,  7021,   286,   262, 47385, 10959,    13,
          1119,   460,   307,  3734,    12, 28286,   276,   329,  2972,  3288,
          3303,  7587,  8861,   884,   355,  2420,  5270,    11,  3303, 11059,
            11,   290,  2420, 17923,    13]])

In [39]:
# we pass a maximum output length of 200 tokens
outputs = model.generate(inputs, max_length=200, do_sample=True)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 8645,   876,   662,    12, 35311,  6121,   364,   389,   257,  1641,
           286,  3303,  4981,  4143,  8776,   319,   257,  1588, 35789,   286,
          2420,  1366,   284,  7716,  1692,    12,  2339,  2420,    13,  1119,
           389,  3170,  1262,  1811,  7021,   286,   262, 47385, 10959,    13,
          1119,   460,   307,  3734,    12, 28286,   276,   329,  2972,  3288,
          3303,  7587,  8861,   884,   355,  2420,  5270,    11,  3303, 11059,
            11,   290,  2420, 17923,    13,   198,   198,  7003,   517,  3716,
           621, 10224,   662,    12, 35311,  6121,   364,    11,   777,  2512,
           286,  6121,   364,   481,  1249,   329, 12846,   290,  6942, 11059,
           286,  3288,  3303,  7587,  4133,   884,   355,  2420,   284,   517,
          3716,  5479,   319,   257,  4025,  5046,    13,  1119,   423,   867,
         13391,   290,   867, 38457,    13,  1119,   743,   307,  3614,   416,
           511,  2176,  2854,  4661,    11,   543,  

In [40]:
# Decoding
# Our generate step outputs an array of tokens rather than words. 
# To convert these tokens into words, we need to .decode them.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
text


'Generative pre-trained transformers are a family of language models generally trained on a large corpus of text data to generate human-like text. They are built using several blocks of the transformer architecture. They can be fine-tuned for various natural language processing tasks such as text generation, language translation, and text classification.\n\nAlthough more complex than conventional pre-trained transformers, these block of transformers will allow for flexible and efficient translation of natural language processing resources such as text to more complex applications on a larger scale. They have many advantages and many disadvantages. They may be limited by their specific performance goals, which are highly variable. In most cases, however, their optimization is the key determining factor and that can give advantages over regular pre-trained transformers if the optimized block is applied to highly complex problems such as image processing.\n\nThe term "process and translat

In [42]:
tokenizer.decode(outputs[0], skip_special_tokens=True, temperature=1, top_k=50)
# We can add more randomness with temperature — the default value is 1, 
# a high value like 5 will produce a pretty nonsensical output:


# add the top_k parameter — which limits the sample tokens to a given number of the most probable tokens. 
# This results in text that tends to stick to the same topic (or set of words) for a longer period of time.

'Generative pre-trained transformers are a family of language models generally trained on a large corpus of text data to generate human-like text. They are built using several blocks of the transformer architecture. They can be fine-tuned for various natural language processing tasks such as text generation, language translation, and text classification.\n\nAlthough more complex than conventional pre-trained transformers, these block of transformers will allow for flexible and efficient translation of natural language processing resources such as text to more complex applications on a larger scale. They have many advantages and many disadvantages. They may be limited by their specific performance goals, which are highly variable. In most cases, however, their optimization is the key determining factor and that can give advantages over regular pre-trained transformers if the optimized block is applied to highly complex problems such as image processing.\n\nThe term "process and translat