Based on the Hugging Face causal language modeling guide: https://huggingface.co/docs/transformers/tasks/language_modeling

# data loading

In [None]:
from datasets import load_dataset

# Load every split of the ELI5 dataset. This result is only used to inspect the
# available splits; `eli5` is reassigned to a small subset in a later cell.
eli5 = load_dataset("eli5")

DatasetDict({
    train_eli5: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 272634
    })
    validation_eli5: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 9812
    })
    test_eli5: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 24512
    })
    train_asks: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 131778
    })
    validation_asks: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 2281
    })
    test_asks: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 4462
    })
    train_askh: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 98525
    })
    validation_askh: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 4901
    })
    test_askh: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 9764
    })
})

In [None]:
# Keep only the first 5000 rows of the `train_asks` split.
eli5 = load_dataset("eli5", split="train_asks[:5000]")

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore every downstream metric, reproducible across kernel restarts.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training example.
eli5['train'][0]

![](eli5_example.png)

In [None]:
# Flatten nested features so that fields such as `answers.text` become their own columns.
eli5 = eli5.flatten()

In [None]:
# Inspect the first training example again, now with the flattened column names.
eli5['train'][0]

{'q_id': '1oy5tc', 'title': 'in football whats the point of wasting the first two plays with a rush - up the middle - not regular rush plays i get those', 'selftext': '', 'document': '', 'subreddit': 'explainlikeimfive', 'answers.a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'ccwvj0u'], 'answers.text': ["Keep the defense honest, get a feel for the pass rush, open up the passing game. An offense that's too one dimensional will fail. And those rushes up the middle can be busted wide open sometimes for big yardage.", "If you throw the ball all the time, then the defense will adapt to always cover for a pass.  By doing a simple running play every now and then, you force the defense to stay close and guard against the run.  Sometimes, the offense can catch the defense off guard by faking a run and freeing up their receivers.\n\nAlso, you don't have to gain massive yards on every single play.  Sometimes, it works best to gain a few yards at a time.  As long as you get the first down, you are in good shape.", 'In most cases the O-Line is supposed to make a hole for the running back to go through. If you run too many plays to the outside/throws the defense will catch on.\n\nAlso, 2 5 yard plays gets you a new set of downs.', "I you don't like those type of plays, watch CFL.  We only get 3 downs so you can't afford to waste one.  Lots more passing."], 'answers.score': [3, 2, 2, 2], 'title_urls.url': [], 'selftext_urls.url': [], 'answers_urls.url': []}

# load the pre-trained tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the "distilgpt2" checkpoint, the same checkpoint the model is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# data processing

In [None]:
def preprocess_function(examples):
    """Tokenize a batch after merging each example's answers into one string.

    Every entry of ``examples["answers.text"]`` is joined with single spaces,
    and the joined texts are handed to the module-level ``tokenizer`` in a
    single call. The tokenizer's output is returned unchanged.
    """
    joined_answers = []
    for answer_list in examples["answers.text"]:
        joined_answers.append(" ".join(answer_list))
    return tokenizer(joined_answers)

In [None]:
# Tokenize the dataset in batches using 4 worker processes. All original columns
# are removed, so only the columns returned by preprocess_function remain.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

In [None]:
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (it must include ``"input_ids"``) to a
            list of per-example token lists.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        ``block_size`` tokens, plus a ``"labels"`` key holding a shallow copy of
        the ``"input_ids"`` chunk list. When the batch contains at least
        ``block_size`` tokens, the trailing remainder that does not fill a whole
        block is dropped. When it contains fewer, the tokens are kept as a single
        shorter chunk.
    """
    # Concatenate all texts. A flattening comprehension is linear in the number of
    # tokens, whereas sum(list_of_lists, []) re-copies the accumulator on every
    # addition and is quadratic in the number of examples.
    concatenated_examples = {
        k: [token for sequence in examples[k] for token in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a multiple of block_size. Padding could be used instead of
    # dropping the remainder if the model supported it; customize as needed.
    # Batches shorter than one block are left untouched so they are not lost.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply group_texts to the tokenized dataset in batches, using 4 worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

![](causal_processed_data.png)

# load the pre-trained model

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

import torch

# Use a GPU when one is available and fall back to the CPU otherwise. A hard-coded
# index such as "cuda:7" raises on any host with fewer than eight GPUs. To pin a
# specific GPU, set CUDA_VISIBLE_DEVICES before starting the kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: no masked-language-modeling objective, matching the causal LM loaded for this notebook.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# building the trainer

In [None]:
# Training configuration: evaluation is scheduled per epoch, with the given
# learning rate and weight decay. Everything else keeps the library defaults.
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Train on the "train" split and evaluate on the held-out "test" split of lm_dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

![](causal_data_size.png)

# train the model

In [None]:
# Run fine-tuning with the trainer configured in the previous section.
trainer.train()

![](causal_training.png)

# model evaluation

In [None]:
import math

# Evaluate on the trainer's eval dataset and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# model saving

In [None]:
# Save the fine-tuned model to a local directory.
trainer.save_model(output_dir="causal_model_trained")

# model loading

In [None]:
from transformers import AutoTokenizer

# The tokenizer is reloaded from the base "distilgpt2" checkpoint rather than from
# the saved directory. NOTE(review): the Trainer above was built without a tokenizer,
# so presumably none was written to "causal_model_trained" — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the fine-tuned model from the directory written by trainer.save_model.
model = AutoModelForCausalLM.from_pretrained("causal_model_trained")

# inference

In [None]:
from transformers import pipeline

# Wrap the reloaded model and tokenizer in a text-generation pipeline.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Prompt passed to the text-generation pipeline.
prompt = "Somatic hypermutation allows the immune system to"

generator(prompt)

![](causal_result.png)

# end