##  NLP Story Generation

In this notebook a transformer-based language model is finetuned, using a subset of the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset.

In [77]:
# !pip install datasets



In [78]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [79]:
# device = "cuda"
device = "cpu"

In [80]:
# Fetch GPT2 model and tokenizer from Huggingface
model_name = "gpt2"

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [81]:
# Test the model to tell stories
prompt = "Once upon a time"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(inputs.input_ids, max_length = 50, do_sample=True, temperature=0.5, top_p=0.92, no_repeat_ngram_size=2)

output_string = tokenizer.batch_decode(outputs)
output_string

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Once upon a time, the only thing that mattered was that you were a member of the team, and that was all that matters.\n\n"I\'m not saying I\'m going to change my name, I don\'t think I should. I']

In [82]:
# Get the TinyStories dataset from Huggingface, only use a subset
tiny_stories_ds = load_dataset("roneneldan/TinyStories", split="train[:5000]")
tiny_stories_ds = tiny_stories_ds.train_test_split(train_size=0.8)

tiny_stories_ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [83]:
# Tokenize dataset, only consider the first 1024 characters of each story
def preprocess_function(batch):
    batch_strings = batch["text"]
    return tokenizer([x[:1024] for x in batch_strings])

tokenized_ds = tiny_stories_ds.map(
    preprocess_function,
    batched=True,
    batch_size=12,
    remove_columns=tiny_stories_ds["train"].column_names
)

tokenized_ds

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [84]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)

In [85]:
training_args = TrainingArguments(
    output_dir="./output/tiny_stories1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=data_collator
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,2.0964,1.926138
2,1.93,1.871802
3,1.8598,1.843801
4,1.8128,1.827453
5,1.7738,1.817532
6,1.7442,1.809587
7,1.7211,1.804481
8,1.7045,1.801717
9,1.6911,1.800319
10,1.6819,1.799545


TrainOutput(global_step=5000, training_loss=1.8015656860351563, metrics={'train_runtime': 3331.8232, 'train_samples_per_second': 12.005, 'train_steps_per_second': 1.501, 'total_flos': 5214613248000000.0, 'train_loss': 1.8015656860351563, 'epoch': 10.0})

In [87]:
# Load fine-tuned model
story_model = AutoModelForCausalLM.from_pretrained("./output/tiny_stories1/checkpoint-5000").to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [91]:
# Inference
prompt = "Alice was so tired when she got back home so she went"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = story_model.generate(
    inputs.input_ids,
    max_length=100,
    do_sample=True,
    temperature=0.5,
    top_p=0.9,
    no_repeat_ngram_size=2,
    eos_token_id=tokenizer.eos_token_id
    )

output_string = tokenizer.batch_decode(outputs)
output_string

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Alice was so tired when she got back home so she went to bed. She woke up feeling much better than usual and felt refreshed.\n\nThe next morning, she saw a big box in the living room. It was a special box that she had never seen before. Inside was lots of shiny things, some of which were magical. \n Her mom asked her if she wanted to play with them, but she said no. So she asked the other kids if they wanted her to take']

In [93]:
import math

eval_res = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_res['eval_loss']):.2f}")
print(eval_res)

Perplexity: 6.05
{'eval_loss': 1.799545407295227, 'eval_runtime': 22.1455, 'eval_samples_per_second': 45.156, 'eval_steps_per_second': 5.644, 'epoch': 10.0}
