In [None]:
# Create env
# python3 -m venv .venv
# source .venv/bin/activate
# pip install torch transformers pandas datasets "transformers[toruch]"


In [3]:
from transformers import AutoTokenizer , AutoModelForCausalLM
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

model

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
prompt = "Once upon a time"

inputs = tokenizer(prompt , return_tensors="pt")

outputs = model.generate(inputs.input_ids,max_new_tokens=100,do_sample=True,top_k=50,top_p=0.95)

outputs_string = tokenizer.batch_decode(outputs)

outputs_string

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['Once upon a time, we find that there is no place to stand, in a place of beauty, with this kind of beauty, that exists."\n\n"How does the world compare with this?\n"Let me tell you," she exclaimed. "It is not."\n"A better word, dear, we can all laugh, you know, and laugh with dignity, in our day, but in the present, we are living in a way that we have not yet, of necessity."<|endoftext|>']

In [5]:
# import dataset
from datasets import load_dataset

short_stories_dataset = load_dataset("roneneldan/TinyStories")
short_stories_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [6]:
short_stories_dataset["train"][0]

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}

In [7]:
# get a smaller data

samll_story_dataset =  load_dataset("roneneldan/TinyStories",split="train[:1000]")
samll_story_dataset = samll_story_dataset.train_test_split(train_size=0.8)

samll_story_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text'],
        num_rows: 200
    })
})

In [8]:
[len(x["text"].split(" ")) for x in samll_story_dataset["train"] ]

[199,
 132,
 215,
 134,
 330,
 164,
 232,
 103,
 325,
 89,
 166,
 119,
 198,
 128,
 137,
 141,
 179,
 119,
 140,
 108,
 137,
 160,
 163,
 190,
 166,
 149,
 145,
 196,
 109,
 342,
 146,
 141,
 196,
 179,
 142,
 140,
 335,
 178,
 116,
 129,
 165,
 135,
 131,
 138,
 138,
 158,
 142,
 125,
 144,
 134,
 265,
 215,
 159,
 200,
 246,
 184,
 141,
 203,
 214,
 86,
 462,
 435,
 119,
 192,
 441,
 164,
 293,
 374,
 94,
 183,
 317,
 129,
 151,
 271,
 127,
 270,
 158,
 94,
 299,
 381,
 244,
 140,
 130,
 142,
 114,
 127,
 332,
 176,
 164,
 262,
 150,
 150,
 162,
 138,
 139,
 138,
 157,
 66,
 140,
 252,
 318,
 309,
 215,
 157,
 161,
 187,
 161,
 319,
 163,
 341,
 159,
 116,
 109,
 150,
 102,
 116,
 190,
 144,
 111,
 149,
 125,
 137,
 231,
 110,
 141,
 171,
 107,
 82,
 210,
 158,
 320,
 136,
 161,
 91,
 106,
 168,
 131,
 116,
 267,
 193,
 119,
 112,
 183,
 198,
 170,
 323,
 136,
 153,
 143,
 130,
 149,
 133,
 164,
 133,
 147,
 189,
 183,
 176,
 112,
 188,
 181,
 106,
 165,
 84,
 305,
 129,
 123,
 90,
 

In [9]:
# Tokenizer dataset
def preporocess_batch(batch):
    all_text_items = batch["text"]
    trimed_text_items = [x[:500] for x in all_text_items]
    return tokenizer(trimed_text_items)

tokenizeed_dataset = samll_story_dataset.map(
    preporocess_batch,
    batched=True,
    batch_size=10,
    remove_columns=samll_story_dataset["train"].column_names
)

tokenizeed_dataset

Map: 100%|██████████| 800/800 [00:00<00:00, 2273.43 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1889.35 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [10]:
tokenizeed_dataset["train"][0]

{'input_ids': [7454,
  2402,
  257,
  640,
  612,
  373,
  257,
  32202,
  3706,
  5811,
  13,
  679,
  5615,
  287,
  257,
  1263,
  3881,
  416,
  262,
  5417,
  13,
  1881,
  1110,
  11,
  5811,
  373,
  6155,
  1863,
  262,
  10481,
  290,
  339,
  2497,
  257,
  1310,
  2576,
  2712,
  13,
  383,
  2576,
  3114,
  845,
  3772,
  11,
  523,
  5811,
  3066,
  284,
  12589,
  607,
  13,
  198,
  198,
  18861,
  531,
  11,
  366,
  15496,
  2474,
  220,
  198,
  464,
  2576,
  13541,
  290,
  531,
  11,
  366,
  17250,
  612,
  2474,
  220,
  198,
  198,
  18861,
  1965,
  11,
  366,
  2061,
  389,
  345,
  1804,
  1701,
  220,
  198,
  464,
  2576,
  531,
  11,
  366,
  40,
  1101,
  2045,
  329,
  21547,
  12758,
  82,
  13,
  1119,
  389,
  523,
  2495,
  11,
  290,
  6029,
  1165,
  2474,
  220,
  198,
  198,
  6423,
  5811,
  531,
  11,
  366,
  40,
  760,
  286,
  257,
  1295,
  326,
  468,
  6041,
  286,
  21547,
  12758,
  82,
  11,
  826,
  1474,
  616,
  3881,
  1363,
  13,


In [12]:
# Create a data collector
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collector = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)
data_collector

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [13]:
# Create out trainer 
from transformers import Trainer , TrainingArguments

training_args = TrainingArguments(
    output_dir = "./output",
    evaluation_strategy= "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10
)

trainer = Trainer (
    model=model,
    train_dataset= tokenizeed_dataset["train"],
    eval_dataset= tokenizeed_dataset["test"],
    args=training_args,
    data_collator=data_collector
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [14]:
# start train 
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                                    
 10%|█         | 100/1000 [12:06<1:42:21,  6.82s/it]

{'eval_loss': 2.3971805572509766, 'eval_runtime': 51.2137, 'eval_samples_per_second': 3.905, 'eval_steps_per_second': 0.488, 'epoch': 1.0}


 12%|█▏        | 116/1000 [13:57<1:44:31,  7.09s/it]

In [None]:
# load model from the last checkpoints

model = AutoModelForCausalLM.from_pretrained("./output/check")
model

In [None]:
prompt = "Once upon a time"

inputs = tokenizer(prompt , return_tensors="pt")

outputs = model.generate(inputs.input_ids,max_new_tokens=100,do_sample=True,top_k=50,top_p=0.95)

outputs_string = tokenizer.batch_decode(outputs)

outputs_string