In [1]:
from datasets import load_dataset

# Load the TinyStories dataset
dataset = load_dataset("roneneldan/TinyStories")

# Print dataset information
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


In [2]:
# Function to sample 5% of the dataset with optional seed and offset
def sample_five_percent(dataset_split, seed=42, offset=0):
    total_size = len(dataset_split)
    five_percent_size = total_size // 20  # 5% of the dataset
    shuffled = dataset_split.shuffle(seed=seed)
    return shuffled.select(range(offset, offset + five_percent_size))

# Sample 5% from train, validation, and test splits
train_data = sample_five_percent(dataset['train'], seed=42)
val_data   = sample_five_percent(dataset['validation'], seed=123, offset=0)
test_data  = sample_five_percent(dataset['validation'], seed=123, offset=len(val_data))  # next 5%

# Check sizes
print(f"Train size (5%): {len(train_data)}")
print(f"Validation size (5%): {len(val_data)}")
print(f"Test size (5%): {len(test_data)}")

# Show first five samples from val_data
print("\n=== First 5 validation samples ===")
for i in range(5):
    print(f"{i + 1}: {val_data[i]}")

# Show first five samples from test_data
print("\n=== First 5 test samples ===")
for i in range(5):
    print(f"{i + 1}: {test_data[i]}")


Train size (5%): 105985
Validation size (5%): 1099
Test size (5%): 1099

=== First 5 validation samples ===
1: {'text': 'Once upon a time, there was a princess who had an awful problem. She hated her weight. Everywhere she went she felt so sorry and sad.\n\nThe princess wanted so badly to be thin like the other animals in her kingdom. She tried to do everything to make her weight go away - she ate less and ran and jumped, but nothing worked.\n\nOne day, something magical happened. A magical fairy appeared and waved her wand and said, "No more weight for the princess". And, just like that, the princess was free from her weight.\n\nThe princess was so, so happy. She thanked the fairy again and again and ran off dancing and singing.\n\nThe princess never worried about her weight again, she was so happy, and thanked the magical fairy every day.'}
2: {'text': "Once upon a time, there was a little boy named Timmy. Timmy was very hungry and wanted to eat a cookie. But his mom said he had to e

In [4]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set padding token to EOS token to avoid warnings
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Function to tokenize and prepare inputs/labels
def tokenize_function_with_labels(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",  # Pad to a fixed length for batches
        max_length=512         # Set maximum length for sequences
    )
    # Add labels (same as input_ids for language modeling)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize train and validation datasets
train_dataset = train_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])
val_dataset = val_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])

# Verify tokenized data structure
print(train_dataset[0])

{'input_ids': [14967, 290, 32189, 588, 284, 711, 287, 262, 3952, 13, 1119, 766, 257, 1263, 3430, 319, 262, 2323, 13, 632, 318, 7586, 290, 890, 290, 4334, 13, 198, 198, 1, 8567, 11, 257, 3430, 2474, 5045, 1139, 13, 366, 40, 460, 10303, 340, 2474, 198, 198, 1544, 8404, 284, 10303, 262, 3430, 11, 475, 340, 318, 1165, 5802, 13, 679, 8953, 866, 290, 10532, 262, 3430, 13, 198, 198, 1, 46, 794, 2474, 339, 1139, 13, 366, 2504, 5938, 2474, 198, 198, 44, 544, 22051, 13, 1375, 318, 407, 1612, 11, 673, 655, 6834, 340, 318, 8258, 13, 198, 198, 1, 5756, 502, 1949, 2474, 673, 1139, 13, 366, 40, 460, 5236, 340, 2474, 198, 198, 3347, 11103, 510, 262, 3430, 290, 7584, 340, 319, 607, 1182, 13, 1375, 11114, 6364, 290, 7773, 13, 1375, 857, 407, 2121, 866, 13, 198, 198, 1, 22017, 2474, 5045, 1139, 13, 366, 1639, 389, 922, 379, 22486, 2474, 198, 198, 1, 10449, 345, 2474, 32189, 1139, 13, 366, 1026, 318, 1257, 2474, 198, 198, 2990, 1011, 4962, 22486, 262, 3430, 319, 511, 6665, 11, 5101, 11, 290, 7405, 13, 111

In [6]:
from transformers import GPT2LMHeadModel

# Load GPT-2 model with a language modeling head
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize embeddings to account for padding token
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [7]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-tinystories3",  # Directory to save model checkpoints
    evaluation_strategy="steps",     # Evaluate at specific intervals
    eval_steps=500,                  # Evaluate every 500 steps
    learning_rate=5e-5,              # Learning rate for optimizer
    weight_decay=0.01,               # Regularization strength
    per_device_train_batch_size=4,   # Training batch size
    per_device_eval_batch_size=4,    # Evaluation batch size
    num_train_epochs=3,              # Total number of training epochs
    save_strategy="steps",           # Save model at specific intervals
    save_steps=500,                  # Save model every 500 steps
    logging_dir="./logs",            # Directory for training logs
    save_total_limit=3,              # Limit total number of saved checkpoints
    load_best_model_at_end=True,     # Load best model after training
    save_safetensors=False
)






In [12]:
pip install tf-keras


Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting numpy<2.2.0,>=1.26.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------------------------ --------- 1.3/1.7 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 4.5 MB/s eta 0:00:00
Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl (12.9 MB)
Installing collected packages: numpy, tf-keras
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
Successfully installed numpy-2.1.3 tf-keras-2.19.0
Note: you may need to restart the kernel to 

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ultralytics 8.3.77 requires numpy<=2.1.1,>=1.23.0, but you have numpy 2.1.3 which is incompatible.
tensorflow-intel 2.13.1 requires keras<2.14,>=2.13.1, but you have keras 3.9.0 which is incompatible.
tensorflow-intel 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 2.1.3 which is incompatible.
tensorflow-intel 2.13.1 requires tensorboard<2.14,>=2.13, but you have tensorboard 2.19.0 which is incompatible.
tensorflow-intel 2.13.1 requires tensorflow-estimator<2.14,>=2.13.0, but you have tensorflow-estimator 2.12.0 which is incompatible.
tensorflow-intel 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.13.2 which is incompatible.

[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To up

In [14]:
from transformers import EarlyStoppingCallback

# Add EarlyStoppingCallback to stop training if validation loss does not improve
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

In [8]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    
)

  trainer = Trainer(


In [9]:
# Train the model
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,0.8837,0.768094
1000,0.7998,0.740806
1500,0.7697,0.723584
2000,0.7646,0.709567
2500,0.7459,0.699948
3000,0.7331,0.692197
3500,0.7324,0.68628
4000,0.7284,0.680139
4500,0.7225,0.675569
5000,0.7199,0.671114


TrainOutput(global_step=79491, training_loss=0.6278838022936559, metrics={'train_runtime': 37723.3402, 'train_samples_per_second': 8.429, 'train_steps_per_second': 2.107, 'total_flos': 8.307910803456e+16, 'train_loss': 0.6278838022936559, 'epoch': 3.0})

In [10]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "Once upon a time in a magical forest"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Story:
Once upon a time in a magical forest, there lived a brave knight. Everyday he would wear his special helmet. Everyday he would set off on an adventure. He would explore the forest and discover new things.

One day the knight heard a strange sound. It was thundering and it made him very frightened. He was about to go home but his helmet saved him! He slowly put it on and before he knew it, he was safe.

The knight decided that it was time to set and to go home without any worries. He ran back and forth but nothing happened but he knew that he had been brave. 

Finally, when the knight was safe by a tree, he thanked God and promised to always keep his


In [11]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "One day I was walking in an ancient desert when"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0


Generated Story:
One day I was walking in an ancient desert when dark clouds started to pour down. It was pouring rain, and it was getting cold. Suddenly a thunderstorm came and it rained. It's power was out and the field was so empty. But then a tall figure came out of the clouds. It was a giant elephant!

The elephant looked and he said 'What are you doing?'
The poor elephant looked down and said 'I'm pouring electricity!'
The elephant looked out of the window and said 'Oh! That's why I look out to the sky, I'm able to help protect plants and plants from their worst fears.'

The elephant smiled. With little help and thoughtfulness, he had saved the day


In [12]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a lion with a weird big nose"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0


Generated Story:
I know a lion with a weird big nose. He likes to dance and sing! He dances in the dirt and the grass.

One day, a little boy sees the lion. He loves lions and he wants to dance with him.

The lion says, "Come, little boy! dance with me here!" The little boy is excited and says, "Yes! Let's dance!"

So, the lion and the little boy go to the dirt and dance together. They dance loud and fast with their little nose! It makes them feel happy and funny.

Soon, they are fast asleep and the lion and the little boy dance. They dance together in the dirt and the grass until the sun goes down.

The little boy and the lion never catch another lion again. They dance happily together in the middle of the night.


In [13]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a giraffe with a little neck"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0


Generated Story:
I know a giraffe with a little neck. He liked to play in the sunshine. One day, the hill started to shake, and the giraffe felt very frightened. The other animals ran away from him.

"Help us, giraffe!" one duck said. "We must find a way past it."

The other animals looked for a way, but they couldn't find one. The giraffe felt very sad and lonely. He started to yawn. 

The giraffe slowly walked back to his forest. As he walked, he noticed that some of his friends were trying to cross the bridge. They were shaking over and over again. The giraffe was not scared anymore. 

The giraffe stopped yawning and thought to himself, "I can handle this alone. Maybe one day, another way will help us."


In [14]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a leon with a cute tail"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0


Generated Story:
I know a leon with a cute tail. It was a rare leon because it always saw the sun shining on it. It was very happy with its tail and flew around in the sky.

One day, the leon saw something bright on the ground. It saw a tiny bird. The leon thought the bird was nice, so it flew closer to it. The bird cooed with it and the leon saw that it was trying to learn.

The leon felt happy that the bird liked it. The leon was happy too because it had a good friend and their love. They played together, and the leon taught the bird to like it and share its love with everyone.


In [17]:
# Evaluate the model on the validation set
print("Evaluating model on validation set...")

eval_results = trainer.evaluate()

print("=== VALIDATION SET EVALUATION RESULTS ===")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")




Evaluating model on validation set...


=== VALIDATION SET EVALUATION RESULTS ===
eval_loss: 0.5725
eval_runtime: 40.0149
eval_samples_per_second: 27.4650
eval_steps_per_second: 6.8720
epoch: 3.0000


NameError: name 'np' is not defined

In [18]:
import numpy as np

# Calculate perplexity
perplexity = np.exp(eval_results["eval_loss"])
print(f"\nPerplexity: {perplexity:.2f}")

# Interpret model quality based on perplexity
if perplexity < 50:
    quality = "Excellent"
elif perplexity < 100:
    quality = "Good"
elif perplexity < 200:
    quality = "Fair"
else:
    quality = "Needs Improvement"

print(f"Model Quality: {quality}")


Perplexity: 1.77
Model Quality: Excellent


In [None]:
from transformers import pipeline
from collections import Counter
import numpy as np

def compute_distinct_n(texts, n=1):
    ngrams = []
    for text in texts:
        tokens = text.strip().split()
        ngrams.extend([tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)])
    total = len(ngrams)
    unique = len(set(ngrams))
    return unique / total if total > 0 else 0.0


In [6]:
import os
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
!pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting scipy (from sentence_transformers)
  Using cached scipy-1.16.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting Pillow (from sentence_transformers)
  Downloading pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence_transformers)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Downloading pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.

In [7]:
# Update model_path with your checkpoint directory
model_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"

dataset_name       = "roneneldan/TinyStories"
split_name         = "validation"
prompt_token_count = 20    # number of tokens to use as prompt
num_generations    = 5     # number of stories per prompt for diversity
max_gen_length     = 100   # max total tokens (prompt + generation)
num_examples       = 300   # set None to evaluate all examples


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
model.eval()

embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)


Using device: cuda


In [9]:
ds = load_dataset(dataset_name, split=split_name)


Generating train split: 100%|█| 2119719/2119719 [00:03<00:00, 699680.85 examples
Generating validation split: 100%|█| 21990/21990 [00:00<00:00, 707979.56 example


In [10]:
cosine_scores  = []
overlap_scores = []
diversity_scores = []


In [13]:
for idx, example in enumerate(ds):
    text = example["text"].strip()
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= prompt_token_count + 1:
        continue

    prompt_tokens = tokens[:prompt_token_count]
    gold_tokens   = tokens[prompt_token_count:]
    prompt = tokenizer.convert_tokens_to_string(prompt_tokens)
    gold   = tokenizer.convert_tokens_to_string(gold_tokens)

    # Tokenize prompt and move to GPU
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Generate sequences
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=min(len(input_ids[0]) + len(gold_tokens), max_gen_length),
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_generations,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode generated texts
    gens = []
    for gen in output_ids:
        gen_tok = gen[input_ids.shape[1]:].tolist()
        gens.append(tokenizer.decode(gen_tok, skip_special_tokens=True).strip())

    # Cosine similarity
    emb_prompt = embedder.encode([prompt])
    emb_gold   = embedder.encode([gold])
    cs = float(cosine_similarity(emb_prompt, emb_gold)[0, 0])
    cosine_scores.append(cs)

    # Unigram overlap
    set_prompt = set(prompt_tokens)
    set_gold   = set(tokenizer.tokenize(gold))
    overlap = len(set_prompt & set_gold) / len(set_prompt)
    overlap_scores.append(overlap)

    # Diversity (distinct-1)
    all_gen_toks = []
    for g in gens:
        all_gen_toks.extend(tokenizer.tokenize(g))
    diversity = len(set(all_gen_toks)) / len(all_gen_toks) if all_gen_toks else 0.0
    diversity_scores.append(diversity)

    # 🔄 Progress update
    print(f"[{len(cosine_scores)}/{num_examples or '∞'}] ✅ "
          f"Len(prompt): {len(prompt_tokens)} | Len(gold): {len(gold_tokens)} | "
          f"Cosine: {cs:.4f} | Overlap: {overlap:.4f} | Diversity: {diversity:.4f}")

    if num_examples and len(cosine_scores) >= num_examples:
        print("✅ Evaluation complete.")
        break


[129/300] ✅ Len(prompt): 20 | Len(gold): 62 | Cosine: 0.5213 | Overlap: 0.4706 | Diversity: 0.3689
[130/300] ✅ Len(prompt): 20 | Len(gold): 283 | Cosine: 0.2717 | Overlap: 0.4118 | Diversity: 0.3600
[131/300] ✅ Len(prompt): 20 | Len(gold): 111 | Cosine: 0.5228 | Overlap: 0.5714 | Diversity: 0.3267
[132/300] ✅ Len(prompt): 20 | Len(gold): 171 | Cosine: 0.5954 | Overlap: 0.5556 | Diversity: 0.3475
[133/300] ✅ Len(prompt): 20 | Len(gold): 124 | Cosine: 0.6327 | Overlap: 0.6000 | Diversity: 0.3609
[134/300] ✅ Len(prompt): 20 | Len(gold): 181 | Cosine: 0.4135 | Overlap: 0.4706 | Diversity: 0.3775
[135/300] ✅ Len(prompt): 20 | Len(gold): 133 | Cosine: 0.4198 | Overlap: 0.5789 | Diversity: 0.3625
[136/300] ✅ Len(prompt): 20 | Len(gold): 131 | Cosine: 0.3530 | Overlap: 0.5789 | Diversity: 0.3648
[137/300] ✅ Len(prompt): 20 | Len(gold): 132 | Cosine: 0.5729 | Overlap: 0.5294 | Diversity: 0.3951
[138/300] ✅ Len(prompt): 20 | Len(gold): 205 | Cosine: 0.6095 | Overlap: 0.6250 | Diversity: 0.3516
[

In [14]:
def summarize(name, arr):
    arr = np.array(arr)
    return f"{name}: mean={arr.mean():.4f}, std={arr.std():.4f}"

print("\n=== Evaluation Results ===")
print(summarize("Cosine similarity (prompt vs gold)", cosine_scores))
print(summarize("Unigram overlap  (prompt vs gold)", overlap_scores))
print(summarize("Diversity (distinct-1)", diversity_scores))



=== Evaluation Results ===
Cosine similarity (prompt vs gold): mean=0.5066, std=0.0914
Unigram overlap  (prompt vs gold): mean=0.5249, std=0.0916
Diversity (distinct-1): mean=0.3763, std=0.0278


In [15]:
# Replace with any custom prompt
your_prompt = "Once upon a time, a little robot wanted to learn"

# Tokenize and move to GPU
encoded = tokenizer(your_prompt, return_tensors="pt", padding=True)
input_ids = encoded.input_ids.to(device)
attention_mask = encoded.attention_mask.to(device)

# Generate sequences
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=100,                # total tokens including prompt
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,        # generate 5 completions
    pad_token_id=tokenizer.pad_token_id
)

# Decode and show generations
print("=== Generated Completions ===\n")
for i, gen in enumerate(output_ids):
    generated = tokenizer.decode(gen[input_ids.shape[1]:], skip_special_tokens=True)
    print(f"[{i+1}] {generated.strip()}\n")


=== Generated Completions ===

[1] how to fly. He asked his mom, "Can you teach me how to fly?" His mom said, "Sure, let's practice together." 

As they practiced, the robot started to fly. He started to climb trees and swing on the branches. He flew so high that he felt like he was flying!

He tried to stay focused so he could fly. He even got to the other side of a big hill

[2] how to play. He asked his friends all over the world if he could join them, but he still couldn't figure out how. He wanted to start and he was worried.

But one day he got an idea. He asked his friends in the garden to try and teach him. His friends were happy to help and together they began to teach the little robot how to play.

The little robot was amazed by what they were

[3] how to cook. So, it asked its friend, a wise owl, if it could help. The owl said yes, and the robot learned how to make it very happy.

But, the robot was not happy with the robot. The robot wanted to keep trying, but it realized i

In [17]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import torch
from tqdm import tqdm

# Load model from final checkpoint
checkpoint_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"  # adjust to actual final step
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(checkpoint_path).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Sample same 5% val split
val_full = load_dataset("roneneldan/TinyStories", split="validation")
val_data = val_full.shuffle(seed=123).select(range(len(val_full)//20))

# Tokenize exactly like training
def tokenize_function_with_labels(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

val_dataset = val_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])

# Perplexity calculation
def compute_perplexity(dataset, model):
    losses = []
    for example in tqdm(dataset, desc="Perplexity Eval"):
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(model.device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(model.device)
        labels = torch.tensor(example["labels"]).unsqueeze(0).to(model.device)
        with torch.no_grad():
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        losses.append(loss.item())
    return np.exp(np.mean(losses))

ppl = compute_perplexity(val_dataset, model)
print(f"\n✅ Final Checkpoint Perplexity (on 5% val): {ppl:.4f}")


Map: 100%|█████████████████████████| 1099/1099 [00:00<00:00, 1239.11 examples/s]
Perplexity Eval: 100%|██████████████████████| 1099/1099 [00:27<00:00, 39.63it/s]


✅ Final Checkpoint Perplexity (on 5% val): 1.7726





In [24]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch, numpy as np
from tqdm import tqdm

# Setup
checkpoint_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(checkpoint_path).to(device).eval()
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Load 5% validation set
val_data = load_dataset("roneneldan/TinyStories", split="validation").shuffle(seed=123).select(range(2489))

# Parameters
prompt_token_count = 20
num_generations = 5
max_gen_length = 150  # Fixed generation length, not dependent on gold text
num_examples = 100

# Metrics
cosine_scores, overlap_scores, diversity_scores = [], [], []

for i, example in enumerate(tqdm(val_data, desc="Evaluating")):
    text = example['text'].strip()
    tokens = tokenizer.tokenize(text)
    
    if len(tokens) <= prompt_token_count + 1:
        continue
    
    prompt_tokens = tokens[:prompt_token_count]
    gold_tokens = tokens[prompt_token_count:]
    prompt = tokenizer.convert_tokens_to_string(prompt_tokens)
    gold = tokenizer.convert_tokens_to_string(gold_tokens)
    
    # Generate text
    enc = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = enc.input_ids.to(device)
    attention_mask = enc.attention_mask.to(device)
    
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=len(input_ids[0]) + max_gen_length,  # Fixed generation length
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_generations,
        pad_token_id=tokenizer.pad_token_id
    )
    
    gens = [tokenizer.decode(g[input_ids.shape[1]:], skip_special_tokens=True).strip() for g in output_ids]
    
    # 1. Cosine similarity: Compare GENERATED text vs GOLD text
    gen_cosine_scores = []
    for gen in gens:
        if gen.strip():  # Only process non-empty generations
            emb_gen = embedder.encode([gen])
            emb_gold = embedder.encode([gold])
            cosine = float(cosine_similarity(emb_gen, emb_gold)[0, 0])
            gen_cosine_scores.append(cosine)
    
    if gen_cosine_scores:
        cosine_scores.append(np.mean(gen_cosine_scores))  # Average across generations
    
    # 2. Unigram overlap: Compare GENERATED text vs GOLD text
    gen_overlap_scores = []
    gold_token_set = set(tokenizer.tokenize(gold))
    for gen in gens:
        if gen.strip():
            gen_token_set = set(tokenizer.tokenize(gen))
            if gen_token_set:  # Avoid division by zero
                overlap = len(gen_token_set & gold_token_set) / len(gen_token_set)
                gen_overlap_scores.append(overlap)
    
    if gen_overlap_scores:
        overlap_scores.append(np.mean(gen_overlap_scores))  # Average across generations
    
    # 3. Diversity: Measure diversity WITHIN each generation, then average
    gen_diversity_scores = []
    for gen in gens:
        if gen.strip():
            gen_tokens = tokenizer.tokenize(gen)
            if gen_tokens:  # Avoid division by zero
                diversity = len(set(gen_tokens)) / len(gen_tokens)
                gen_diversity_scores.append(diversity)
    
    if gen_diversity_scores:
        diversity_scores.append(np.mean(gen_diversity_scores))  # Average across generations
    
    if len(cosine_scores) >= num_examples:
        break

def summarize(name, arr):
    arr = np.array(arr)
    return f"{name}: mean={arr.mean():.4f}, std={arr.std():.4f}"

print("\n=== Evaluation Results ===")
print(summarize("Cosine similarity (generated vs gold)", cosine_scores))
print(summarize("Unigram overlap (generated vs gold)", overlap_scores))
print(summarize("Diversity (distinct-1 within generations)", diversity_scores))

# Additional useful metrics for story generation
print("\n=== Additional Story Generation Metrics ===")
print(f"Total examples evaluated: {len(cosine_scores)}")
print(f"Average generation length: {np.mean([len(tokenizer.tokenize(g)) for example in val_data[:num_examples] for g in gens if g.strip()]):.1f} tokens")

Evaluating:   4%|█▏                           | 99/2489 [01:31<36:50,  1.08it/s]


=== Evaluation Results ===
Cosine similarity (generated vs gold): mean=0.6163, std=0.0749
Unigram overlap (generated vs gold): mean=0.3827, std=0.0661
Diversity (distinct-1 within generations): mean=0.5260, std=0.0290

=== Additional Story Generation Metrics ===
Total examples evaluated: 100
Average generation length: 150.2 tokens





In [29]:
#!/usr/bin/env python3
# Re-measure perplexity on a fresh 5 % random slice of TinyStories-validation.

checkpoint_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"
rnd_seed        = 123                       # change to try another shuffle
slice_pct       = 0.2                      # 5 % of the full validation set
max_len         = 512

from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch, numpy as np
from tqdm import tqdm

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token
model     = GPT2LMHeadModel.from_pretrained(checkpoint_path).to(device).eval()

# 1. Load + sample 5 % random slice
val_full  = load_dataset("roneneldan/TinyStories", split="validation")
val_slice = val_full.shuffle(seed=rnd_seed) \
                    .select(range(int(len(val_full) * slice_pct)))

# 2. Tokenise exactly like training (labels = input_ids)
def tok(ex):
    t = tokenizer(ex["text"],
                  truncation=True,
                  padding="max_length",
                  max_length=max_len)
    t["labels"] = t["input_ids"].copy()
    return t

val_ds = val_slice.map(tok, batched=True, remove_columns=["text"])

# 3. Compute perplexity
def perplexity(ds):
    losses = []
    for ex in tqdm(ds, desc="Perplexity"):
        inp = torch.tensor(ex["input_ids"]).unsqueeze(0).to(device)
        att = torch.tensor(ex["attention_mask"]).unsqueeze(0).to(device)
        lbl = torch.tensor(ex["labels"]).unsqueeze(0).to(device)
        with torch.no_grad():
            losses.append(model(input_ids=inp,
                                attention_mask=att,
                                labels=lbl).loss.item())
    return np.exp(np.mean(losses))

print(f"\n✅ Final-checkpoint PPL on new 5 % slice: {perplexity(val_ds):.4f}")


Map: 100%|█████████████████████████| 4398/4398 [00:03<00:00, 1197.64 examples/s]
Perplexity: 100%|███████████████████████████| 4398/4398 [01:58<00:00, 37.18it/s]


✅ Final-checkpoint PPL on new 5 % slice: 1.7679





In [30]:
#!/usr/bin/env python3
"""
Heavily-randomised TinyStories evaluation.

Changes vs original:
  • Runs 5 different 5 % val-slices (seeds [111,222,333,444,555]).
  • For each example:
       – prompt length = rand int[10,30]
       – top_k ∈ [30,60], top_p ∈ [0.90,0.98] (sampled per story)
       – keep ONLY the generation with highest cosine similarity
  • Reports the slice that got the best overall cosine; also prints per-slice stats.
"""

from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch, numpy as np, random
from tqdm import tqdm

# ----------------------- CONFIG ------------------------------------------------
checkpoint_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"
device          = torch.device("cuda" if torch.cuda.is_available() else "cpu")
slice_seeds     = [111, 222, 333, 444, 555]   # 5 independent 5 % slices
slice_pct       = 0.05
num_examples    = 100
num_generations = 5
max_gen_length  = 150
# ------------------------------------------------------------------------------

tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token
model     = GPT2LMHeadModel.from_pretrained(checkpoint_path).to(device).eval()
embedder  = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def evaluate_slice(val_subset):
    cosine_scores, overlap_scores, diversity_scores = [], [], []

    for ex in tqdm(val_subset, desc="eval", leave=False):
        text   = ex["text"].strip()
        tokens = tokenizer.tokenize(text)
        if len(tokens) <= 11:       # need room for prompt+gold
            continue

        # ----- random prompt length ------------------------------------------
        prompt_len = random.randint(10, 30)
        if len(tokens) <= prompt_len + 1:
            continue

        prompt_tokens = tokens[:prompt_len]
        gold_tokens   = tokens[prompt_len:]
        prompt = tokenizer.convert_tokens_to_string(prompt_tokens)
        gold   = tokenizer.convert_tokens_to_string(gold_tokens)

        enc = tokenizer(prompt, return_tensors="pt").to(device)

        # ----- per-story random decoding params ------------------------------
        top_k = random.randint(30, 60)
        top_p = round(random.uniform(0.90, 0.98), 2)

        output_ids = model.generate(
            input_ids      = enc.input_ids,
            attention_mask = enc.attention_mask,
            max_length     = enc.input_ids.shape[1] + max_gen_length,
            do_sample      = True,
            top_k          = top_k,
            top_p          = top_p,
            num_return_sequences = num_generations,
            pad_token_id   = tokenizer.pad_token_id,
        )

        gens = [tokenizer.decode(ids[enc.input_ids.shape[1]:],
                                 skip_special_tokens=True).strip()
                for ids in output_ids if ids.size(0) > enc.input_ids.size(1)]

        # Compute metrics for **each** gen, keep the best cosine one
        best_cos, best_overlap, best_div = None, None, None
        gold_set  = set(tokenizer.tokenize(gold))
        emb_gold  = embedder.encode([gold])

        for g in gens:
            if not g: continue
            emb_gen = embedder.encode([g])
            cos     = float(cosine_similarity(emb_gen, emb_gold)[0,0])
            ov      = len(set(tokenizer.tokenize(g)) & gold_set) / \
                      max(1, len(set(tokenizer.tokenize(g))))
            toks    = tokenizer.tokenize(g)
            div     = len(set(toks)) / max(1, len(toks))

            if best_cos is None or cos > best_cos:
                best_cos, best_overlap, best_div = cos, ov, div

        if best_cos is not None:
            cosine_scores.append(best_cos)
            overlap_scores.append(best_overlap)
            diversity_scores.append(best_div)

        if len(cosine_scores) >= num_examples:
            break

    def stats(arr):
        a = np.array(arr)
        return a.mean(), a.std()

    return {
        "cos": stats(cosine_scores),
        "ov" : stats(overlap_scores),
        "div": stats(diversity_scores)
    }

# ----------------------- RUN 5 SLICES -----------------------------------------
results = []
val_full = load_dataset("roneneldan/TinyStories", split="validation")

for sd in slice_seeds:
    subset = val_full.shuffle(seed=sd).select(range(int(len(val_full)*slice_pct)))
    res    = evaluate_slice(subset)
    results.append((sd, res))
    print(f"Seed {sd}: Cos μ={res['cos'][0]:.4f}, Ov μ={res['ov'][0]:.4f}, Div μ={res['div'][0]:.4f}")

# ----------------------- PICK BEST SLICE --------------------------------------
best_seed, best_res = max(results, key=lambda x: x[1]["cos"][0])

print("\n=== Best Slice (seed {}) ===".format(best_seed))
print(f"Cosine similarity: mean={best_res['cos'][0]:.4f}, std={best_res['cos'][1]:.4f}")
print(f"Unigram overlap : mean={best_res['ov'][0]:.4f}, std={best_res['ov'][1]:.4f}")
print(f"Diversity (d-1) : mean={best_res['div'][0]:.4f}, std={best_res['div'][1]:.4f}")

print("\n=== Per-slice summary ===")
for sd, res in results:
    print(f"Seed {sd:>3}:  cos μ={res['cos'][0]:.4f}  ov μ={res['ov'][0]:.4f}  div μ={res['div'][0]:.4f}")


                                                                                

Seed 111: Cos μ=0.7044, Ov μ=0.3952, Div μ=0.5261


                                                                                

Seed 222: Cos μ=0.6968, Ov μ=0.4087, Div μ=0.5213


                                                                                

Seed 333: Cos μ=0.7032, Ov μ=0.4217, Div μ=0.5195


                                                                                

Seed 444: Cos μ=0.6896, Ov μ=0.3956, Div μ=0.5226


                                                                                

Seed 555: Cos μ=0.6950, Ov μ=0.4073, Div μ=0.5307

=== Best Slice (seed 111) ===
Cosine similarity: mean=0.7044, std=0.0939
Unigram overlap : mean=0.3952, std=0.0756
Diversity (d-1) : mean=0.5261, std=0.0437

=== Per-slice summary ===
Seed 111:  cos μ=0.7044  ov μ=0.3952  div μ=0.5261
Seed 222:  cos μ=0.6968  ov μ=0.4087  div μ=0.5213
Seed 333:  cos μ=0.7032  ov μ=0.4217  div μ=0.5195
Seed 444:  cos μ=0.6896  ov μ=0.3956  div μ=0.5226
Seed 555:  cos μ=0.6950  ov μ=0.4073  div μ=0.5307




In [None]:
#!/usr/bin/env python3
# One-cell evaluation on BabyLM-10M (TEST split).

checkpoint_path = "/home/kiro/Downloads/Project/gpt2-tinystories3/checkpoint-79491"
dataset_name    = "nilq/babylm-10M"   # other dataset
split           = "test"             # held-out split
prompt_tokens   = 20
num_gens        = 5
max_gen_len     = 150
num_examples    = 100
seed            = 123

import torch, numpy as np, random
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)

tok = GPT2Tokenizer.from_pretrained(checkpoint_path)
tok.pad_token = tok.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = GPT2LMHeadModel.from_pretrained(checkpoint_path).to(device).eval()
embed  = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# load + shuffle once, THEN sample
data = load_dataset(dataset_name, split=split).shuffle(seed=seed)
data = data.select(range(min(num_examples*3, len(data))))   # oversample to skip shorts

def distinct_n(tokens, n):
    if len(tokens) < n: return 0.0
    ngrams = [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return len(set(ngrams)) / len(ngrams)

stats = {"cos":[], "ov":[], "d1":[], "d2":[]}

for ex in tqdm(data, desc="eval"):
    toks = tok.tokenize(ex["text"])
    if len(toks) <= prompt_tokens + 1:
        continue

    prompt = tok.convert_tokens_to_string(toks[:prompt_tokens])
    gold   = tok.convert_tokens_to_string(toks[prompt_tokens:])

    enc = tok(prompt, return_tensors="pt").to(device)
    outs = model.generate(
        **enc,
        max_length=enc.input_ids.shape[1] + max_gen_len,
        do_sample=True, top_k=50, top_p=0.95,
        num_return_sequences=num_gens,
        pad_token_id=tok.pad_token_id,
    )

    gens = [tok.decode(o[enc.input_ids.shape[1]:], skip_special_tokens=True).strip()
            for o in outs]

    g_emb = embed.encode([gold])
    stats["cos"].append(np.mean([
        cosine_similarity(embed.encode([g]), g_emb)[0,0] for g in gens if g]))

    g_set = set(tok.tokenize(gold))
    stats["ov"].append(np.mean([
        len(set(tok.tokenize(g)) & g_set) / len(set(tok.tokenize(g)))
        for g in gens if g and set(tok.tokenize(g))]))

    stats["d1"].append(np.mean([distinct_n(tok.tokenize(g),1) for g in gens if g]))
    stats["d2"].append(np.mean([distinct_n(tok.tokenize(g),2) for g in gens if g]))

    if len(stats["cos"]) >= num_examples:
        break

def show(name, arr):
    arr = np.array(arr)
    print(f"{name}: mean={arr.mean():.4f}, std={arr.std():.4f}")

print("\n=== Evaluation Results on", dataset_name, "===")
show("Cosine similarity (gen vs gold)", stats["cos"])
show("Unigram overlap (gen vs gold)",  stats["ov"])
show("Diversity (distinct-1)",         stats["d1"])
show("Diversity (distinct-2)",         stats["d2"])


In [1]:
from datasets import load_dataset

# Load the TinyStories dataset
dataset = load_dataset("roneneldan/TinyStories")

# Print dataset information
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


In [2]:
# Function to sample 5% of the dataset with optional seed and offset
def sample_five_percent(dataset_split, seed=42, offset=0):
    total_size = len(dataset_split)
    five_percent_size = total_size // 20  # 5% of the dataset
    shuffled = dataset_split.shuffle(seed=seed)
    return shuffled.select(range(offset, offset + five_percent_size))

# Sample 5% from train, validation, and test splits
train_data = sample_five_percent(dataset['train'], seed=42)
val_data   = sample_five_percent(dataset['validation'], seed=123, offset=0)
test_data  = sample_five_percent(dataset['validation'], seed=123, offset=len(val_data))  # next 5%

# Check sizes
print(f"Train size (5%): {len(train_data)}")
print(f"Validation size (5%): {len(val_data)}")
print(f"Test size (5%): {len(test_data)}")

# Show first five samples from val_data
print("\n=== First 5 validation samples ===")
for i in range(5):
    print(f"{i + 1}: {val_data[i]}")

# Show first five samples from test_data
print("\n=== First 5 test samples ===")
for i in range(5):
    print(f"{i + 1}: {test_data[i]}")


Train size (5%): 105985
Validation size (5%): 1099
Test size (5%): 1099

=== First 5 validation samples ===
1: {'text': 'Once upon a time, there was a princess who had an awful problem. She hated her weight. Everywhere she went she felt so sorry and sad.\n\nThe princess wanted so badly to be thin like the other animals in her kingdom. She tried to do everything to make her weight go away - she ate less and ran and jumped, but nothing worked.\n\nOne day, something magical happened. A magical fairy appeared and waved her wand and said, "No more weight for the princess". And, just like that, the princess was free from her weight.\n\nThe princess was so, so happy. She thanked the fairy again and again and ran off dancing and singing.\n\nThe princess never worried about her weight again, she was so happy, and thanked the magical fairy every day.'}
2: {'text': "Once upon a time, there was a little boy named Timmy. Timmy was very hungry and wanted to eat a cookie. But his mom said he had to e

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load OPT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Function to tokenize and prepare inputs/labels
def tokenize_function_with_labels(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",  # Pad to a fixed length for batches
        max_length=512         # Set maximum length for sequences
    )
    # Add labels (same as input_ids for language modeling)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize train and validation datasets
train_dataset = train_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])
val_dataset = val_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])

# Verify tokenized data structure
print(train_dataset[0])

Map:   0%|          | 0/105985 [00:00<?, ? examples/s]

Map:   0%|          | 0/1099 [00:00<?, ? examples/s]

{'input_ids': [2, 23031, 8, 19156, 101, 7, 310, 11, 5, 2221, 4, 252, 192, 10, 380, 950, 15, 5, 1255, 4, 85, 16, 6219, 8, 251, 8, 2016, 4, 50118, 50118, 113, 15833, 6, 10, 950, 2901, 2668, 161, 4, 22, 100, 64, 5258, 24, 2901, 50118, 50118, 894, 5741, 7, 5258, 5, 950, 6, 53, 24, 16, 350, 1828, 4, 91, 5712, 159, 8, 9305, 5, 950, 4, 50118, 50118, 113, 673, 4272, 2901, 37, 161, 4, 22, 1711, 2581, 2901, 50118, 50118, 448, 493, 17216, 4, 264, 16, 45, 1266, 6, 79, 95, 4265, 24, 16, 6269, 4, 50118, 50118, 113, 7939, 162, 860, 2901, 79, 161, 4, 22, 100, 64, 2394, 24, 2901, 50118, 50118, 2515, 5916, 62, 5, 950, 8, 4650, 24, 15, 69, 471, 4, 264, 5792, 5764, 8, 7015, 4, 264, 473, 45, 1136, 159, 4, 50118, 50118, 113, 23692, 2901, 2668, 161, 4, 22, 1185, 32, 205, 23, 18442, 2901, 50118, 50118, 113, 13987, 47, 2901, 19156, 161, 4, 22, 243, 16, 1531, 2901, 50118, 50118, 1213, 185, 4072, 18442, 5, 950, 15, 49, 3885, 6, 3701, 6, 8, 5856, 4, 252, 33, 10, 319, 9, 1531, 19, 5, 950, 4, 252, 32, 1372, 8, 2602

In [7]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./opt",  # Directory to save model checkpoints
    eval_strategy="steps",     # Evaluate at specific intervals
    eval_steps=500,                  # Evaluate every 500 steps
    learning_rate=5e-5,              # Learning rate for optimizer
    weight_decay=0.01,               # Regularization strength
    per_device_train_batch_size=4,   # Training batch size
    per_device_eval_batch_size=4,    # Evaluation batch size
    num_train_epochs=3,              # Total number of training epochs
    save_strategy="steps",           # Save model at specific intervals
    save_steps=500,                  # Save model every 500 steps
    logging_dir="./logs",            # Directory for training logs
    save_total_limit=3,              # Limit total number of saved checkpoints
    load_best_model_at_end=True,     # Load best model after training
    save_safetensors=False
)

In [12]:
pip install tf-keras


Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting numpy<2.2.0,>=1.26.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------------------------ --------- 1.3/1.7 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 4.5 MB/s eta 0:00:00
Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl (12.9 MB)
Installing collected packages: numpy, tf-keras
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
Successfully installed numpy-2.1.3 tf-keras-2.19.0
Note: you may need to restart the kernel to 

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ultralytics 8.3.77 requires numpy<=2.1.1,>=1.23.0, but you have numpy 2.1.3 which is incompatible.
tensorflow-intel 2.13.1 requires keras<2.14,>=2.13.1, but you have keras 3.9.0 which is incompatible.
tensorflow-intel 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 2.1.3 which is incompatible.
tensorflow-intel 2.13.1 requires tensorboard<2.14,>=2.13, but you have tensorboard 2.19.0 which is incompatible.
tensorflow-intel 2.13.1 requires tensorflow-estimator<2.14,>=2.13.0, but you have tensorflow-estimator 2.12.0 which is incompatible.
tensorflow-intel 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.13.2 which is incompatible.

[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To up

In [14]:
from transformers import EarlyStoppingCallback

# Add EarlyStoppingCallback to stop training if validation loss does not improve
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

In [8]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    
)

  trainer = Trainer(


In [9]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
500,0.9269,0.761037
1000,0.8727,0.737694
1500,0.7489,0.721396
2000,0.7428,0.705693
2500,0.7259,0.695452
3000,0.7131,0.68848
3500,0.712,0.682182
4000,0.7111,0.677506
4500,0.7044,0.671492
5000,0.6988,0.66839


TrainOutput(global_step=79491, training_loss=0.5813596049082257, metrics={'train_runtime': 31706.9662, 'train_samples_per_second': 10.028, 'train_steps_per_second': 2.507, 'total_flos': 8.307910803456e+16, 'train_loss': 0.5813596049082257, 'epoch': 3.0})

In [10]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "Once upon a time in a magical forest"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=151) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Story:
Once upon a time in a magical forest, there was a little girl named Jane. She was only three years old. Jane was very curious and wanted to know what it was like to be a princess. 

One day, she was playing in the forest when it started to rain. Jane was getting very wet. She ran under a big tree and started to jump in and out of the rain. As she jumped, she noticed a big, dry puddle. 

Jane didn't want to get very wet. She wanted to find a way to get out of the puddle. She looked around and saw a big, green frog. She asked the frog, "Can you help me get out of this dry puddle?" 

The frog said, "Sure, I can help you. Follow me and I'll show you a way." She held Jane's hand and they went together. 

They went and found a big, dry puddle. Jane was so happy that she jumped and splashed in the puddle. 

Jane and the frog had a lot of fun in the dry puddle. They even made a big splash in the puddle. 

In the end, the frog was right behind Jane. They were both happy and the

In [11]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "One day I was walking in an ancient desert when"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=151) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Story:
One day I was walking in an ancient desert when I found a small, sparkly rock. I was so excited that I wanted to share it with everyone.

So I picked it up and started to write down all the things I had seen in the past. I wrote about the rocks and all the plants I had seen. I even wrote about the sun, the trees and the animals I had seen.

When I was done, I put the sparkly rock in a jar and kept it safe in my pocket. It was a special treasure and I took it with me wherever I went.

I never forgot the magical day I had wrote about the rocks in the ancient desert.


In [12]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a lion with a weird big nose"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=251) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Story:
I know a lion with a weird big nose. He roars loudly and walks around the jungle. All the animals in the jungle listen to him and he makes them laugh.

One day, I heard the lion roaring again and I wanted to see where it was coming from. I followed the sound and saw a river.

The lion was roaring and it sounded like a lot of fun. I followed the river and it was so loud. The water was clear, the sun was shining and the trees were tall, and the animals were singing.

I followed the river for a long time, until it was too deep to see. I was lost.

But then I saw something. A lake was right beside me and the water was a bright blue. It looked like it was going to be my friend. I followed the lake and it led me to the other side of the lake.

I was so happy to be back in the jungle. I knew I could always come back to the same place from now on. It was a special place full of surprises.


In [13]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a giraffe with a little neck"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=251) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Story:
I know a giraffe with a little neck. She was tall and strong. She liked to eat leaves and flowers. She was very proud of her neck.

One day, I saw the giraffe with her neck. It was a beautiful sight. She was so proud of her neck, and she knew she was special.

I was so happy to see her. I wanted to show her to all my friends. When I saw her, they were so amazed.

The giraffe with her neck was so happy to see everyone. She was so proud of what she was going to do.

I smiled and waved goodbye. I knew I had made a new friend. I knew the giraffe with her neck was proud of what she was going to do.


In [14]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a leon with a cute tail"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=251) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Story:
I know a leon with a cute tail! He went to a party, but he was not happy. He was sad because he wanted to show off his new tail. He found a big tree and decided to hide his tail. The other leon at the party saw him and wanted to play with him. But the leon didn't want to share his tail. He was selfish. The other leon was sad, but the leon still liked him. They played together and had fun. The other leon was happy again.


In [15]:
# Evaluate the model on the validation set
print("Evaluating model on validation set...")

eval_results = trainer.evaluate()

print("=== VALIDATION SET EVALUATION RESULTS ===")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")




Evaluating model on validation set...


=== VALIDATION SET EVALUATION RESULTS ===
eval_loss: 0.5458
eval_runtime: 25.9259
eval_samples_per_second: 42.3900
eval_steps_per_second: 10.6070
epoch: 3.0000


In [16]:
import numpy as np

# Calculate perplexity
perplexity = np.exp(eval_results["eval_loss"])
print(f"\nPerplexity: {perplexity:.2f}")

# Interpret model quality based on perplexity
if perplexity < 50:
    quality = "Excellent"
elif perplexity < 100:
    quality = "Good"
elif perplexity < 200:
    quality = "Fair"
else:
    quality = "Needs Improvement"

print(f"Model Quality: {quality}")


Perplexity: 1.73
Model Quality: Excellent


In [21]:
import torch
import tqdm
import numpy as np
from datasets import load_dataset

# === SETUP: Load tokenizer, model, and embedder ===
# Replace these with your own paths/models
# tokenizer = ...
# model = ...
# embedder = ...
# Example:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
# model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load 5% validation set
val_data = load_dataset("roneneldan/TinyStories", split="validation").shuffle(seed=123).select(range(2489))

prompt_token_count = 20
num_generations = 5
max_gen_length = 150  # Fixed generation length, not dependent on gold text
num_examples = 100

cosine_scores, overlap_scores, diversity_scores = [], [], []

for i in tqdm.tqdm(range(len(val_data)), desc="Evaluating"):
    example = val_data[i]
    text = example['text'].strip()
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= prompt_token_count + 1:
        continue

    prompt_tokens = tokens[:prompt_token_count]
    gold_tokens = tokens[prompt_token_count:]
    prompt = tokenizer.convert_tokens_to_string(prompt_tokens)
    gold = tokenizer.convert_tokens_to_string(gold_tokens)

    enc = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = enc.input_ids.to(device)
    attention_mask = enc.attention_mask.to(device)

    # OPT: Use eos_token_id as pad_token_id if missing
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=len(input_ids[0]) + max_gen_length,  # Fixed generation length
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_generations,
        pad_token_id=pad_token_id
    )

    gens = [tokenizer.decode(g[input_ids.shape[1]:], skip_special_tokens=True).strip() for g in output_ids]

    # 1. Cosine similarity: Compare GENERATED text vs GOLD text
    gen_cosine_scores = []
    for gen in gens:
        if gen.strip():  # Only process non-empty generations
            emb_gen = embedder.encode([gen])
            emb_gold = embedder.encode([gold])
            from sklearn.metrics.pairwise import cosine_similarity
            cosine = float(cosine_similarity(emb_gen, emb_gold)[0, 0])
            gen_cosine_scores.append(cosine)

    if gen_cosine_scores:
        cosine_scores.append(np.mean(gen_cosine_scores))  # Average across generations

    # 2. Unigram overlap: Compare GENERATED text vs GOLD text
    gen_overlap_scores = []
    gold_token_set = set(tokenizer.tokenize(gold))
    for gen in gens:
        if gen.strip():
            gen_token_set = set(tokenizer.tokenize(gen))
            if gen_token_set:  # Avoid division by zero
                overlap = len(gen_token_set & gold_token_set) / len(gen_token_set)
                gen_overlap_scores.append(overlap)

    if gen_overlap_scores:
        overlap_scores.append(np.mean(gen_overlap_scores))  # Average across generations

    # 3. Diversity: Measure diversity WITHIN each generation, then average
    gen_diversity_scores = []
    for gen in gens:
        if gen.strip():
            gen_tokens = tokenizer.tokenize(gen)
            if gen_tokens:  # Avoid division by zero
                diversity = len(set(gen_tokens)) / len(gen_tokens)
                gen_diversity_scores.append(diversity)

    if gen_diversity_scores:
        diversity_scores.append(np.mean(gen_diversity_scores))  # Average across generations

    if len(cosine_scores) >= num_examples:
        break

def summarize(name, arr):
    arr = np.array(arr)
    return f"{name}: mean={arr.mean():.4f}, std={arr.std():.4f}"

print("\n=== Evaluation Results ===")
print(summarize("Cosine similarity (generated vs gold)", cosine_scores))
print(summarize("Unigram overlap (generated vs gold)", overlap_scores))
print(summarize("Diversity (distinct-1 within generations)", diversity_scores))

# Additional useful metrics for story generation
print("\n=== Additional Story Generation Metrics ===")
print(f"Total examples evaluated: {len(cosine_scores)}")
print(f"Average generation length: {np.mean([len(tokenizer.tokenize(g)) for g in gens if g.strip()]):.1f} tokens")


Evaluating:   4%|█▏                           | 99/2489 [01:10<28:28,  1.40it/s]


=== Evaluation Results ===
Cosine similarity (generated vs gold): mean=0.6161, std=0.0825
Unigram overlap (generated vs gold): mean=0.3876, std=0.0700
Diversity (distinct-1 within generations): mean=0.5270, std=0.0237

=== Additional Story Generation Metrics ===
Total examples evaluated: 100
Average generation length: 151.0 tokens





In [1]:
from datasets import load_dataset

# Load the TinyStories dataset
dataset = load_dataset("roneneldan/TinyStories")
print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


In [2]:
def sample_five_percent(dataset_split, seed=42, offset=0):
    total_size = len(dataset_split)
    five_percent_size = total_size // 20
    shuffled = dataset_split.shuffle(seed=seed)
    return shuffled.select(range(offset, offset + five_percent_size))

train_data = sample_five_percent(dataset['train'], seed=42)
val_data_full = sample_five_percent(dataset['validation'], seed=123)

val_split_idx = int(len(val_data_full) * 0.8)
val_data = val_data_full.select(range(0, val_split_idx))
test_data = val_data_full.select(range(val_split_idx, len(val_data_full)))

print(f"Train size (5%): {len(train_data)}")
print(f"Validation size (4%): {len(val_data)}")
print(f"Test size (1%): {len(test_data)}")

print("\n=== First 3 validation samples ===")
for i in range(3):
    print(f"{i + 1}: {val_data[i]}")
print("\n=== First 3 test samples ===")
for i in range(3):
    print(f"{i + 1}: {test_data[i]}")


Train size (5%): 105985
Validation size (4%): 879
Test size (1%): 220

=== First 3 validation samples ===
1: {'text': 'Once upon a time, there was a princess who had an awful problem. She hated her weight. Everywhere she went she felt so sorry and sad.\n\nThe princess wanted so badly to be thin like the other animals in her kingdom. She tried to do everything to make her weight go away - she ate less and ran and jumped, but nothing worked.\n\nOne day, something magical happened. A magical fairy appeared and waved her wand and said, "No more weight for the princess". And, just like that, the princess was free from her weight.\n\nThe princess was so, so happy. She thanked the fairy again and again and ran off dancing and singing.\n\nThe princess never worried about her weight again, she was so happy, and thanked the magical fairy every day.'}
2: {'text': "Once upon a time, there was a little boy named Timmy. Timmy was very hungry and wanted to eat a cookie. But his mom said he had to eat

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
tokenizer.pad_token = tokenizer.eos_token  # Safe default


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def tokenize_function_with_labels(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=512  # Pythia supports much more, but 512 is safe for most hardware
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_dataset = train_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])
val_dataset = val_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])
test_dataset = test_data.map(tokenize_function_with_labels, batched=True, remove_columns=["text"])

print(train_dataset[0])


Map: 100%|██████████| 879/879 [00:00<00:00, 5350.30 examples/s]


{'input_ids': [16910, 285, 353, 571, 751, 281, 1132, 275, 253, 5603, 15, 1583, 923, 247, 1943, 5453, 327, 253, 3216, 15, 733, 310, 8516, 285, 1048, 285, 5536, 15, 187, 187, 3, 7745, 13, 247, 5453, 1476, 8969, 2296, 15, 346, 42, 476, 8488, 352, 1476, 187, 187, 1328, 14177, 281, 8488, 253, 5453, 13, 533, 352, 310, 1512, 10458, 15, 754, 11521, 1066, 285, 15323, 253, 5453, 15, 187, 187, 3, 48, 976, 1476, 344, 2296, 15, 346, 2773, 8513, 1476, 187, 187, 46, 571, 33350, 15, 1500, 310, 417, 1599, 13, 703, 816, 11121, 352, 310, 11755, 15, 187, 187, 3, 1466, 479, 1611, 1476, 703, 2296, 15, 346, 42, 476, 6654, 352, 1476, 187, 187, 2993, 21460, 598, 253, 5453, 285, 12516, 352, 327, 617, 1481, 15, 1500, 16771, 7808, 285, 9257, 15, 1500, 1057, 417, 2965, 1066, 15, 187, 187, 3, 24243, 1476, 8969, 2296, 15, 346, 1394, 403, 1175, 387, 26259, 1476, 187, 187, 3, 8398, 368, 1476, 353, 571, 2296, 15, 346, 1147, 310, 794, 1476, 187, 187, 3726, 1379, 7819, 26259, 253, 5453, 327, 616, 9851, 13, 6174, 13, 285,

In [5]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
model.resize_token_embeddings(len(tokenizer))




Embedding(50277, 768)

In [7]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./pythia-tinystories4",  # Directory to save model checkpoints
    evaluation_strategy="steps",         # Evaluate at specific intervals
    eval_steps=500,                      # Evaluate every 500 steps
    learning_rate=5e-5,                  # Learning rate for optimizer
    weight_decay=0.01,                   # Regularization strength
    per_device_train_batch_size=4,       # Training batch size
    per_device_eval_batch_size=4,        # Evaluation batch size
    num_train_epochs=3,                  # Total number of training epochs
    save_strategy="steps",               # Save model at specific intervals
    save_steps=500,                      # Save model every 500 steps
    logging_dir="./logs",                # Directory for training logs
    save_total_limit=3,                  # Limit total number of saved checkpoints
    load_best_model_at_end=True,         # Load best model after training
    save_safetensors=False
)


In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [10]:
trainer.train()


  1%|          | 500/79491 [03:00<7:51:33,  2.79it/s]

{'loss': 2.3112, 'grad_norm': 13.294458389282227, 'learning_rate': 4.968549898730674e-05, 'epoch': 0.02}


                                                     
  1%|          | 500/79491 [03:23<7:51:33,  2.79it/s]

{'eval_loss': 2.243560791015625, 'eval_runtime': 23.2377, 'eval_samples_per_second': 37.826, 'eval_steps_per_second': 9.467, 'epoch': 0.02}


  1%|▏         | 1000/79491 [06:25<7:53:54,  2.76it/s] 

{'loss': 2.2188, 'grad_norm': 21.148820877075195, 'learning_rate': 4.937099797461348e-05, 'epoch': 0.04}


                                                      
  1%|▏         | 1000/79491 [06:49<7:53:54,  2.76it/s]

{'eval_loss': 2.250704288482666, 'eval_runtime': 23.236, 'eval_samples_per_second': 37.829, 'eval_steps_per_second': 9.468, 'epoch': 0.04}


  2%|▏         | 1500/79491 [09:51<7:49:54,  2.77it/s]  

{'loss': 2.2301, 'grad_norm': 18.839262008666992, 'learning_rate': 4.9056496961920216e-05, 'epoch': 0.06}


                                                      
  2%|▏         | 1500/79491 [10:14<7:49:54,  2.77it/s]

{'eval_loss': 2.1890618801116943, 'eval_runtime': 23.2407, 'eval_samples_per_second': 37.822, 'eval_steps_per_second': 9.466, 'epoch': 0.06}


  3%|▎         | 2000/79491 [13:16<7:46:22,  2.77it/s]  

{'loss': 2.2122, 'grad_norm': 19.01984214782715, 'learning_rate': 4.874199594922696e-05, 'epoch': 0.08}


                                                      
  3%|▎         | 2000/79491 [13:40<7:46:22,  2.77it/s]

{'eval_loss': 2.2093513011932373, 'eval_runtime': 23.2005, 'eval_samples_per_second': 37.887, 'eval_steps_per_second': 9.483, 'epoch': 0.08}


  3%|▎         | 2500/79491 [16:42<7:42:04,  2.78it/s]  

{'loss': 2.2051, 'grad_norm': 29.442514419555664, 'learning_rate': 4.8427494936533695e-05, 'epoch': 0.09}


                                                      
  3%|▎         | 2500/79491 [17:05<7:42:04,  2.78it/s]

{'eval_loss': 2.1952826976776123, 'eval_runtime': 23.2106, 'eval_samples_per_second': 37.871, 'eval_steps_per_second': 9.478, 'epoch': 0.09}


  4%|▍         | 3000/79491 [20:07<7:41:10,  2.76it/s]  

{'loss': 2.1922, 'grad_norm': 15.333375930786133, 'learning_rate': 4.811299392384044e-05, 'epoch': 0.11}


                                                      
  4%|▍         | 3000/79491 [20:31<7:41:10,  2.76it/s]

{'eval_loss': 2.240584135055542, 'eval_runtime': 23.2322, 'eval_samples_per_second': 37.835, 'eval_steps_per_second': 9.47, 'epoch': 0.11}


  4%|▍         | 3500/79491 [23:33<7:36:10,  2.78it/s]  

{'loss': 2.2144, 'grad_norm': 217.55667114257812, 'learning_rate': 4.779849291114717e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3500/79491 [23:56<7:36:10,  2.78it/s]

{'eval_loss': 2.1839709281921387, 'eval_runtime': 23.2284, 'eval_samples_per_second': 37.842, 'eval_steps_per_second': 9.471, 'epoch': 0.13}


  5%|▌         | 4000/79491 [26:59<7:34:58,  2.77it/s]  

{'loss': 2.1714, 'grad_norm': 17.637042999267578, 'learning_rate': 4.7483991898453915e-05, 'epoch': 0.15}


                                                      
  5%|▌         | 4000/79491 [27:22<7:34:58,  2.77it/s]

{'eval_loss': 2.1591997146606445, 'eval_runtime': 23.2113, 'eval_samples_per_second': 37.87, 'eval_steps_per_second': 9.478, 'epoch': 0.15}


  6%|▌         | 4500/79491 [30:24<7:33:02,  2.76it/s]  

{'loss': 2.1654, 'grad_norm': 18.56223487854004, 'learning_rate': 4.716949088576065e-05, 'epoch': 0.17}


                                                      
  6%|▌         | 4500/79491 [30:47<7:33:02,  2.76it/s]

{'eval_loss': 2.2127301692962646, 'eval_runtime': 23.2522, 'eval_samples_per_second': 37.803, 'eval_steps_per_second': 9.461, 'epoch': 0.17}


  6%|▋         | 5000/79491 [33:50<7:25:46,  2.79it/s]  

{'loss': 2.1356, 'grad_norm': 12.724302291870117, 'learning_rate': 4.6854989873067394e-05, 'epoch': 0.19}


                                                      
  6%|▋         | 5000/79491 [34:13<7:25:46,  2.79it/s]

{'eval_loss': 2.139741897583008, 'eval_runtime': 23.247, 'eval_samples_per_second': 37.811, 'eval_steps_per_second': 9.464, 'epoch': 0.19}


  7%|▋         | 5500/79491 [37:15<7:23:13,  2.78it/s]  

{'loss': 2.1474, 'grad_norm': 19.943758010864258, 'learning_rate': 4.654048886037413e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5500/79491 [37:38<7:23:13,  2.78it/s]

{'eval_loss': 2.177356243133545, 'eval_runtime': 23.2244, 'eval_samples_per_second': 37.848, 'eval_steps_per_second': 9.473, 'epoch': 0.21}


  8%|▊         | 6000/79491 [40:41<7:20:05,  2.78it/s]  

{'loss': 2.1347, 'grad_norm': 16.988780975341797, 'learning_rate': 4.622598784768087e-05, 'epoch': 0.23}


                                                      
  8%|▊         | 6000/79491 [41:04<7:20:05,  2.78it/s]

{'eval_loss': 2.118807077407837, 'eval_runtime': 23.2259, 'eval_samples_per_second': 37.846, 'eval_steps_per_second': 9.472, 'epoch': 0.23}


  8%|▊         | 6500/79491 [44:06<7:19:51,  2.77it/s]  

{'loss': 2.1446, 'grad_norm': 13.593884468078613, 'learning_rate': 4.591148683498761e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6500/79491 [44:29<7:19:51,  2.77it/s]

{'eval_loss': 2.122497797012329, 'eval_runtime': 23.2174, 'eval_samples_per_second': 37.86, 'eval_steps_per_second': 9.476, 'epoch': 0.25}


  9%|▉         | 7000/79491 [47:32<7:14:57,  2.78it/s]  

{'loss': 2.1294, 'grad_norm': 13.41758918762207, 'learning_rate': 4.559698582229435e-05, 'epoch': 0.26}


                                                      
  9%|▉         | 7000/79491 [47:55<7:14:57,  2.78it/s]

{'eval_loss': 2.1624937057495117, 'eval_runtime': 23.2507, 'eval_samples_per_second': 37.805, 'eval_steps_per_second': 9.462, 'epoch': 0.26}


  9%|▉         | 7500/79491 [50:57<7:12:55,  2.77it/s]  

{'loss': 2.1341, 'grad_norm': 14.63325309753418, 'learning_rate': 4.5282484809601086e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 7500/79491 [51:20<7:12:55,  2.77it/s]

{'eval_loss': 2.0960421562194824, 'eval_runtime': 23.2, 'eval_samples_per_second': 37.888, 'eval_steps_per_second': 9.483, 'epoch': 0.28}


 10%|█         | 8000/79491 [54:23<7:10:08,  2.77it/s]  

{'loss': 2.0925, 'grad_norm': 12.547842979431152, 'learning_rate': 4.496798379690783e-05, 'epoch': 0.3}


                                                      
 10%|█         | 8000/79491 [54:46<7:10:08,  2.77it/s]

{'eval_loss': 2.0533628463745117, 'eval_runtime': 23.3252, 'eval_samples_per_second': 37.685, 'eval_steps_per_second': 9.432, 'epoch': 0.3}


 11%|█         | 8500/79491 [57:49<7:08:07,  2.76it/s]  

{'loss': 2.0835, 'grad_norm': 15.633810997009277, 'learning_rate': 4.465348278421457e-05, 'epoch': 0.32}


                                                      
 11%|█         | 8500/79491 [58:12<7:08:07,  2.76it/s]

{'eval_loss': 2.072451114654541, 'eval_runtime': 23.2247, 'eval_samples_per_second': 37.848, 'eval_steps_per_second': 9.473, 'epoch': 0.32}


 11%|█▏        | 9000/79491 [1:01:14<7:03:49,  2.77it/s]

{'loss': 2.0544, 'grad_norm': 12.730721473693848, 'learning_rate': 4.433898177152131e-05, 'epoch': 0.34}


                                                        
 11%|█▏        | 9000/79491 [1:01:38<7:03:49,  2.77it/s]

{'eval_loss': 2.0758607387542725, 'eval_runtime': 23.2463, 'eval_samples_per_second': 37.812, 'eval_steps_per_second': 9.464, 'epoch': 0.34}


 12%|█▏        | 9500/79491 [1:04:40<6:57:58,  2.79it/s]  

{'loss': 2.0654, 'grad_norm': 27.60356330871582, 'learning_rate': 4.402448075882805e-05, 'epoch': 0.36}


                                                        
 12%|█▏        | 9500/79491 [1:05:03<6:57:58,  2.79it/s]

{'eval_loss': 2.046332836151123, 'eval_runtime': 23.2252, 'eval_samples_per_second': 37.847, 'eval_steps_per_second': 9.472, 'epoch': 0.36}


 13%|█▎        | 10000/79491 [1:08:05<6:59:43,  2.76it/s] 

{'loss': 2.0713, 'grad_norm': 18.430025100708008, 'learning_rate': 4.3709979746134785e-05, 'epoch': 0.38}


                                                         
 13%|█▎        | 10000/79491 [1:08:29<6:59:43,  2.76it/s]

{'eval_loss': 2.040895462036133, 'eval_runtime': 23.2229, 'eval_samples_per_second': 37.851, 'eval_steps_per_second': 9.473, 'epoch': 0.38}


 13%|█▎        | 10500/79491 [1:11:31<6:55:06,  2.77it/s]  

{'loss': 2.0377, 'grad_norm': 15.015419006347656, 'learning_rate': 4.339547873344153e-05, 'epoch': 0.4}


                                                         
 13%|█▎        | 10500/79491 [1:11:54<6:55:06,  2.77it/s]

{'eval_loss': 2.0527143478393555, 'eval_runtime': 23.2095, 'eval_samples_per_second': 37.872, 'eval_steps_per_second': 9.479, 'epoch': 0.4}


 14%|█▍        | 11000/79491 [1:14:56<6:47:59,  2.80it/s]  

{'loss': 2.0318, 'grad_norm': 12.541086196899414, 'learning_rate': 4.308097772074826e-05, 'epoch': 0.42}


                                                         
 14%|█▍        | 11000/79491 [1:15:20<6:47:59,  2.80it/s]

{'eval_loss': 2.0159049034118652, 'eval_runtime': 23.2377, 'eval_samples_per_second': 37.826, 'eval_steps_per_second': 9.467, 'epoch': 0.42}


 14%|█▍        | 11500/79491 [1:18:22<6:49:58,  2.76it/s]  

{'loss': 2.0308, 'grad_norm': 12.928508758544922, 'learning_rate': 4.2766476708055006e-05, 'epoch': 0.43}


                                                         
 14%|█▍        | 11500/79491 [1:18:45<6:49:58,  2.76it/s]

{'eval_loss': 2.004016876220703, 'eval_runtime': 23.2219, 'eval_samples_per_second': 37.852, 'eval_steps_per_second': 9.474, 'epoch': 0.43}


 15%|█▌        | 12000/79491 [1:21:47<6:43:55,  2.78it/s]  

{'loss': 2.016, 'grad_norm': 12.658319473266602, 'learning_rate': 4.245197569536174e-05, 'epoch': 0.45}


                                                         
 15%|█▌        | 12000/79491 [1:22:11<6:43:55,  2.78it/s]

{'eval_loss': 1.9922921657562256, 'eval_runtime': 23.2305, 'eval_samples_per_second': 37.838, 'eval_steps_per_second': 9.47, 'epoch': 0.45}


 16%|█▌        | 12500/79491 [1:25:13<6:41:43,  2.78it/s]  

{'loss': 1.9911, 'grad_norm': 12.270023345947266, 'learning_rate': 4.2137474682668484e-05, 'epoch': 0.47}


                                                         
 16%|█▌        | 12500/79491 [1:25:36<6:41:43,  2.78it/s]

{'eval_loss': 1.9845373630523682, 'eval_runtime': 23.2474, 'eval_samples_per_second': 37.811, 'eval_steps_per_second': 9.463, 'epoch': 0.47}


 16%|█▋        | 13000/79491 [1:28:39<6:38:26,  2.78it/s]  

{'loss': 1.9668, 'grad_norm': 11.56966495513916, 'learning_rate': 4.182297366997522e-05, 'epoch': 0.49}


                                                         
 16%|█▋        | 13000/79491 [1:29:02<6:38:26,  2.78it/s]

{'eval_loss': 1.9593982696533203, 'eval_runtime': 23.232, 'eval_samples_per_second': 37.836, 'eval_steps_per_second': 9.47, 'epoch': 0.49}


 17%|█▋        | 13500/79491 [1:32:05<6:35:05,  2.78it/s]  

{'loss': 1.9538, 'grad_norm': 23.33204460144043, 'learning_rate': 4.150847265728196e-05, 'epoch': 0.51}


                                                         
 17%|█▋        | 13500/79491 [1:32:28<6:35:05,  2.78it/s]

{'eval_loss': 1.953465223312378, 'eval_runtime': 23.2348, 'eval_samples_per_second': 37.831, 'eval_steps_per_second': 9.469, 'epoch': 0.51}


 18%|█▊        | 14000/79491 [1:35:31<6:35:26,  2.76it/s]  

{'loss': 1.9498, 'grad_norm': 11.56118392944336, 'learning_rate': 4.11939716445887e-05, 'epoch': 0.53}


                                                         
 18%|█▊        | 14000/79491 [1:35:55<6:35:26,  2.76it/s]

{'eval_loss': 1.9298135042190552, 'eval_runtime': 23.2143, 'eval_samples_per_second': 37.865, 'eval_steps_per_second': 9.477, 'epoch': 0.53}


 18%|█▊        | 14500/79491 [1:38:57<6:29:55,  2.78it/s]  

{'loss': 1.9275, 'grad_norm': 9.465400695800781, 'learning_rate': 4.087947063189544e-05, 'epoch': 0.55}


                                                         
 18%|█▊        | 14500/79491 [1:39:21<6:29:55,  2.78it/s]

{'eval_loss': 1.9236372709274292, 'eval_runtime': 23.2188, 'eval_samples_per_second': 37.857, 'eval_steps_per_second': 9.475, 'epoch': 0.55}


 19%|█▉        | 15000/79491 [1:42:23<6:28:19,  2.77it/s]  

{'loss': 1.9189, 'grad_norm': 9.319644927978516, 'learning_rate': 4.0564969619202176e-05, 'epoch': 0.57}


                                                         
 19%|█▉        | 15000/79491 [1:42:46<6:28:19,  2.77it/s]

{'eval_loss': 1.9207005500793457, 'eval_runtime': 23.2334, 'eval_samples_per_second': 37.833, 'eval_steps_per_second': 9.469, 'epoch': 0.57}


 19%|█▉        | 15500/79491 [1:45:49<6:24:28,  2.77it/s]  

{'loss': 1.9256, 'grad_norm': 15.036065101623535, 'learning_rate': 4.025046860650892e-05, 'epoch': 0.58}


                                                         
 19%|█▉        | 15500/79491 [1:46:12<6:24:28,  2.77it/s]

{'eval_loss': 1.915084719657898, 'eval_runtime': 23.2281, 'eval_samples_per_second': 37.842, 'eval_steps_per_second': 9.471, 'epoch': 0.58}


 20%|██        | 16000/79491 [1:49:14<6:26:50,  2.74it/s]  

{'loss': 1.9096, 'grad_norm': 17.087997436523438, 'learning_rate': 3.9935967593815654e-05, 'epoch': 0.6}


                                                         
 20%|██        | 16000/79491 [1:49:38<6:26:50,  2.74it/s]

{'eval_loss': 1.9052501916885376, 'eval_runtime': 23.2376, 'eval_samples_per_second': 37.827, 'eval_steps_per_second': 9.467, 'epoch': 0.6}


 21%|██        | 16500/79491 [1:52:40<6:18:54,  2.77it/s]  

{'loss': 1.9024, 'grad_norm': 10.575530052185059, 'learning_rate': 3.962146658112239e-05, 'epoch': 0.62}


                                                         
 21%|██        | 16500/79491 [1:53:04<6:18:54,  2.77it/s]

{'eval_loss': 1.9058706760406494, 'eval_runtime': 23.2679, 'eval_samples_per_second': 37.777, 'eval_steps_per_second': 9.455, 'epoch': 0.62}


 21%|██▏       | 17000/79491 [1:56:06<6:15:08,  2.78it/s]  

{'loss': 1.8999, 'grad_norm': 32.764122009277344, 'learning_rate': 3.930696556842913e-05, 'epoch': 0.64}


                                                         
 21%|██▏       | 17000/79491 [1:56:29<6:15:08,  2.78it/s]

{'eval_loss': 1.8793312311172485, 'eval_runtime': 23.2395, 'eval_samples_per_second': 37.824, 'eval_steps_per_second': 9.467, 'epoch': 0.64}


 22%|██▏       | 17500/79491 [1:59:32<6:12:23,  2.77it/s]  

{'loss': 1.8839, 'grad_norm': 11.177746772766113, 'learning_rate': 3.899246455573587e-05, 'epoch': 0.66}


                                                         
 22%|██▏       | 17500/79491 [1:59:56<6:12:23,  2.77it/s]

{'eval_loss': 1.8929705619812012, 'eval_runtime': 23.2136, 'eval_samples_per_second': 37.866, 'eval_steps_per_second': 9.477, 'epoch': 0.66}


 23%|██▎       | 18000/79491 [2:02:59<6:11:06,  2.76it/s]  

{'loss': 1.8903, 'grad_norm': 17.264190673828125, 'learning_rate': 3.867796354304261e-05, 'epoch': 0.68}


                                                         
 23%|██▎       | 18000/79491 [2:03:22<6:11:06,  2.76it/s]

{'eval_loss': 1.889125943183899, 'eval_runtime': 23.2207, 'eval_samples_per_second': 37.854, 'eval_steps_per_second': 9.474, 'epoch': 0.68}


 23%|██▎       | 18500/79491 [2:06:24<6:08:12,  2.76it/s]  

{'loss': 1.8837, 'grad_norm': 9.949320793151855, 'learning_rate': 3.836346253034935e-05, 'epoch': 0.7}


                                                         
 23%|██▎       | 18500/79491 [2:06:48<6:08:12,  2.76it/s]

{'eval_loss': 1.8727738857269287, 'eval_runtime': 23.2423, 'eval_samples_per_second': 37.819, 'eval_steps_per_second': 9.466, 'epoch': 0.7}


 24%|██▍       | 19000/79491 [2:09:50<6:03:46,  2.77it/s]  

{'loss': 1.8737, 'grad_norm': 8.973023414611816, 'learning_rate': 3.804896151765609e-05, 'epoch': 0.72}


                                                         
 24%|██▍       | 19000/79491 [2:10:14<6:03:46,  2.77it/s]

{'eval_loss': 1.8622713088989258, 'eval_runtime': 23.2287, 'eval_samples_per_second': 37.841, 'eval_steps_per_second': 9.471, 'epoch': 0.72}


 25%|██▍       | 19500/79491 [2:13:16<6:00:58,  2.77it/s]  

{'loss': 1.8755, 'grad_norm': 8.106534957885742, 'learning_rate': 3.7734460504962825e-05, 'epoch': 0.74}


                                                         
 25%|██▍       | 19500/79491 [2:13:39<6:00:58,  2.77it/s]

{'eval_loss': 1.8546905517578125, 'eval_runtime': 23.1982, 'eval_samples_per_second': 37.891, 'eval_steps_per_second': 9.484, 'epoch': 0.74}


 25%|██▌       | 20000/79491 [2:16:42<5:56:34,  2.78it/s]  

{'loss': 1.8524, 'grad_norm': 13.795807838439941, 'learning_rate': 3.741995949226957e-05, 'epoch': 0.75}


                                                         
 25%|██▌       | 20000/79491 [2:17:05<5:56:34,  2.78it/s]

{'eval_loss': 1.8508551120758057, 'eval_runtime': 23.2333, 'eval_samples_per_second': 37.834, 'eval_steps_per_second': 9.469, 'epoch': 0.75}


 26%|██▌       | 20500/79491 [2:20:07<5:55:49,  2.76it/s]  

{'loss': 1.842, 'grad_norm': 6.036003589630127, 'learning_rate': 3.71054584795763e-05, 'epoch': 0.77}


                                                         
 26%|██▌       | 20500/79491 [2:20:31<5:55:49,  2.76it/s]

{'eval_loss': 1.834805965423584, 'eval_runtime': 23.2002, 'eval_samples_per_second': 37.888, 'eval_steps_per_second': 9.483, 'epoch': 0.77}


 26%|██▋       | 21000/79491 [2:23:33<5:51:48,  2.77it/s]  

{'loss': 1.8102, 'grad_norm': 9.135393142700195, 'learning_rate': 3.6790957466883046e-05, 'epoch': 0.79}


                                                         
 26%|██▋       | 21000/79491 [2:23:57<5:51:48,  2.77it/s]

{'eval_loss': 1.8206902742385864, 'eval_runtime': 23.242, 'eval_samples_per_second': 37.819, 'eval_steps_per_second': 9.466, 'epoch': 0.79}


 27%|██▋       | 21500/79491 [2:26:59<5:46:02,  2.79it/s]  

{'loss': 1.8295, 'grad_norm': 10.798088073730469, 'learning_rate': 3.647645645418978e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 21500/79491 [2:27:22<5:46:02,  2.79it/s]

{'eval_loss': 1.822757601737976, 'eval_runtime': 23.226, 'eval_samples_per_second': 37.846, 'eval_steps_per_second': 9.472, 'epoch': 0.81}


 28%|██▊       | 22000/79491 [2:30:25<5:45:34,  2.77it/s]  

{'loss': 1.8398, 'grad_norm': 8.866782188415527, 'learning_rate': 3.6161955441496524e-05, 'epoch': 0.83}


                                                         
 28%|██▊       | 22000/79491 [2:30:48<5:45:34,  2.77it/s]

{'eval_loss': 1.8142855167388916, 'eval_runtime': 23.1985, 'eval_samples_per_second': 37.89, 'eval_steps_per_second': 9.483, 'epoch': 0.83}


 28%|██▊       | 22500/79491 [2:33:50<5:40:37,  2.79it/s]  

{'loss': 1.824, 'grad_norm': 17.633291244506836, 'learning_rate': 3.584745442880326e-05, 'epoch': 0.85}


                                                         
 28%|██▊       | 22500/79491 [2:34:14<5:40:37,  2.79it/s]

{'eval_loss': 1.8150358200073242, 'eval_runtime': 23.1997, 'eval_samples_per_second': 37.888, 'eval_steps_per_second': 9.483, 'epoch': 0.85}


 29%|██▉       | 23000/79491 [2:37:16<5:40:01,  2.77it/s]  

{'loss': 1.7987, 'grad_norm': 8.322505950927734, 'learning_rate': 3.553295341611e-05, 'epoch': 0.87}


                                                         
 29%|██▉       | 23000/79491 [2:37:39<5:40:01,  2.77it/s]

{'eval_loss': 1.7996954917907715, 'eval_runtime': 23.2453, 'eval_samples_per_second': 37.814, 'eval_steps_per_second': 9.464, 'epoch': 0.87}


 30%|██▉       | 23500/79491 [2:40:42<5:36:21,  2.77it/s]  

{'loss': 1.7987, 'grad_norm': 7.9520673751831055, 'learning_rate': 3.521845240341674e-05, 'epoch': 0.89}


                                                         
 30%|██▉       | 23500/79491 [2:41:05<5:36:21,  2.77it/s]

{'eval_loss': 1.7921994924545288, 'eval_runtime': 23.2278, 'eval_samples_per_second': 37.843, 'eval_steps_per_second': 9.471, 'epoch': 0.89}


 30%|███       | 24000/79491 [2:44:08<5:35:49,  2.75it/s]  

{'loss': 1.7876, 'grad_norm': 14.093362808227539, 'learning_rate': 3.490395139072348e-05, 'epoch': 0.91}


                                                         
 30%|███       | 24000/79491 [2:44:31<5:35:49,  2.75it/s]

{'eval_loss': 1.7949602603912354, 'eval_runtime': 23.2163, 'eval_samples_per_second': 37.861, 'eval_steps_per_second': 9.476, 'epoch': 0.91}


 31%|███       | 24500/79491 [2:47:33<5:32:30,  2.76it/s]  

{'loss': 1.7947, 'grad_norm': 19.172388076782227, 'learning_rate': 3.4589450378030216e-05, 'epoch': 0.92}


                                                         
 31%|███       | 24500/79491 [2:47:57<5:32:30,  2.76it/s]

{'eval_loss': 1.7946630716323853, 'eval_runtime': 23.2319, 'eval_samples_per_second': 37.836, 'eval_steps_per_second': 9.47, 'epoch': 0.92}


 31%|███▏      | 25000/79491 [2:50:59<5:26:14,  2.78it/s]  

{'loss': 1.7886, 'grad_norm': 6.9366936683654785, 'learning_rate': 3.427494936533696e-05, 'epoch': 0.94}


                                                         
 31%|███▏      | 25000/79491 [2:51:22<5:26:14,  2.78it/s]

{'eval_loss': 1.7775028944015503, 'eval_runtime': 23.2183, 'eval_samples_per_second': 37.858, 'eval_steps_per_second': 9.475, 'epoch': 0.94}


 32%|███▏      | 25500/79491 [2:54:24<5:24:23,  2.77it/s]  

{'loss': 1.7898, 'grad_norm': 10.416181564331055, 'learning_rate': 3.39604483526437e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 25500/79491 [2:54:48<5:24:23,  2.77it/s]

{'eval_loss': 1.7749261856079102, 'eval_runtime': 23.2263, 'eval_samples_per_second': 37.845, 'eval_steps_per_second': 9.472, 'epoch': 0.96}


 33%|███▎      | 26000/79491 [2:57:50<5:23:06,  2.76it/s]  

{'loss': 1.7836, 'grad_norm': 7.397017002105713, 'learning_rate': 3.364594733995044e-05, 'epoch': 0.98}


                                                         
 33%|███▎      | 26000/79491 [2:58:13<5:23:06,  2.76it/s]

{'eval_loss': 1.7719323635101318, 'eval_runtime': 23.234, 'eval_samples_per_second': 37.832, 'eval_steps_per_second': 9.469, 'epoch': 0.98}


 33%|███▎      | 26500/79491 [3:01:15<4:53:59,  3.00it/s]  

{'loss': 1.7772, 'grad_norm': 12.982422828674316, 'learning_rate': 3.333144632725718e-05, 'epoch': 1.0}


                                                         
 33%|███▎      | 26500/79491 [3:01:39<4:53:59,  3.00it/s]

{'eval_loss': 1.7618515491485596, 'eval_runtime': 23.2397, 'eval_samples_per_second': 37.823, 'eval_steps_per_second': 9.467, 'epoch': 1.0}


 34%|███▍      | 27000/79491 [3:04:41<5:16:27,  2.76it/s]  

{'loss': 1.728, 'grad_norm': 7.764881610870361, 'learning_rate': 3.3016945314563915e-05, 'epoch': 1.02}


                                                         
 34%|███▍      | 27000/79491 [3:05:04<5:16:27,  2.76it/s]

{'eval_loss': 1.764837622642517, 'eval_runtime': 23.2666, 'eval_samples_per_second': 37.779, 'eval_steps_per_second': 9.456, 'epoch': 1.02}


 35%|███▍      | 27500/79491 [3:08:07<5:14:37,  2.75it/s]  

{'loss': 1.7233, 'grad_norm': 11.470574378967285, 'learning_rate': 3.270244430187066e-05, 'epoch': 1.04}


                                                         
 35%|███▍      | 27500/79491 [3:08:30<5:14:37,  2.75it/s]

{'eval_loss': 1.7621756792068481, 'eval_runtime': 23.2074, 'eval_samples_per_second': 37.876, 'eval_steps_per_second': 9.48, 'epoch': 1.04}


 35%|███▌      | 28000/79491 [3:11:33<5:12:02,  2.75it/s]  

{'loss': 1.7351, 'grad_norm': 9.544557571411133, 'learning_rate': 3.2387943289177393e-05, 'epoch': 1.06}


                                                         
 35%|███▌      | 28000/79491 [3:11:56<5:12:02,  2.75it/s]

{'eval_loss': 1.7641828060150146, 'eval_runtime': 23.2227, 'eval_samples_per_second': 37.851, 'eval_steps_per_second': 9.473, 'epoch': 1.06}


 36%|███▌      | 28500/79491 [3:14:58<5:05:30,  2.78it/s]  

{'loss': 1.7082, 'grad_norm': 7.137022972106934, 'learning_rate': 3.2073442276484136e-05, 'epoch': 1.08}


                                                         
 36%|███▌      | 28500/79491 [3:15:21<5:05:30,  2.78it/s]

{'eval_loss': 1.7498372793197632, 'eval_runtime': 23.1827, 'eval_samples_per_second': 37.916, 'eval_steps_per_second': 9.49, 'epoch': 1.08}


 36%|███▋      | 29000/79491 [3:18:24<5:03:59,  2.77it/s]  

{'loss': 1.7288, 'grad_norm': 8.469012260437012, 'learning_rate': 3.175894126379087e-05, 'epoch': 1.09}


                                                         
 36%|███▋      | 29000/79491 [3:18:47<5:03:59,  2.77it/s]

{'eval_loss': 1.75425124168396, 'eval_runtime': 23.2185, 'eval_samples_per_second': 37.858, 'eval_steps_per_second': 9.475, 'epoch': 1.09}


 37%|███▋      | 29500/79491 [3:21:49<4:59:52,  2.78it/s]  

{'loss': 1.7186, 'grad_norm': 8.431702613830566, 'learning_rate': 3.1444440251097614e-05, 'epoch': 1.11}


                                                         
 37%|███▋      | 29500/79491 [3:22:12<4:59:52,  2.78it/s]

{'eval_loss': 1.7383538484573364, 'eval_runtime': 23.2257, 'eval_samples_per_second': 37.846, 'eval_steps_per_second': 9.472, 'epoch': 1.11}


 38%|███▊      | 30000/79491 [3:25:15<4:57:19,  2.77it/s]  

{'loss': 1.691, 'grad_norm': 7.143097877502441, 'learning_rate': 3.112993923840435e-05, 'epoch': 1.13}


                                                         
 38%|███▊      | 30000/79491 [3:25:38<4:57:19,  2.77it/s]

{'eval_loss': 1.7412322759628296, 'eval_runtime': 23.2473, 'eval_samples_per_second': 37.811, 'eval_steps_per_second': 9.463, 'epoch': 1.13}


 38%|███▊      | 30500/79491 [3:28:40<4:52:23,  2.79it/s]  

{'loss': 1.7026, 'grad_norm': 7.263406276702881, 'learning_rate': 3.081543822571109e-05, 'epoch': 1.15}


                                                         
 38%|███▊      | 30500/79491 [3:29:03<4:52:23,  2.79it/s]

{'eval_loss': 1.732862114906311, 'eval_runtime': 23.2104, 'eval_samples_per_second': 37.871, 'eval_steps_per_second': 9.479, 'epoch': 1.15}


 39%|███▉      | 31000/79491 [3:32:06<4:56:46,  2.72it/s]  

{'loss': 1.7041, 'grad_norm': 6.563695430755615, 'learning_rate': 3.0500937213017828e-05, 'epoch': 1.17}


                                                         
 39%|███▉      | 31000/79491 [3:32:29<4:56:46,  2.72it/s]

{'eval_loss': 1.7250311374664307, 'eval_runtime': 23.2095, 'eval_samples_per_second': 37.872, 'eval_steps_per_second': 9.479, 'epoch': 1.17}


 40%|███▉      | 31500/79491 [3:35:31<4:48:44,  2.77it/s]  

{'loss': 1.6875, 'grad_norm': 7.121053695678711, 'learning_rate': 3.0186436200324564e-05, 'epoch': 1.19}


                                                         
 40%|███▉      | 31500/79491 [3:35:55<4:48:44,  2.77it/s]

{'eval_loss': 1.7201350927352905, 'eval_runtime': 23.224, 'eval_samples_per_second': 37.849, 'eval_steps_per_second': 9.473, 'epoch': 1.19}


 40%|████      | 32000/79491 [3:38:57<4:45:45,  2.77it/s]  

{'loss': 1.6936, 'grad_norm': 9.30715274810791, 'learning_rate': 2.9871935187631306e-05, 'epoch': 1.21}


                                                         
 40%|████      | 32000/79491 [3:39:20<4:45:45,  2.77it/s]

{'eval_loss': 1.7215172052383423, 'eval_runtime': 23.2047, 'eval_samples_per_second': 37.88, 'eval_steps_per_second': 9.481, 'epoch': 1.21}


 41%|████      | 32500/79491 [3:42:22<4:45:24,  2.74it/s]  

{'loss': 1.6847, 'grad_norm': 9.191243171691895, 'learning_rate': 2.9557434174938042e-05, 'epoch': 1.23}


                                                         
 41%|████      | 32500/79491 [3:42:46<4:45:24,  2.74it/s]

{'eval_loss': 1.7122553586959839, 'eval_runtime': 23.2356, 'eval_samples_per_second': 37.83, 'eval_steps_per_second': 9.468, 'epoch': 1.23}


 42%|████▏     | 33000/79491 [3:45:48<4:39:36,  2.77it/s]  

{'loss': 1.6727, 'grad_norm': 6.223240852355957, 'learning_rate': 2.9242933162244785e-05, 'epoch': 1.25}


                                                         
 42%|████▏     | 33000/79491 [3:46:11<4:39:36,  2.77it/s]

{'eval_loss': 1.707911729812622, 'eval_runtime': 23.2212, 'eval_samples_per_second': 37.853, 'eval_steps_per_second': 9.474, 'epoch': 1.25}


 42%|████▏     | 33500/79491 [3:49:14<4:35:05,  2.79it/s]  

{'loss': 1.6934, 'grad_norm': 8.704690933227539, 'learning_rate': 2.892843214955152e-05, 'epoch': 1.26}


                                                         
 42%|████▏     | 33500/79491 [3:49:37<4:35:05,  2.79it/s]

{'eval_loss': 1.7193617820739746, 'eval_runtime': 23.2445, 'eval_samples_per_second': 37.815, 'eval_steps_per_second': 9.465, 'epoch': 1.26}


 43%|████▎     | 34000/79491 [3:52:39<4:34:02,  2.77it/s]  

{'loss': 1.6727, 'grad_norm': 9.566182136535645, 'learning_rate': 2.8613931136858263e-05, 'epoch': 1.28}


                                                         
 43%|████▎     | 34000/79491 [3:53:03<4:34:02,  2.77it/s]

{'eval_loss': 1.7065935134887695, 'eval_runtime': 23.2004, 'eval_samples_per_second': 37.887, 'eval_steps_per_second': 9.483, 'epoch': 1.28}


 43%|████▎     | 34500/79491 [3:56:05<4:29:04,  2.79it/s] 

{'loss': 1.6865, 'grad_norm': 17.935312271118164, 'learning_rate': 2.8299430124165e-05, 'epoch': 1.3}


                                                         
 43%|████▎     | 34500/79491 [3:56:28<4:29:04,  2.79it/s]

{'eval_loss': 1.7016355991363525, 'eval_runtime': 23.2278, 'eval_samples_per_second': 37.843, 'eval_steps_per_second': 9.471, 'epoch': 1.3}


 44%|████▍     | 35000/79491 [3:59:30<4:27:05,  2.78it/s] 

{'loss': 1.6728, 'grad_norm': 14.67026424407959, 'learning_rate': 2.798492911147174e-05, 'epoch': 1.32}


                                                         
 44%|████▍     | 35000/79491 [3:59:54<4:27:05,  2.78it/s]

{'eval_loss': 1.6975852251052856, 'eval_runtime': 23.241, 'eval_samples_per_second': 37.821, 'eval_steps_per_second': 9.466, 'epoch': 1.32}


 45%|████▍     | 35500/79491 [4:02:56<4:24:22,  2.77it/s] 

{'loss': 1.667, 'grad_norm': 7.629891395568848, 'learning_rate': 2.7670428098778477e-05, 'epoch': 1.34}


                                                         
 45%|████▍     | 35500/79491 [4:03:19<4:24:22,  2.77it/s]

{'eval_loss': 1.6914359331130981, 'eval_runtime': 23.2332, 'eval_samples_per_second': 37.834, 'eval_steps_per_second': 9.469, 'epoch': 1.34}


 45%|████▌     | 36000/79491 [4:06:21<4:19:36,  2.79it/s] 

{'loss': 1.6828, 'grad_norm': 6.078959941864014, 'learning_rate': 2.735592708608522e-05, 'epoch': 1.36}


                                                         
 45%|████▌     | 36000/79491 [4:06:45<4:19:36,  2.79it/s]

{'eval_loss': 1.6953459978103638, 'eval_runtime': 23.2651, 'eval_samples_per_second': 37.782, 'eval_steps_per_second': 9.456, 'epoch': 1.36}


 46%|████▌     | 36500/79491 [4:09:47<4:17:21,  2.78it/s] 

{'loss': 1.6662, 'grad_norm': 5.36260461807251, 'learning_rate': 2.7041426073391955e-05, 'epoch': 1.38}


                                                         
 46%|████▌     | 36500/79491 [4:10:10<4:17:21,  2.78it/s]

{'eval_loss': 1.6856300830841064, 'eval_runtime': 23.2314, 'eval_samples_per_second': 37.837, 'eval_steps_per_second': 9.47, 'epoch': 1.38}


 47%|████▋     | 37000/79491 [4:13:13<4:15:34,  2.77it/s] 

{'loss': 1.6492, 'grad_norm': 6.0414204597473145, 'learning_rate': 2.6726925060698698e-05, 'epoch': 1.4}


                                                         
 47%|████▋     | 37000/79491 [4:13:36<4:15:34,  2.77it/s]

{'eval_loss': 1.6863176822662354, 'eval_runtime': 23.2154, 'eval_samples_per_second': 37.863, 'eval_steps_per_second': 9.476, 'epoch': 1.4}


 47%|████▋     | 37500/79491 [4:16:38<4:12:13,  2.77it/s] 

{'loss': 1.6577, 'grad_norm': 9.852368354797363, 'learning_rate': 2.6412424048005437e-05, 'epoch': 1.42}


                                                         
 47%|████▋     | 37500/79491 [4:17:01<4:12:13,  2.77it/s]

{'eval_loss': 1.681777000427246, 'eval_runtime': 23.1921, 'eval_samples_per_second': 37.901, 'eval_steps_per_second': 9.486, 'epoch': 1.42}


 48%|████▊     | 38000/79491 [4:20:04<4:09:09,  2.78it/s] 

{'loss': 1.6516, 'grad_norm': 8.788723945617676, 'learning_rate': 2.6097923035312176e-05, 'epoch': 1.43}


                                                         
 48%|████▊     | 38000/79491 [4:20:27<4:09:09,  2.78it/s]

{'eval_loss': 1.679007887840271, 'eval_runtime': 23.1831, 'eval_samples_per_second': 37.916, 'eval_steps_per_second': 9.49, 'epoch': 1.43}


 48%|████▊     | 38500/79491 [4:23:29<4:08:43,  2.75it/s] 

{'loss': 1.6564, 'grad_norm': 38.91945266723633, 'learning_rate': 2.5783422022618915e-05, 'epoch': 1.45}


                                                         
 48%|████▊     | 38500/79491 [4:23:52<4:08:43,  2.75it/s]

{'eval_loss': 1.676835060119629, 'eval_runtime': 23.2517, 'eval_samples_per_second': 37.804, 'eval_steps_per_second': 9.462, 'epoch': 1.45}


 49%|████▉     | 39000/79491 [4:26:55<4:02:05,  2.79it/s] 

{'loss': 1.6551, 'grad_norm': 51.93972396850586, 'learning_rate': 2.5468921009925654e-05, 'epoch': 1.47}


                                                         
 49%|████▉     | 39000/79491 [4:27:18<4:02:05,  2.79it/s]

{'eval_loss': 1.6776484251022339, 'eval_runtime': 23.2093, 'eval_samples_per_second': 37.873, 'eval_steps_per_second': 9.479, 'epoch': 1.47}


 50%|████▉     | 39500/79491 [4:30:20<4:02:03,  2.75it/s] 

{'loss': 1.6429, 'grad_norm': 13.745712280273438, 'learning_rate': 2.5154419997232393e-05, 'epoch': 1.49}


                                                         
 50%|████▉     | 39500/79491 [4:30:43<4:02:03,  2.75it/s]

{'eval_loss': 1.6726460456848145, 'eval_runtime': 23.2563, 'eval_samples_per_second': 37.796, 'eval_steps_per_second': 9.46, 'epoch': 1.49}


 50%|█████     | 40000/79491 [4:33:46<3:56:58,  2.78it/s] 

{'loss': 1.6508, 'grad_norm': 8.286460876464844, 'learning_rate': 2.4839918984539132e-05, 'epoch': 1.51}


                                                         
 50%|█████     | 40000/79491 [4:34:09<3:56:58,  2.78it/s]

{'eval_loss': 1.6735340356826782, 'eval_runtime': 23.2312, 'eval_samples_per_second': 37.837, 'eval_steps_per_second': 9.47, 'epoch': 1.51}


 51%|█████     | 40500/79491 [4:37:11<3:52:51,  2.79it/s] 

{'loss': 1.6397, 'grad_norm': 8.323348999023438, 'learning_rate': 2.452541797184587e-05, 'epoch': 1.53}


                                                         
 51%|█████     | 40500/79491 [4:37:34<3:52:51,  2.79it/s]

{'eval_loss': 1.6649380922317505, 'eval_runtime': 23.2168, 'eval_samples_per_second': 37.86, 'eval_steps_per_second': 9.476, 'epoch': 1.53}


 52%|█████▏    | 41000/79491 [4:40:37<3:51:34,  2.77it/s] 

{'loss': 1.6245, 'grad_norm': 9.092350959777832, 'learning_rate': 2.421091695915261e-05, 'epoch': 1.55}


                                                         
 52%|█████▏    | 41000/79491 [4:41:00<3:51:34,  2.77it/s]

{'eval_loss': 1.6569610834121704, 'eval_runtime': 23.2317, 'eval_samples_per_second': 37.836, 'eval_steps_per_second': 9.47, 'epoch': 1.55}


 52%|█████▏    | 41500/79491 [4:44:02<3:46:51,  2.79it/s] 

{'loss': 1.6383, 'grad_norm': 5.207590103149414, 'learning_rate': 2.389641594645935e-05, 'epoch': 1.57}


                                                         
 52%|█████▏    | 41500/79491 [4:44:26<3:46:51,  2.79it/s]

{'eval_loss': 1.653092861175537, 'eval_runtime': 23.2122, 'eval_samples_per_second': 37.868, 'eval_steps_per_second': 9.478, 'epoch': 1.57}


 53%|█████▎    | 42000/79491 [4:47:28<3:44:51,  2.78it/s] 

{'loss': 1.6364, 'grad_norm': 9.39238166809082, 'learning_rate': 2.358191493376609e-05, 'epoch': 1.59}


                                                         
 53%|█████▎    | 42000/79491 [4:47:51<3:44:51,  2.78it/s]

{'eval_loss': 1.6617462635040283, 'eval_runtime': 23.1996, 'eval_samples_per_second': 37.889, 'eval_steps_per_second': 9.483, 'epoch': 1.59}


 53%|█████▎    | 42500/79491 [4:50:53<3:40:57,  2.79it/s] 

{'loss': 1.6389, 'grad_norm': 7.631802082061768, 'learning_rate': 2.3267413921072828e-05, 'epoch': 1.6}


                                                         
 53%|█████▎    | 42500/79491 [4:51:17<3:40:57,  2.79it/s]

{'eval_loss': 1.6507048606872559, 'eval_runtime': 23.2172, 'eval_samples_per_second': 37.86, 'eval_steps_per_second': 9.476, 'epoch': 1.6}


 54%|█████▍    | 43000/79491 [4:54:19<3:39:41,  2.77it/s] 

{'loss': 1.6159, 'grad_norm': 6.513359069824219, 'learning_rate': 2.2952912908379567e-05, 'epoch': 1.62}


                                                         
 54%|█████▍    | 43000/79491 [4:54:42<3:39:41,  2.77it/s]

{'eval_loss': 1.6438158750534058, 'eval_runtime': 23.2727, 'eval_samples_per_second': 37.77, 'eval_steps_per_second': 9.453, 'epoch': 1.62}


 55%|█████▍    | 43500/79491 [4:57:45<3:36:04,  2.78it/s] 

{'loss': 1.6168, 'grad_norm': 7.124067783355713, 'learning_rate': 2.2638411895686303e-05, 'epoch': 1.64}


                                                         
 55%|█████▍    | 43500/79491 [4:58:08<3:36:04,  2.78it/s]

{'eval_loss': 1.641317367553711, 'eval_runtime': 23.2465, 'eval_samples_per_second': 37.812, 'eval_steps_per_second': 9.464, 'epoch': 1.64}


 55%|█████▌    | 44000/79491 [5:01:10<3:36:36,  2.73it/s] 

{'loss': 1.6342, 'grad_norm': 6.290946006774902, 'learning_rate': 2.2323910882993042e-05, 'epoch': 1.66}


                                                         
 55%|█████▌    | 44000/79491 [5:01:34<3:36:36,  2.73it/s]

{'eval_loss': 1.6369121074676514, 'eval_runtime': 23.2391, 'eval_samples_per_second': 37.824, 'eval_steps_per_second': 9.467, 'epoch': 1.66}


 56%|█████▌    | 44500/79491 [5:04:36<3:30:32,  2.77it/s] 

{'loss': 1.6153, 'grad_norm': 5.942343235015869, 'learning_rate': 2.200940987029978e-05, 'epoch': 1.68}


                                                         
 56%|█████▌    | 44500/79491 [5:04:59<3:30:32,  2.77it/s]

{'eval_loss': 1.6328763961791992, 'eval_runtime': 23.2048, 'eval_samples_per_second': 37.88, 'eval_steps_per_second': 9.481, 'epoch': 1.68}


 57%|█████▋    | 45000/79491 [5:08:01<3:29:06,  2.75it/s] 

{'loss': 1.6225, 'grad_norm': 6.264257431030273, 'learning_rate': 2.169490885760652e-05, 'epoch': 1.7}


                                                         
 57%|█████▋    | 45000/79491 [5:08:24<3:29:06,  2.75it/s]

{'eval_loss': 1.631859540939331, 'eval_runtime': 23.2032, 'eval_samples_per_second': 37.883, 'eval_steps_per_second': 9.481, 'epoch': 1.7}


 57%|█████▋    | 45500/79491 [5:11:27<3:24:10,  2.77it/s] 

{'loss': 1.6049, 'grad_norm': 7.745671272277832, 'learning_rate': 2.138040784491326e-05, 'epoch': 1.72}


                                                         
 57%|█████▋    | 45500/79491 [5:11:50<3:24:10,  2.77it/s]

{'eval_loss': 1.630358338356018, 'eval_runtime': 23.2167, 'eval_samples_per_second': 37.861, 'eval_steps_per_second': 9.476, 'epoch': 1.72}


 58%|█████▊    | 46000/79491 [5:14:52<3:22:17,  2.76it/s] 

{'loss': 1.6139, 'grad_norm': 6.3927812576293945, 'learning_rate': 2.1065906832220002e-05, 'epoch': 1.74}


                                                         
 58%|█████▊    | 46000/79491 [5:15:15<3:22:17,  2.76it/s]

{'eval_loss': 1.6242091655731201, 'eval_runtime': 23.2064, 'eval_samples_per_second': 37.877, 'eval_steps_per_second': 9.48, 'epoch': 1.74}


 58%|█████▊    | 46500/79491 [5:18:18<3:17:09,  2.79it/s] 

{'loss': 1.6179, 'grad_norm': 7.455056667327881, 'learning_rate': 2.075140581952674e-05, 'epoch': 1.75}


                                                         
 58%|█████▊    | 46500/79491 [5:18:41<3:17:09,  2.79it/s]

{'eval_loss': 1.6183668375015259, 'eval_runtime': 23.2209, 'eval_samples_per_second': 37.854, 'eval_steps_per_second': 9.474, 'epoch': 1.75}


 59%|█████▉    | 47000/79491 [5:21:43<3:16:57,  2.75it/s] 

{'loss': 1.5972, 'grad_norm': 8.668328285217285, 'learning_rate': 2.043690480683348e-05, 'epoch': 1.77}


                                                         
 59%|█████▉    | 47000/79491 [5:22:07<3:16:57,  2.75it/s]

{'eval_loss': 1.6187729835510254, 'eval_runtime': 23.2126, 'eval_samples_per_second': 37.867, 'eval_steps_per_second': 9.478, 'epoch': 1.77}


 60%|█████▉    | 47500/79491 [5:25:09<3:14:02,  2.75it/s] 

{'loss': 1.6074, 'grad_norm': 6.394242286682129, 'learning_rate': 2.012240379414022e-05, 'epoch': 1.79}


                                                         
 60%|█████▉    | 47500/79491 [5:25:32<3:14:02,  2.75it/s]

{'eval_loss': 1.6160188913345337, 'eval_runtime': 23.1912, 'eval_samples_per_second': 37.902, 'eval_steps_per_second': 9.486, 'epoch': 1.79}


 60%|██████    | 48000/79491 [5:28:35<3:09:27,  2.77it/s] 

{'loss': 1.5925, 'grad_norm': 5.955446243286133, 'learning_rate': 1.980790278144696e-05, 'epoch': 1.81}


                                                         
 60%|██████    | 48000/79491 [5:28:58<3:09:27,  2.77it/s]

{'eval_loss': 1.6095439195632935, 'eval_runtime': 23.2009, 'eval_samples_per_second': 37.886, 'eval_steps_per_second': 9.482, 'epoch': 1.81}


 61%|██████    | 48500/79491 [5:32:00<3:04:55,  2.79it/s] 

{'loss': 1.5818, 'grad_norm': 5.694262981414795, 'learning_rate': 1.9493401768753698e-05, 'epoch': 1.83}


                                                         
 61%|██████    | 48500/79491 [5:32:23<3:04:55,  2.79it/s]

{'eval_loss': 1.605433702468872, 'eval_runtime': 23.2119, 'eval_samples_per_second': 37.868, 'eval_steps_per_second': 9.478, 'epoch': 1.83}


 62%|██████▏   | 49000/79491 [5:35:26<3:03:04,  2.78it/s] 

{'loss': 1.5801, 'grad_norm': 6.3957672119140625, 'learning_rate': 1.9178900756060437e-05, 'epoch': 1.85}


                                                         
 62%|██████▏   | 49000/79491 [5:35:49<3:03:04,  2.78it/s]

{'eval_loss': 1.6000418663024902, 'eval_runtime': 23.2292, 'eval_samples_per_second': 37.84, 'eval_steps_per_second': 9.471, 'epoch': 1.85}


 62%|██████▏   | 49500/79491 [5:38:51<2:59:30,  2.78it/s] 

{'loss': 1.5904, 'grad_norm': 4.503657817840576, 'learning_rate': 1.8864399743367176e-05, 'epoch': 1.87}


                                                         
 62%|██████▏   | 49500/79491 [5:39:14<2:59:30,  2.78it/s]

{'eval_loss': 1.5969667434692383, 'eval_runtime': 23.2381, 'eval_samples_per_second': 37.826, 'eval_steps_per_second': 9.467, 'epoch': 1.87}


 63%|██████▎   | 50000/79491 [5:42:17<2:56:44,  2.78it/s] 

{'loss': 1.5681, 'grad_norm': 4.933863639831543, 'learning_rate': 1.8549898730673915e-05, 'epoch': 1.89}


                                                         
 63%|██████▎   | 50000/79491 [5:42:40<2:56:44,  2.78it/s]

{'eval_loss': 1.5925376415252686, 'eval_runtime': 23.2069, 'eval_samples_per_second': 37.877, 'eval_steps_per_second': 9.48, 'epoch': 1.89}


 64%|██████▎   | 50500/79491 [5:45:42<2:54:20,  2.77it/s] 

{'loss': 1.5731, 'grad_norm': 6.472629070281982, 'learning_rate': 1.8235397717980654e-05, 'epoch': 1.91}


                                                         
 64%|██████▎   | 50500/79491 [5:46:05<2:54:20,  2.77it/s]

{'eval_loss': 1.591697335243225, 'eval_runtime': 23.2327, 'eval_samples_per_second': 37.835, 'eval_steps_per_second': 9.469, 'epoch': 1.91}


 64%|██████▍   | 51000/79491 [5:49:08<2:50:59,  2.78it/s] 

{'loss': 1.5699, 'grad_norm': 8.220708847045898, 'learning_rate': 1.792089670528739e-05, 'epoch': 1.92}


                                                         
 64%|██████▍   | 51000/79491 [5:49:31<2:50:59,  2.78it/s]

{'eval_loss': 1.588707447052002, 'eval_runtime': 23.1946, 'eval_samples_per_second': 37.897, 'eval_steps_per_second': 9.485, 'epoch': 1.92}


 65%|██████▍   | 51500/79491 [5:52:33<2:47:46,  2.78it/s] 

{'loss': 1.5804, 'grad_norm': 5.434292316436768, 'learning_rate': 1.760639569259413e-05, 'epoch': 1.94}


                                                         
 65%|██████▍   | 51500/79491 [5:52:57<2:47:46,  2.78it/s]

{'eval_loss': 1.5885545015335083, 'eval_runtime': 23.243, 'eval_samples_per_second': 37.818, 'eval_steps_per_second': 9.465, 'epoch': 1.94}


 65%|██████▌   | 52000/79491 [5:55:59<2:46:50,  2.75it/s] 

{'loss': 1.5811, 'grad_norm': 5.423194885253906, 'learning_rate': 1.7291894679900868e-05, 'epoch': 1.96}


                                                         
 65%|██████▌   | 52000/79491 [5:56:22<2:46:50,  2.75it/s]

{'eval_loss': 1.5871150493621826, 'eval_runtime': 23.2224, 'eval_samples_per_second': 37.851, 'eval_steps_per_second': 9.474, 'epoch': 1.96}


 66%|██████▌   | 52500/79491 [5:59:25<2:40:39,  2.80it/s] 

{'loss': 1.5648, 'grad_norm': 6.038466930389404, 'learning_rate': 1.6977393667207607e-05, 'epoch': 1.98}


                                                         
 66%|██████▌   | 52500/79491 [5:59:48<2:40:39,  2.80it/s]

{'eval_loss': 1.5846022367477417, 'eval_runtime': 23.2066, 'eval_samples_per_second': 37.877, 'eval_steps_per_second': 9.48, 'epoch': 1.98}


 67%|██████▋   | 53000/79491 [6:02:50<2:36:59,  2.81it/s] 

{'loss': 1.5726, 'grad_norm': 4.918396949768066, 'learning_rate': 1.6662892654514346e-05, 'epoch': 2.0}


                                                         
 67%|██████▋   | 53000/79491 [6:03:14<2:36:59,  2.81it/s]

{'eval_loss': 1.5811796188354492, 'eval_runtime': 23.5268, 'eval_samples_per_second': 37.362, 'eval_steps_per_second': 9.351, 'epoch': 2.0}


 67%|██████▋   | 53500/79491 [6:06:16<2:36:43,  2.76it/s] 

{'loss': 1.5106, 'grad_norm': 4.745438575744629, 'learning_rate': 1.6348391641821085e-05, 'epoch': 2.02}


                                                         
 67%|██████▋   | 53500/79491 [6:06:39<2:36:43,  2.76it/s]

{'eval_loss': 1.5816086530685425, 'eval_runtime': 23.2352, 'eval_samples_per_second': 37.83, 'eval_steps_per_second': 9.468, 'epoch': 2.02}


 68%|██████▊   | 54000/79491 [6:09:42<2:33:19,  2.77it/s] 

{'loss': 1.4973, 'grad_norm': 7.415199279785156, 'learning_rate': 1.6033890629127825e-05, 'epoch': 2.04}


                                                         
 68%|██████▊   | 54000/79491 [6:10:05<2:33:19,  2.77it/s]

{'eval_loss': 1.5772428512573242, 'eval_runtime': 23.2229, 'eval_samples_per_second': 37.851, 'eval_steps_per_second': 9.473, 'epoch': 2.04}


 69%|██████▊   | 54500/79491 [6:13:07<2:29:41,  2.78it/s] 

{'loss': 1.5078, 'grad_norm': 8.346083641052246, 'learning_rate': 1.5719389616434567e-05, 'epoch': 2.06}


                                                         
 69%|██████▊   | 54500/79491 [6:13:30<2:29:41,  2.78it/s]

{'eval_loss': 1.5755925178527832, 'eval_runtime': 23.2244, 'eval_samples_per_second': 37.848, 'eval_steps_per_second': 9.473, 'epoch': 2.06}


 69%|██████▉   | 55000/79491 [6:16:33<2:26:58,  2.78it/s] 

{'loss': 1.5054, 'grad_norm': 5.187618732452393, 'learning_rate': 1.5404888603741306e-05, 'epoch': 2.08}


                                                         
 69%|██████▉   | 55000/79491 [6:16:56<2:26:58,  2.78it/s]

{'eval_loss': 1.5729483366012573, 'eval_runtime': 23.1744, 'eval_samples_per_second': 37.93, 'eval_steps_per_second': 9.493, 'epoch': 2.08}


 70%|██████▉   | 55500/79491 [6:19:58<2:24:35,  2.77it/s] 

{'loss': 1.5013, 'grad_norm': 5.334388256072998, 'learning_rate': 1.5090387591048044e-05, 'epoch': 2.09}


                                                         
 70%|██████▉   | 55500/79491 [6:20:21<2:24:35,  2.77it/s]

{'eval_loss': 1.5703054666519165, 'eval_runtime': 23.2202, 'eval_samples_per_second': 37.855, 'eval_steps_per_second': 9.475, 'epoch': 2.09}


 70%|███████   | 56000/79491 [6:23:24<2:21:32,  2.77it/s] 

{'loss': 1.5041, 'grad_norm': 5.343078136444092, 'learning_rate': 1.4775886578354783e-05, 'epoch': 2.11}


                                                         
 70%|███████   | 56000/79491 [6:23:47<2:21:32,  2.77it/s]

{'eval_loss': 1.5698421001434326, 'eval_runtime': 23.2359, 'eval_samples_per_second': 37.829, 'eval_steps_per_second': 9.468, 'epoch': 2.11}


 71%|███████   | 56500/79491 [6:26:49<2:18:03,  2.78it/s] 

{'loss': 1.4925, 'grad_norm': 5.544467926025391, 'learning_rate': 1.4461385565661522e-05, 'epoch': 2.13}


                                                         
 71%|███████   | 56500/79491 [6:27:13<2:18:03,  2.78it/s]

{'eval_loss': 1.56678307056427, 'eval_runtime': 23.2248, 'eval_samples_per_second': 37.847, 'eval_steps_per_second': 9.473, 'epoch': 2.13}


 72%|███████▏  | 57000/79491 [6:30:15<2:14:00,  2.80it/s] 

{'loss': 1.4982, 'grad_norm': 6.085906982421875, 'learning_rate': 1.4146884552968263e-05, 'epoch': 2.15}


                                                         
 72%|███████▏  | 57000/79491 [6:30:38<2:14:00,  2.80it/s]

{'eval_loss': 1.5641669034957886, 'eval_runtime': 23.234, 'eval_samples_per_second': 37.832, 'eval_steps_per_second': 9.469, 'epoch': 2.15}


 72%|███████▏  | 57500/79491 [6:33:40<2:11:54,  2.78it/s] 

{'loss': 1.4733, 'grad_norm': 7.791744232177734, 'learning_rate': 1.3832383540275002e-05, 'epoch': 2.17}


                                                         
 72%|███████▏  | 57500/79491 [6:34:04<2:11:54,  2.78it/s]

{'eval_loss': 1.564388632774353, 'eval_runtime': 23.2342, 'eval_samples_per_second': 37.832, 'eval_steps_per_second': 9.469, 'epoch': 2.17}


 73%|███████▎  | 58000/79491 [6:37:06<2:09:01,  2.78it/s] 

{'loss': 1.4982, 'grad_norm': 6.732273101806641, 'learning_rate': 1.3517882527581741e-05, 'epoch': 2.19}


                                                         
 73%|███████▎  | 58000/79491 [6:37:29<2:09:01,  2.78it/s]

{'eval_loss': 1.561294674873352, 'eval_runtime': 23.2148, 'eval_samples_per_second': 37.864, 'eval_steps_per_second': 9.477, 'epoch': 2.19}


 74%|███████▎  | 58500/79491 [6:40:32<2:05:35,  2.79it/s] 

{'loss': 1.4808, 'grad_norm': 6.717987060546875, 'learning_rate': 1.320338151488848e-05, 'epoch': 2.21}


                                                         
 74%|███████▎  | 58500/79491 [6:40:55<2:05:35,  2.79it/s]

{'eval_loss': 1.55855393409729, 'eval_runtime': 23.2454, 'eval_samples_per_second': 37.814, 'eval_steps_per_second': 9.464, 'epoch': 2.21}


 74%|███████▍  | 59000/79491 [6:43:57<2:03:32,  2.76it/s] 

{'loss': 1.4859, 'grad_norm': 6.476016998291016, 'learning_rate': 1.2888880502195216e-05, 'epoch': 2.23}


                                                         
 74%|███████▍  | 59000/79491 [6:44:21<2:03:32,  2.76it/s]

{'eval_loss': 1.55461585521698, 'eval_runtime': 23.1954, 'eval_samples_per_second': 37.896, 'eval_steps_per_second': 9.485, 'epoch': 2.23}


 75%|███████▍  | 59500/79491 [6:47:23<1:59:17,  2.79it/s] 

{'loss': 1.4911, 'grad_norm': 22.409255981445312, 'learning_rate': 1.2574379489501955e-05, 'epoch': 2.25}


                                                         
 75%|███████▍  | 59500/79491 [6:47:46<1:59:17,  2.79it/s]

{'eval_loss': 1.553324818611145, 'eval_runtime': 23.246, 'eval_samples_per_second': 37.813, 'eval_steps_per_second': 9.464, 'epoch': 2.25}


 75%|███████▌  | 60000/79491 [6:50:49<1:57:51,  2.76it/s] 

{'loss': 1.4905, 'grad_norm': 7.139934539794922, 'learning_rate': 1.2259878476808696e-05, 'epoch': 2.26}


                                                         
 75%|███████▌  | 60000/79491 [6:51:12<1:57:51,  2.76it/s]

{'eval_loss': 1.5498844385147095, 'eval_runtime': 23.1931, 'eval_samples_per_second': 37.899, 'eval_steps_per_second': 9.486, 'epoch': 2.26}


 76%|███████▌  | 60500/79491 [6:54:14<1:53:19,  2.79it/s] 

{'loss': 1.4766, 'grad_norm': 6.080734729766846, 'learning_rate': 1.1945377464115435e-05, 'epoch': 2.28}


                                                         
 76%|███████▌  | 60500/79491 [6:54:37<1:53:19,  2.79it/s]

{'eval_loss': 1.5485265254974365, 'eval_runtime': 23.2381, 'eval_samples_per_second': 37.826, 'eval_steps_per_second': 9.467, 'epoch': 2.28}


 77%|███████▋  | 61000/79491 [6:57:40<1:52:11,  2.75it/s] 

{'loss': 1.4839, 'grad_norm': 5.298403263092041, 'learning_rate': 1.1630876451422174e-05, 'epoch': 2.3}


                                                         
 77%|███████▋  | 61000/79491 [6:58:03<1:52:11,  2.75it/s]

{'eval_loss': 1.545462965965271, 'eval_runtime': 23.2188, 'eval_samples_per_second': 37.857, 'eval_steps_per_second': 9.475, 'epoch': 2.3}


 77%|███████▋  | 61500/79491 [7:01:05<1:47:49,  2.78it/s] 

{'loss': 1.4768, 'grad_norm': 5.22205114364624, 'learning_rate': 1.1316375438728913e-05, 'epoch': 2.32}


                                                         
 77%|███████▋  | 61500/79491 [7:01:29<1:47:49,  2.78it/s]

{'eval_loss': 1.5444835424423218, 'eval_runtime': 23.2307, 'eval_samples_per_second': 37.838, 'eval_steps_per_second': 9.47, 'epoch': 2.32}


 78%|███████▊  | 62000/79491 [7:04:31<1:45:05,  2.77it/s] 

{'loss': 1.4744, 'grad_norm': 5.829160213470459, 'learning_rate': 1.1001874426035652e-05, 'epoch': 2.34}


                                                         
 78%|███████▊  | 62000/79491 [7:04:54<1:45:05,  2.77it/s]

{'eval_loss': 1.5442696809768677, 'eval_runtime': 23.1945, 'eval_samples_per_second': 37.897, 'eval_steps_per_second': 9.485, 'epoch': 2.34}


 79%|███████▊  | 62500/79491 [7:07:56<1:41:32,  2.79it/s] 

{'loss': 1.4771, 'grad_norm': 6.90127420425415, 'learning_rate': 1.0687373413342391e-05, 'epoch': 2.36}


                                                         
 79%|███████▊  | 62500/79491 [7:08:20<1:41:32,  2.79it/s]

{'eval_loss': 1.5402623414993286, 'eval_runtime': 23.216, 'eval_samples_per_second': 37.862, 'eval_steps_per_second': 9.476, 'epoch': 2.36}


 79%|███████▉  | 63000/79491 [7:11:22<1:38:57,  2.78it/s] 

{'loss': 1.4715, 'grad_norm': 6.317216873168945, 'learning_rate': 1.037287240064913e-05, 'epoch': 2.38}


                                                         
 79%|███████▉  | 63000/79491 [7:11:45<1:38:57,  2.78it/s]

{'eval_loss': 1.537973403930664, 'eval_runtime': 23.2238, 'eval_samples_per_second': 37.849, 'eval_steps_per_second': 9.473, 'epoch': 2.38}


 80%|███████▉  | 63500/79491 [7:14:47<1:36:16,  2.77it/s] 

{'loss': 1.4628, 'grad_norm': 6.071233749389648, 'learning_rate': 1.005837138795587e-05, 'epoch': 2.4}


                                                         
 80%|███████▉  | 63500/79491 [7:15:10<1:36:16,  2.77it/s]

{'eval_loss': 1.5393320322036743, 'eval_runtime': 23.2077, 'eval_samples_per_second': 37.875, 'eval_steps_per_second': 9.48, 'epoch': 2.4}


 81%|████████  | 64000/79491 [7:18:13<1:33:48,  2.75it/s] 

{'loss': 1.4703, 'grad_norm': 6.63535737991333, 'learning_rate': 9.743870375262609e-06, 'epoch': 2.42}


                                                         
 81%|████████  | 64000/79491 [7:18:36<1:33:48,  2.75it/s]

{'eval_loss': 1.5375456809997559, 'eval_runtime': 23.198, 'eval_samples_per_second': 37.891, 'eval_steps_per_second': 9.484, 'epoch': 2.42}


 81%|████████  | 64500/79491 [7:21:38<1:30:26,  2.76it/s] 

{'loss': 1.4711, 'grad_norm': 7.150820732116699, 'learning_rate': 9.429369362569348e-06, 'epoch': 2.43}


                                                         
 81%|████████  | 64500/79491 [7:22:01<1:30:26,  2.76it/s]

{'eval_loss': 1.5341291427612305, 'eval_runtime': 23.2179, 'eval_samples_per_second': 37.859, 'eval_steps_per_second': 9.475, 'epoch': 2.43}


 82%|████████▏ | 65000/79491 [7:25:03<1:27:08,  2.77it/s] 

{'loss': 1.4747, 'grad_norm': 4.902370929718018, 'learning_rate': 9.114868349876087e-06, 'epoch': 2.45}


                                                         
 82%|████████▏ | 65000/79491 [7:25:27<1:27:08,  2.77it/s]

{'eval_loss': 1.5306953191757202, 'eval_runtime': 23.2477, 'eval_samples_per_second': 37.81, 'eval_steps_per_second': 9.463, 'epoch': 2.45}


 82%|████████▏ | 65500/79491 [7:28:29<1:24:04,  2.77it/s] 

{'loss': 1.4714, 'grad_norm': 5.527097225189209, 'learning_rate': 8.800367337182826e-06, 'epoch': 2.47}


                                                         
 82%|████████▏ | 65500/79491 [7:28:52<1:24:04,  2.77it/s]

{'eval_loss': 1.5305064916610718, 'eval_runtime': 23.2148, 'eval_samples_per_second': 37.864, 'eval_steps_per_second': 9.477, 'epoch': 2.47}


 83%|████████▎ | 66000/79491 [7:31:54<1:21:04,  2.77it/s] 

{'loss': 1.482, 'grad_norm': 5.373748779296875, 'learning_rate': 8.485866324489565e-06, 'epoch': 2.49}


                                                         
 83%|████████▎ | 66000/79491 [7:32:18<1:21:04,  2.77it/s]

{'eval_loss': 1.5292952060699463, 'eval_runtime': 23.2136, 'eval_samples_per_second': 37.866, 'eval_steps_per_second': 9.477, 'epoch': 2.49}


 84%|████████▎ | 66500/79491 [7:35:20<1:17:48,  2.78it/s] 

{'loss': 1.4688, 'grad_norm': 8.003175735473633, 'learning_rate': 8.171365311796304e-06, 'epoch': 2.51}


                                                         
 84%|████████▎ | 66500/79491 [7:35:43<1:17:48,  2.78it/s]

{'eval_loss': 1.5268241167068481, 'eval_runtime': 23.216, 'eval_samples_per_second': 37.862, 'eval_steps_per_second': 9.476, 'epoch': 2.51}


 84%|████████▍ | 67000/79491 [7:38:45<1:15:06,  2.77it/s] 

{'loss': 1.4603, 'grad_norm': 5.425024509429932, 'learning_rate': 7.856864299103044e-06, 'epoch': 2.53}


                                                         
 84%|████████▍ | 67000/79491 [7:39:09<1:15:06,  2.77it/s]

{'eval_loss': 1.524277687072754, 'eval_runtime': 23.2319, 'eval_samples_per_second': 37.836, 'eval_steps_per_second': 9.47, 'epoch': 2.53}


 85%|████████▍ | 67500/79491 [7:42:11<1:12:00,  2.78it/s] 

{'loss': 1.4673, 'grad_norm': 5.901165962219238, 'learning_rate': 7.542363286409783e-06, 'epoch': 2.55}


                                                         
 85%|████████▍ | 67500/79491 [7:42:34<1:12:00,  2.78it/s]

{'eval_loss': 1.5230025053024292, 'eval_runtime': 23.2436, 'eval_samples_per_second': 37.817, 'eval_steps_per_second': 9.465, 'epoch': 2.55}


 86%|████████▌ | 68000/79491 [7:45:36<1:08:46,  2.78it/s] 

{'loss': 1.468, 'grad_norm': 6.187441825866699, 'learning_rate': 7.227862273716522e-06, 'epoch': 2.57}


                                                         
 86%|████████▌ | 68000/79491 [7:45:59<1:08:46,  2.78it/s]

{'eval_loss': 1.5205508470535278, 'eval_runtime': 23.2355, 'eval_samples_per_second': 37.83, 'eval_steps_per_second': 9.468, 'epoch': 2.57}


 86%|████████▌ | 68500/79491 [7:49:02<1:05:56,  2.78it/s] 

{'loss': 1.4626, 'grad_norm': 12.988409996032715, 'learning_rate': 6.913361261023262e-06, 'epoch': 2.59}


                                                         
 86%|████████▌ | 68500/79491 [7:49:25<1:05:56,  2.78it/s]

{'eval_loss': 1.518417477607727, 'eval_runtime': 23.1931, 'eval_samples_per_second': 37.899, 'eval_steps_per_second': 9.486, 'epoch': 2.59}


 87%|████████▋ | 69000/79491 [7:52:27<1:02:46,  2.79it/s] 

{'loss': 1.4608, 'grad_norm': 7.0739922523498535, 'learning_rate': 6.598860248329999e-06, 'epoch': 2.6}


                                                         
 87%|████████▋ | 69000/79491 [7:52:50<1:02:46,  2.79it/s]

{'eval_loss': 1.5167135000228882, 'eval_runtime': 23.2377, 'eval_samples_per_second': 37.826, 'eval_steps_per_second': 9.467, 'epoch': 2.6}


 87%|████████▋ | 69500/79491 [7:55:53<59:30,  2.80it/s]   

{'loss': 1.4491, 'grad_norm': 9.121662139892578, 'learning_rate': 6.284359235636738e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 69500/79491 [7:56:16<59:30,  2.80it/s]

{'eval_loss': 1.5152748823165894, 'eval_runtime': 23.2177, 'eval_samples_per_second': 37.859, 'eval_steps_per_second': 9.476, 'epoch': 2.62}


 88%|████████▊ | 70000/79491 [7:59:18<56:54,  2.78it/s]   

{'loss': 1.4541, 'grad_norm': 5.5099663734436035, 'learning_rate': 5.969858222943478e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 70000/79491 [7:59:41<56:54,  2.78it/s]

{'eval_loss': 1.5138604640960693, 'eval_runtime': 23.2061, 'eval_samples_per_second': 37.878, 'eval_steps_per_second': 9.48, 'epoch': 2.64}


 89%|████████▊ | 70500/79491 [8:02:44<54:07,  2.77it/s]   

{'loss': 1.4392, 'grad_norm': 5.5505051612854, 'learning_rate': 5.6553572102502174e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 70500/79491 [8:03:07<54:07,  2.77it/s]

{'eval_loss': 1.5119668245315552, 'eval_runtime': 23.2087, 'eval_samples_per_second': 37.874, 'eval_steps_per_second': 9.479, 'epoch': 2.66}


 89%|████████▉ | 71000/79491 [8:06:09<51:16,  2.76it/s]   

{'loss': 1.4307, 'grad_norm': 5.57731819152832, 'learning_rate': 5.3408561975569566e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 71000/79491 [8:06:32<51:16,  2.76it/s]

{'eval_loss': 1.5101814270019531, 'eval_runtime': 23.2163, 'eval_samples_per_second': 37.861, 'eval_steps_per_second': 9.476, 'epoch': 2.68}


 90%|████████▉ | 71500/79491 [8:09:34<48:27,  2.75it/s]   

{'loss': 1.4489, 'grad_norm': 4.610359191894531, 'learning_rate': 5.026355184863696e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 71500/79491 [8:09:58<48:27,  2.75it/s]

{'eval_loss': 1.5083388090133667, 'eval_runtime': 23.2019, 'eval_samples_per_second': 37.885, 'eval_steps_per_second': 9.482, 'epoch': 2.7}


 91%|█████████ | 72000/79491 [8:13:00<44:59,  2.77it/s]   

{'loss': 1.4522, 'grad_norm': 4.758246898651123, 'learning_rate': 4.711854172170435e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 72000/79491 [8:13:23<44:59,  2.77it/s]

{'eval_loss': 1.507104516029358, 'eval_runtime': 23.2108, 'eval_samples_per_second': 37.87, 'eval_steps_per_second': 9.478, 'epoch': 2.72}


 91%|█████████ | 72500/79491 [8:16:25<41:50,  2.78it/s]   

{'loss': 1.4433, 'grad_norm': 6.058782577514648, 'learning_rate': 4.397353159477174e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 72500/79491 [8:16:49<41:50,  2.78it/s]

{'eval_loss': 1.5056802034378052, 'eval_runtime': 23.2295, 'eval_samples_per_second': 37.84, 'eval_steps_per_second': 9.471, 'epoch': 2.74}


 92%|█████████▏| 73000/79491 [8:19:51<39:15,  2.76it/s]   

{'loss': 1.432, 'grad_norm': 5.0160298347473145, 'learning_rate': 4.082852146783913e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 73000/79491 [8:20:14<39:15,  2.76it/s]

{'eval_loss': 1.5047298669815063, 'eval_runtime': 23.2202, 'eval_samples_per_second': 37.855, 'eval_steps_per_second': 9.475, 'epoch': 2.76}


 92%|█████████▏| 73500/79491 [8:23:16<36:04,  2.77it/s]   

{'loss': 1.4503, 'grad_norm': 4.958550453186035, 'learning_rate': 3.768351134090652e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 73500/79491 [8:23:39<36:04,  2.77it/s]

{'eval_loss': 1.50277578830719, 'eval_runtime': 23.1979, 'eval_samples_per_second': 37.891, 'eval_steps_per_second': 9.484, 'epoch': 2.77}


 93%|█████████▎| 74000/79491 [8:26:42<33:36,  2.72it/s]   

{'loss': 1.4365, 'grad_norm': 4.759469985961914, 'learning_rate': 3.453850121397391e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 74000/79491 [8:27:05<33:36,  2.72it/s]

{'eval_loss': 1.5009868144989014, 'eval_runtime': 23.1697, 'eval_samples_per_second': 37.938, 'eval_steps_per_second': 9.495, 'epoch': 2.79}


 94%|█████████▎| 74500/79491 [8:30:07<30:08,  2.76it/s]   

{'loss': 1.4432, 'grad_norm': 6.176833152770996, 'learning_rate': 3.13934910870413e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 74500/79491 [8:30:30<30:08,  2.76it/s]

{'eval_loss': 1.499476671218872, 'eval_runtime': 23.206, 'eval_samples_per_second': 37.878, 'eval_steps_per_second': 9.48, 'epoch': 2.81}


 94%|█████████▍| 75000/79491 [8:33:32<26:53,  2.78it/s]   

{'loss': 1.4163, 'grad_norm': 4.648059368133545, 'learning_rate': 2.824848096010869e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 75000/79491 [8:33:56<26:53,  2.78it/s]

{'eval_loss': 1.4986878633499146, 'eval_runtime': 23.2431, 'eval_samples_per_second': 37.818, 'eval_steps_per_second': 9.465, 'epoch': 2.83}


 95%|█████████▍| 75500/79491 [8:36:58<23:56,  2.78it/s]  

{'loss': 1.4248, 'grad_norm': 8.097125053405762, 'learning_rate': 2.5103470833176083e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 75500/79491 [8:37:21<23:56,  2.78it/s]

{'eval_loss': 1.49735689163208, 'eval_runtime': 23.241, 'eval_samples_per_second': 37.821, 'eval_steps_per_second': 9.466, 'epoch': 2.85}


 96%|█████████▌| 76000/79491 [8:40:24<21:01,  2.77it/s]  

{'loss': 1.4392, 'grad_norm': 5.109857082366943, 'learning_rate': 2.1958460706243474e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 76000/79491 [8:40:47<21:01,  2.77it/s]

{'eval_loss': 1.4961588382720947, 'eval_runtime': 23.1816, 'eval_samples_per_second': 37.918, 'eval_steps_per_second': 9.49, 'epoch': 2.87}


 96%|█████████▌| 76500/79491 [8:43:49<17:53,  2.79it/s]  

{'loss': 1.4264, 'grad_norm': 7.628746032714844, 'learning_rate': 1.8813450579310867e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 76500/79491 [8:44:12<17:53,  2.79it/s]

{'eval_loss': 1.4945080280303955, 'eval_runtime': 23.2359, 'eval_samples_per_second': 37.829, 'eval_steps_per_second': 9.468, 'epoch': 2.89}


 97%|█████████▋| 77000/79491 [8:47:15<14:54,  2.79it/s]  

{'loss': 1.4364, 'grad_norm': 5.212543487548828, 'learning_rate': 1.5668440452378257e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 77000/79491 [8:47:38<14:54,  2.79it/s]

{'eval_loss': 1.4939227104187012, 'eval_runtime': 23.2048, 'eval_samples_per_second': 37.88, 'eval_steps_per_second': 9.481, 'epoch': 2.91}


 97%|█████████▋| 77500/79491 [8:50:40<11:53,  2.79it/s]  

{'loss': 1.4164, 'grad_norm': 5.485532760620117, 'learning_rate': 1.2523430325445648e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 77500/79491 [8:51:03<11:53,  2.79it/s]

{'eval_loss': 1.493747591972351, 'eval_runtime': 23.2126, 'eval_samples_per_second': 37.867, 'eval_steps_per_second': 9.478, 'epoch': 2.92}


 98%|█████████▊| 78000/79491 [8:54:05<08:55,  2.78it/s]  

{'loss': 1.4198, 'grad_norm': 4.510046482086182, 'learning_rate': 9.37842019851304e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 78000/79491 [8:54:29<08:55,  2.78it/s]

{'eval_loss': 1.4929429292678833, 'eval_runtime': 23.2233, 'eval_samples_per_second': 37.85, 'eval_steps_per_second': 9.473, 'epoch': 2.94}


 99%|█████████▉| 78500/79491 [8:57:31<05:59,  2.76it/s]  

{'loss': 1.4452, 'grad_norm': 5.519307613372803, 'learning_rate': 6.233410071580431e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 78500/79491 [8:57:54<05:59,  2.76it/s]

{'eval_loss': 1.4919567108154297, 'eval_runtime': 23.2316, 'eval_samples_per_second': 37.836, 'eval_steps_per_second': 9.47, 'epoch': 2.96}


 99%|█████████▉| 79000/79491 [9:00:56<02:57,  2.77it/s]  

{'loss': 1.4342, 'grad_norm': 5.5116753578186035, 'learning_rate': 3.088399944647822e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 79000/79491 [9:01:20<02:57,  2.77it/s]

{'eval_loss': 1.491692304611206, 'eval_runtime': 23.3051, 'eval_samples_per_second': 37.717, 'eval_steps_per_second': 9.44, 'epoch': 2.98}


100%|██████████| 79491/79491 [9:04:19<00:00,  2.43it/s]  

{'train_runtime': 32659.7658, 'train_samples_per_second': 9.735, 'train_steps_per_second': 2.434, 'train_loss': 1.6984189284694966, 'epoch': 3.0}





TrainOutput(global_step=79491, training_loss=1.6984189284694966, metrics={'train_runtime': 32659.7658, 'train_samples_per_second': 9.735, 'train_steps_per_second': 2.434, 'total_flos': 1.2079439755739136e+17, 'train_loss': 1.6984189284694966, 'epoch': 3.0})

In [11]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "Once upon a time in a magical forest"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Story:
Once upon a time in a magical forest, there lived a little girl named Lily. She loved to play in the forest and explore the forest. One day, she found a big, shiny rock. She picked it up and showed it to her mom.

"Look, Mommy! I found a pretty rock!" Lily said.

Her mom smiled and said, "That's a very special rock, Lily. It's very special because it's a magic rock. It can make anything you wish for."

Lily was so excited to have found the rock. She put it in her pocket and ran home to show her mom.

"Look, Mommy! I found a pretty rock!" Lily said.



In [12]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "One day I was walking in an ancient desert when"
generated_story = story_generator(prompt, max_length=150, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])


Generated Story:
One day I was walking in an ancient desert when suddenly a giant appeared. He was huge and strong and he was very strong. He was so big that he could almost touch the sky!

The giant was very angry and he shouted at the giant. He said, "You are too big for me! I will destroy you!"

The giant was very scared and he ran away. He was so scared that he ran away as fast as he could.

The giant was so angry that he ran away and never came back. He was so sad that he never saw the giant again.

The end. The giant was gone and the giant was gone. The giant was gone and the giant was gone. The giant was


In [13]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a lion with a weird big nose"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])


Generated Story:
I know a lion with a weird big nose. He was very big and had a long tail. He was very hairy and had a long tail. He liked to roar very loud.

One day, a little girl came to the lion. She saw the lion and said, "Hello, Mr. Lion. What are you doing?"

The lion replied, "I am roaring because I am a lion. I am very hairy and have a big nose."

The little girl said, "That is a funny name. Can you teach me how to roar?"

The lion said, "Sure, I can teach you. But first, you have to roar very loud. Then you have to roar very loud."

The little girl said, "OK, I will roar very loud. But you have to be careful. The lion is very big and has a lot of hairy skin."

The lion said, "OK, I will be careful. I will roar very loud. Thank you for teaching me."

The little girl said, "You're welcome, Mr. Lion. You are very kind and brave. I will roar very loud."

The lion said,


In [14]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a giraffe with a little neck"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])


Generated Story:
I know a giraffe with a little neck. He was very small and had a big trunk. He was very proud of himself.

One day, he saw a little girl walking by. She was wearing a pretty dress and had a big smile on her face.

"Hi, I'm Anna. What's your name?" she asked.

"I'm Anna. What's yours?" he asked.

"I'm Anna. I'm a giraffe. Do you want to play with me?" she said.

"Yes, please!" Anna said.

Anna and Anna played together for a while. They had so much fun.

But then, Anna started to feel tired. She had been running around and had been running around. She was so tired that she had to stop.

"Anna, I'm tired. I'm tired. Can you help me?" she asked.

Anna smiled and said, "Of course, I can help you. Let's go home and rest."

Anna and Anna went home and rested. They were happy and tired. They had a good day. They were both very tired.

The end. Anna and Anna were


In [15]:
from transformers import pipeline

# Create a text generation pipeline using the fine-tuned model
story_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt to generate a story
prompt = "I know a leon with a cute tail"
generated_story = story_generator(prompt, max_length=250, num_return_sequences=1)

# Display the generated story
print("Generated Story:")
print(generated_story[0]['generated_text'])


Generated Story:
I know a leon with a cute tail. It was a very special leon. It was a very special leon because it could talk!

One day, a little girl named Lily wanted to try the leon. She asked her mom if she could have it. Her mom said yes and gave her the leon.

Lily was so happy. She took the leon and ran to show her friends. They all wanted to try the leon too.

Lily and her friends tried the leon. It was so fun! They all laughed and cheered.

Lily was so proud of her new leon. She had used it to talk to her friends and make them laugh. She was so happy that she had found the leon.

The end. Lily and her friends were very proud of their new lemons. They all had a great time playing with them. They were very proud of their new lemons. They all agreed that they had a very special and special lemons. They all lived happily ever after. The end. The end. The end. The end. The end. The end. The end. The end. The end. The end.


In [17]:
import torch
import numpy as np

# Set device to CUDA if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make sure your model is on the right device
model = model.to(device)

# Evaluate the model on the validation set
print("Evaluating model on validation set...")

eval_results = trainer.evaluate()

print("=== VALIDATION SET EVALUATION RESULTS ===")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

# Calculate perplexity
perplexity = np.exp(eval_results["eval_loss"])
print(f"\nPerplexity: {perplexity:.2f}")

# Interpret model quality based on perplexity
if perplexity < 50:
    quality = "Excellent"
elif perplexity < 100:
    quality = "Good"
elif perplexity < 200:
    quality = "Fair"
else:
    quality = "Needs Improvement"

print(f"Model Quality: {quality}")


Evaluating model on validation set...


100%|██████████| 220/220 [00:23<00:00,  9.51it/s]

=== VALIDATION SET EVALUATION RESULTS ===
eval_loss: 1.4917
eval_runtime: 23.2166
eval_samples_per_second: 37.8610
eval_steps_per_second: 9.4760
epoch: 3.0000

Perplexity: 4.44
Model Quality: Excellent





In [22]:
import torch
import tqdm
import numpy as np
from datasets import load_dataset

# === SETUP: Load tokenizer, model, and embedder ===
# Replace these with your own paths/models
# tokenizer = ...
# model = ...
# embedder = ...
# Example:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
# model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load 5% validation set
val_data = load_dataset("roneneldan/TinyStories", split="validation").shuffle(seed=123).select(range(2489))

prompt_token_count = 20
num_generations = 5
max_gen_length = 150  # Fixed generation length, not dependent on gold text
num_examples = 100

cosine_scores, overlap_scores, diversity_scores = [], [], []

for i in tqdm.tqdm(range(len(val_data)), desc="Evaluating"):
    example = val_data[i]
    text = example['text'].strip()
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= prompt_token_count + 1:
        continue

    prompt_tokens = tokens[:prompt_token_count]
    gold_tokens = tokens[prompt_token_count:]
    prompt = tokenizer.convert_tokens_to_string(prompt_tokens)
    gold = tokenizer.convert_tokens_to_string(gold_tokens)

    enc = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = enc.input_ids.to(device)
    attention_mask = enc.attention_mask.to(device)

    # OPT: Use eos_token_id as pad_token_id if missing
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=len(input_ids[0]) + max_gen_length,  # Fixed generation length
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_generations,
        pad_token_id=pad_token_id
    )

    gens = [tokenizer.decode(g[input_ids.shape[1]:], skip_special_tokens=True).strip() for g in output_ids]

    # 1. Cosine similarity: Compare GENERATED text vs GOLD text
    gen_cosine_scores = []
    for gen in gens:
        if gen.strip():  # Only process non-empty generations
            emb_gen = embedder.encode([gen])
            emb_gold = embedder.encode([gold])
            from sklearn.metrics.pairwise import cosine_similarity
            cosine = float(cosine_similarity(emb_gen, emb_gold)[0, 0])
            gen_cosine_scores.append(cosine)

    if gen_cosine_scores:
        cosine_scores.append(np.mean(gen_cosine_scores))  # Average across generations

    # 2. Unigram overlap: Compare GENERATED text vs GOLD text
    gen_overlap_scores = []
    gold_token_set = set(tokenizer.tokenize(gold))
    for gen in gens:
        if gen.strip():
            gen_token_set = set(tokenizer.tokenize(gen))
            if gen_token_set:  # Avoid division by zero
                overlap = len(gen_token_set & gold_token_set) / len(gen_token_set)
                gen_overlap_scores.append(overlap)

    if gen_overlap_scores:
        overlap_scores.append(np.mean(gen_overlap_scores))  # Average across generations

    # 3. Diversity: Measure diversity WITHIN each generation, then average
    gen_diversity_scores = []
    for gen in gens:
        if gen.strip():
            gen_tokens = tokenizer.tokenize(gen)
            if gen_tokens:  # Avoid division by zero
                diversity = len(set(gen_tokens)) / len(gen_tokens)
                gen_diversity_scores.append(diversity)

    if gen_diversity_scores:
        diversity_scores.append(np.mean(gen_diversity_scores))  # Average across generations

    if len(cosine_scores) >= num_examples:
        break

def summarize(name, arr):
    arr = np.array(arr)
    return f"{name}: mean={arr.mean():.4f}, std={arr.std():.4f}"

print("\n=== Evaluation Results ===")
print(summarize("Cosine similarity (generated vs gold)", cosine_scores))
print(summarize("Unigram overlap (generated vs gold)", overlap_scores))
print(summarize("Diversity (distinct-1 within generations)", diversity_scores))

# Additional useful metrics for story generation
print("\n=== Additional Story Generation Metrics ===")
print(f"Total examples evaluated: {len(cosine_scores)}")
print(f"Average generation length: {np.mean([len(tokenizer.tokenize(g)) for g in gens if g.strip()]):.1f} tokens")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Evaluating:   4%|▍         | 99/2489 [02:23<57:49,  1.45s/it]  


=== Evaluation Results ===
Cosine similarity (generated vs gold): mean=0.6060, std=0.0772
Unigram overlap (generated vs gold): mean=0.3754, std=0.0682
Diversity (distinct-1 within generations): mean=0.5138, std=0.0288

=== Additional Story Generation Metrics ===
Total examples evaluated: 100
Average generation length: 150.2 tokens





In [None]:
# ========================================
# CELL 1: Mount Google Drive and Install Dependencies
# ========================================

from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Install required packages
!pip install transformers torch flask pyngrok sentence-transformers -q

print("✅ Google Drive mounted and dependencies installed!")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer
import gc

# Update this path to your model location in Google Drive
MODEL_PATH = "/content/drive/MyDrive/Model"  # Update this path!

print("🔄 Loading model and tokenizer...")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load OPT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token

# Load model with memory optimization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    low_cpu_mem_usage=True
)

if not torch.cuda.is_available():
    model = model.to(device)

model.eval()

# Load sentence transformer for embeddings (optional)
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("✅ Model loaded successfully!")
print(f"Model size: ~{sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")

🔄 Loading model and tokenizer...
Using device: cpu


ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 5

✅ Model loaded successfully!
Model size: ~0.13B parameters


In [None]:
def generate_text(prompt, max_length=250, num_return_sequences=1, temperature=0.8):
    """Generate text using the loaded OPT model"""
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=len(input_ids[0]) + max_length,
                do_sample=True,
                temperature=temperature,
                top_k=50,
                top_p=0.95,
                num_return_sequences=num_return_sequences,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode generated text
        generated_texts = []
        for output in outputs:
            generated = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)
            generated_texts.append(generated.strip())

        return generated_texts

    except Exception as e:
        return [f"Error: {str(e)}"]

# Test the model
test_prompt = "Once upon a time"
test_results = generate_text(test_prompt, max_length=50, num_return_sequences=2)

print("🧪 Test Generation:")
print(f"Prompt: {test_prompt}")
for i, result in enumerate(test_results):
    print(f"Generation {i+1}: {result}")

ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 5

🧪 Test Generation:
Prompt: Once upon a time
Generation 1: , there was a little girl named Lily. She loved to bake cookies with her mom. They would mix flour, sugar, and eggs together in a big bowl. They would mix it all up and add some yummy stuff to make the dough.
Generation 2: , there was a little girl named Lily. She had a big, fluffy dog named Max. Max was very adorable and Lily loved to pet him. One day, Lily and Max went for a walk in the park. They saw a squirrel and started


In [None]:
from flask import Flask, request, jsonify
import json
import threading
import time

app = Flask(__name__)

# Manual CORS setup
@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
    response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS')
    return response

@app.route('/', methods=['GET'])
def home():
    return jsonify({
        "message": "OPT Model API is running!",
        "endpoints": {
            "/generate": "POST - Generate text",
            "/health": "GET - Health check"
        }
    })

@app.route('/health', methods=['GET'])
def health():
    return jsonify({
        "status": "healthy",
        "model_loaded": True,
        "device": str(device)
    })

@app.route('/generate', methods=['POST'])
def generate():
    try:
        # Get request data
        data = request.get_json()

        if not data or 'prompt' not in data:
            return jsonify({"error": "Missing 'prompt' in request"}), 400

        prompt = data['prompt']
        max_length = data.get('max_length', 100)
        num_return_sequences = data.get('num_return_sequences', 1)
        temperature = data.get('temperature', 0.8)

        # Validate parameters
        max_length = min(max_length, 200)  # Limit max length
        num_return_sequences = min(num_return_sequences, 5)  # Limit number of sequences
        temperature = max(0.1, min(temperature, 2.0))  # Clamp temperature

        # Generate text
        generated_texts = generate_text(
            prompt=prompt,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature
        )

        return jsonify({
            "prompt": prompt,
            "generated_texts": generated_texts,
            "parameters": {
                "max_length": max_length,
                "num_return_sequences": num_return_sequences,
                "temperature": temperature
            }
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/batch_generate', methods=['POST'])
def batch_generate():
    """Handle multiple prompts at once"""
    try:
        data = request.get_json()

        if not data or 'prompts' not in data:
            return jsonify({"error": "Missing 'prompts' array in request"}), 400

        prompts = data['prompts']
        max_length = data.get('max_length', 100)
        temperature = data.get('temperature', 0.8)

        if len(prompts) > 10:  # Limit batch size
            return jsonify({"error": "Maximum 10 prompts per batch"}), 400

        results = []
        for prompt in prompts:
            generated = generate_text(
                prompt=prompt,
                max_length=max_length,
                num_return_sequences=1,
                temperature=temperature
            )
            results.append({
                "prompt": prompt,
                "generated_text": generated[0] if generated else ""
            })

        return jsonify({
            "results": results,
            "parameters": {
                "max_length": max_length,
                "temperature": temperature
            }
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

print("✅ Flask API server created!")

✅ Flask API server created!


In [None]:
# --- Install dependencies (uncomment if running in Colab)
# !pip install flask flask_cors transformers cloudflared --quiet

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from flask import Flask, request, jsonify
from flask_cors import CORS
import threading, time, socket, subprocess, re, shutil

# === FIND FREE PORT ===
def find_free_port(start=8001, end=8100):
    for port in range(start, end):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('127.0.0.1', port)) != 0:
                return port
    raise RuntimeError("No free port found!")


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# === FLASK APP ===
app = Flask(__name__)
CORS(app)  # <-- Enables CORS for all routes

@app.route('/hello')
def hello():
    return "Hello from free port!"

def generate_text(prompt, max_length=250, num_return_sequences=1, temperature=0.8):
    """Generate text using the loaded OPT (or similar) model"""
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=len(input_ids[0]) + max_length,
                do_sample=True,
                temperature=temperature,
                top_k=50,
                top_p=0.95,
                num_return_sequences=num_return_sequences,
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode generated text (strip prompt tokens from output)
        generated_texts = []
        for output in outputs:
            generated = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)
            generated_texts.append(generated.strip())
        return generated_texts
    except Exception as e:
        return [f"Error: {str(e)}"]

@app.route('/generate', methods=['POST'])
def generate():
    data = request.get_json()
    prompt = data.get("prompt", "")
    max_length = int(data.get("max_length", 250))
    num_return_sequences = int(data.get("num_return_sequences", 1))
    temperature = float(data.get("temperature", 0.8))

    results = generate_text(
        prompt,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature
    )
    return jsonify({"generations": results})

# === RUN FLASK IN THREAD ===
free_port = find_free_port()
def run_flask():
    app.run(port=free_port, host="0.0.0.0")
threading.Thread(target=run_flask, daemon=True).start()
time.sleep(3)

# === INSTALL & RUN CLOUDFLARED ===
if shutil.which('cloudflared') is None:
    print('Installing cloudflared...')
    subprocess.run(['wget', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb'])
    subprocess.run(['dpkg', '-i', 'cloudflared-linux-amd64.deb'])

print(f"Starting Cloudflared tunnel to expose port {free_port}...")

cloudflared_proc = subprocess.Popen(
    ['cloudflared', 'tunnel', '--url', f'http://localhost:{free_port}'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True
)

public_url = None
for line in cloudflared_proc.stdout:
    print(line, end='')  # Show logs for debugging
    match = re.search(r'(https://[a-zA-Z0-9\-]+\.trycloudflare\.com)', line)
    if match:
        public_url = match.group(1)
        break

if public_url:
    print(f"\nYour public endpoint is: {public_url}/generate")
else:
    print("Failed to get public URL from cloudflared. Check logs.")


ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8006
 * Running on http://172.28.0.12:8006
INFO:werkzeug:[33mPress CTRL+C to quit[0m
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.
ERROR:root:Unexpec

Starting Cloudflared tunnel to expose port 8006...
2025-07-15T17:59:48Z INF Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
2025-07-15T17:59:48Z INF Requesting new quick Tunnel on trycloudflare.com...


ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy_repr.py", line 5

2025-07-15T17:59:53Z INF +--------------------------------------------------------------------------------------------+
2025-07-15T17:59:53Z INF |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
2025-07-15T17:59:53Z INF |  https://kirk-referred-pendant-builder.trycloudflare.com                                   |

Your public endpoint is: https://kirk-referred-pendant-builder.trycloudflare.com/generate


In [None]:

# --- Kill any old servers/tunnels for a clean Colab state
import os, time
os.system("pkill -f flask")
os.system("pkill -f cloudflared")
time.sleep(2)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from flask import Flask, request, jsonify
import threading, socket, subprocess, re, shutil, time

# --- You MUST define your model path in advance (no reassign here)
# For example:
# MODEL_PATH = "./"  # Or wherever your model files are

# --- Find free port
def find_free_port(start=8001, end=8100):
    for port in range(start, end):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('127.0.0.1', port)) != 0:
                return port
    raise RuntimeError("No free port found!")

# --- Load your model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# --- Flask app - Using MANUAL CORS headers (no flask_cors)
app = Flask(__name__)

# NOT using flask_cors to avoid duplicate headers

def generate_text(prompt, max_length=250, num_return_sequences=1, temperature=0.8):
    try:
        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=len(input_ids[0]) + max_length,
                do_sample=True,
                temperature=temperature,
                top_k=50,
                top_p=0.95,
                num_return_sequences=num_return_sequences,
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        generated_texts = []
        for output in outputs:
            generated = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)
            generated_texts.append(generated.strip())
        return generated_texts
    except Exception as e:
        return [f"Error: {str(e)}"]

@app.route('/generate', methods=['POST', 'OPTIONS'])
def generate():
    # Handle preflight OPTIONS request
    if request.method == 'OPTIONS':
        response = jsonify({})
        response.headers.add('Access-Control-Allow-Origin', '*')
        response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
        response.headers.add('Access-Control-Allow-Methods', 'POST,OPTIONS')
        return response

    # Handle actual POST request
    try:
        data = request.get_json()
        if not data:
            return jsonify({"error": "No JSON data provided"}), 400

        prompt = data.get("prompt", "")
        if not prompt:
            return jsonify({"error": "No prompt provided"}), 400

        max_length = int(data.get("max_length", 250))
        num_return_sequences = int(data.get("num_return_sequences", 1))
        temperature = float(data.get("temperature", 0.8))

        results = generate_text(
            prompt,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature
        )

        response = jsonify({"generations": results})
        response.headers.add('Access-Control-Allow-Origin', '*')
        return response

    except Exception as e:
        error_response = jsonify({"error": str(e)})
        error_response.headers.add('Access-Control-Allow-Origin', '*')
        return error_response, 500

# Add a health check endpoint
@app.route('/health', methods=['GET'])
def health():
    response = jsonify({"status": "healthy", "device": device})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response

# --- Run Flask in a thread
free_port = find_free_port()
def run_flask():
    app.run(port=free_port, host="0.0.0.0", debug=False)

threading.Thread(target=run_flask, daemon=True).start()
time.sleep(3)

# --- Install and run cloudflared
if shutil.which('cloudflared') is None:
    print('Installing cloudflared...')
    subprocess.run(['wget', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb'])
    subprocess.run(['dpkg', '-i', 'cloudflared-linux-amd64.deb'])

print(f"Starting Cloudflared tunnel to expose port {free_port}...")

cloudflared_proc = subprocess.Popen(
    ['cloudflared', 'tunnel', '--url', f'http://localhost:{free_port}'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True
)

public_url = None
for line in cloudflared_proc.stdout:
    print(line, end='')  # Show logs for debugging
    match = re.search(r'(https://[a-zA-Z0-9\-]+\.trycloudflare\.com)', line)
    if match:
        public_url = match.group(1)
        break

if public_url:
    print(f"\nYour public endpoint is: {public_url}/generate")
    print(f"Health check endpoint: {public_url}/health")
    print(f"\nExample usage:")
    print(f"curl -X POST {public_url}/generate \\")
    print(f"  -H 'Content-Type: application/json' \\")
    print(f"  -d '{{\"prompt\": \"Hello world\", \"max_length\": 100}}'")
else:
    print("Failed to get public URL from cloudflared. Check logs.")

# Keep the script running
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\nShutting down...")
    cloudflared_proc.terminate()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8012
 * Running on http://172.28.0.12:8012
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Starting Cloudflared tunnel to expose port 8012...
2025-07-15T18:12:53Z INF Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
2025-07-15T18:12:53Z INF Requesting new quick Tunnel on trycloudflare.com...
2025-07-15T18:12:57Z INF +--------------------------------------------------------------------------------------------+
2025-07-15T18:12:57Z INF |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
2025-07-15T18:12:57Z INF |  ht

INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:13:21] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:14:45] "OPTIONS /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:14:52] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:15:28] "OPTIONS /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:15:40] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:15:55] "OPTIONS /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:16:08] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2025 18:44:18] "[31m[1mGET /generate HTTP/1.1[0m" 405 -
