# Dependency install

In [1]:
!pip install datasets
!pip install -q -U google-generativeai



# Imports

In [2]:
import re
import os
import time
import nltk
import wandb
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from torch.utils import data
from collections import Counter
from google.api_core import retry
import google.generativeai as gemini_ai
from transformers import GPT2TokenizerFast
from kaggle_secrets import UserSecretsClient
from google.generativeai.types import RequestOptions
from datasets import load_dataset, Dataset, DatasetDict
from torch.nn import TransformerDecoder, TransformerEncoder
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers


# Setup

## API Keys

In [3]:
user_secrets = UserSecretsClient()

### Wandb

In [4]:
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")

In [5]:
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Gemini

In [6]:
gemini_api_key = user_secrets.get_secret("GEMINI_API_KEY")

In [7]:
gemini_ai.configure(api_key=gemini_api_key)

## Running on GPU (if available)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Config

## Config for Hyperparameters

In [9]:
config = {
    "BLOCK_SIZE": 128,
    "EMB_SIZE": 80,
    "N_ATTENTION_HEADS": 8,
    "N_DECODER_BLOCKS": 4,
    "DIM_FEEDFORWRD": 2048,
    "VOCAB_SIZE": 10000,
    "MAX_OUT_TOKENS": 200,
    "EVAL_ITER": 100,
    "LR": 3e-4,
    "BATCH_SIZE": 32,
    "EPOCHS": 5,
    "PATIENCE": 3,
    "MODEL_NAME": "custom-3M",
    "WORKING_DIR": "/kaggle/working",
    "VOCAB_DIRNAME": "/kaggle/input/vocab-dict-v2/vocab_dict_v2",
    "DF_PATH": "/kaggle/input/compare-dataframes/rating_df_baseline-3M.pkl",
    "LOAD_MODELPATH": "/kaggle/input/custom-3M/pytorch/default/1/custom-3M.pt",
    "DEVICE": 'cuda' if torch.cuda.is_available() else 'cpu'
}
assert config['EMB_SIZE'] % config['N_ATTENTION_HEADS'] == 0

In [10]:
wandb.init(
    project='custom-3M-Pruning',
    config=config
)
text_table = wandb.Table(columns=['epoch', 'loss', 'predicted text'])

[34m[1mwandb[0m: Currently logged in as: [33msiddharthdhara17[0m ([33mdhara[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112848155555513, max=1.0…

## Config for training

In [11]:
load_model = False

In [12]:
load_df = False

# Dataset

## Download the dataset

In [13]:
dataset = load_dataset("roneneldan/TinyStories")

README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

## Build the Vocabulary

In [14]:
# def preprocess_text(text):
#     # Split text into sentences
#     sentences = nltk.sent_tokenize(text)

#     punctuation_pattern = r"[^\w\s]"

#     # Add <SOS> and <EOS> tokens to each sentence and remove punctuation
#     processed_sentences = [f"{re.sub(punctuation_pattern, '', sentence).strip()}" for sentence in sentences]
    
#     # Join sentences back into a single string
#     return " ".join(processed_sentences)

# # Update the vocabulary building function to include this preprocessing
# def build_vocabulary(dataset_dict, vocab_size=50, num_samples=None):
#     # Initialize a counter for word frequencies
#     word_counter = Counter()

#     # Use tqdm to add a progress bar for the iteration
#     data = dataset_dict['train']['text']
    
#     # Randomly sample num_samples if specified
#     if num_samples:
#         data = random.sample(data, num_samples)

#     # Tokenize and clean text using the tokenizer and update word frequencies
#     for text in tqdm(data, desc="Building vocabulary", unit="text"):
#         processed_text = preprocess_text(text.lower())
#         tokens = [token.replace('Ġ', '') for token in tokenizer.tokenize(processed_text)]
#         word_counter.update(tokens)

#     # Get the most common tokens and create a vocabulary dictionary
#     vocab_dict = {word: idx for idx, (word, _) in enumerate(word_counter.most_common(vocab_size))}
    
#     # Convert the vocabulary dictionary to a DatasetDict
#     vocab_dataset = DatasetDict({
#         'train': Dataset.from_dict({'word': list(vocab_dict.keys()), 'index': list(vocab_dict.values())}),
#         'validation': dataset_dict['validation']
#     })
#     return vocab_dataset

# # Example usage
# vocab_dataset = build_vocabulary(dataset, vocab_size=9996)
# print(vocab_dataset['train'][:10])  # Print the first 10 tokens from the vocabulary dataset

## Saving the Vocabulary

In [15]:
# vocab_dataset.save_to_disk('/kaggle/working/vocab_dict_v2')

## Loading the Vocabulary

In [16]:
loaded_vocab_dataset = DatasetDict.load_from_disk(config['VOCAB_DIRNAME'])

In [17]:
custom_vocab = loaded_vocab_dataset['train']['word']
new_vocab_size = len(custom_vocab)
print(new_vocab_size)

9996


In [18]:
custom_vocab_dict = {word: idx for idx, word in enumerate(custom_vocab)}
if "[UNK]" not in custom_vocab_dict:
    print("Adding [UNK] token to the vocabulary.")
    custom_vocab_dict["[UNK]"] = len(custom_vocab_dict)

Adding [UNK] token to the vocabulary.


## Split the dataset

In [19]:
used_dataset_size = 100000

In [20]:
sampled_dataset = dataset['train'].train_test_split(train_size=0.8, test_size=0.2)
train_dataset, val_dataset = sampled_dataset['train'].select(range(int(0.8 * used_dataset_size))), sampled_dataset['test'].select(range(int(0.2 * used_dataset_size)))

# Model and tokenizer

## Model

In [21]:
# class Transformer21MFinalSingleLayer(nn.Module):
#     def __init__(self, vocab_size=50258, d_model=192, nhead=4, num_encoder_layers=1, num_decoder_layers=1, dim_feedforward=768, max_len=1000, device="cpu"):
#         super(Transformer21MFinalSingleLayer, self).__init__()

#         self.device = device
        
#         # Embedding layer
#         self.embedding = nn.Embedding(vocab_size, d_model).to(self.device)
        
#         # Positional encoding
#         self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, d_model)).to(self.device)  # Assuming max length of 1000

#         # Transformer Encoder and Decoder layers
#         encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=0.1)
#         decoder_layer = TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=0.1)
        
#         self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers).to(self.device)
#         self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers).to(self.device)
        
#         # Output linear layer
#         self.fc_out = nn.Linear(d_model, vocab_size).to(self.device)
        
#         self.logits = None  # Store logits as an attribute

#     def forward(self, src, past_key_values=None):
#         # Move src to the correct device
#         src = src.to(self.device)
        
#         # Create tgt as src shifted by one position
#         tgt = src[:, 1:].to(self.device)
#         src = src[:, :-1]

#         # Embedding and positional encoding
#         src = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
#         tgt = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]

#         # Pass through the encoder
#         memory = self.transformer_encoder(src)

#         # Pass through the decoder with past_key_values if provided
#         output = self.transformer_decoder(tgt, memory, memory_key_padding_mask=None, 
#                                    tgt_key_padding_mask=None, 
#                                    memory_mask=None, 
#                                    tgt_mask=None)

#         # Output layer to vocab logits
#         self.logits = self.fc_out(output)

#         # Ensure logits are on the correct device
#         self.logits = self.logits.to(self.device)

#         return CausalLMOutputWithPast(
#             loss=None, 
#             logits=self.logits, 
#         )
    

In [22]:
# Define GPT-2 Architecture

class GPT2FromScratch(nn.Module):
    def __init__(self, config):
        super(GPT2FromScratch, self).__init__()
        self.embeddings = nn.Embedding(config["VOCAB_SIZE"], config["EMB_SIZE"])
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config["EMB_SIZE"],
                nhead=config["N_ATTENTION_HEADS"],
                dim_feedforward=config["DIM_FEEDFORWRD"],
                activation='gelu'
            )
            for _ in range(config["N_DECODER_BLOCKS"])
        ])
        self.final_norm = nn.LayerNorm(config["EMB_SIZE"])
        self.head = nn.Linear(config["EMB_SIZE"], config["VOCAB_SIZE"])

    def forward(self, x):
        x = self.embeddings(x)
        for block in self.blocks:
            x = block(x)
        x = self.final_norm(x)
        return self.head(x)

In [35]:
# Pruning Implementation

from torch.nn.utils import prune

def prune_layer(layer, amount):
    """
    Prune the weights of the linear layers.
    Args:
        layer: the layer to prune
        amount: percentage of weights to prune
    """
    # Applying pruning to the weight
    prune.l1_unstructured(layer, name="weight", amount=amount)
    
    # Ensure that the biases are not pruned (only remove 'weight' if needed)
    if hasattr(layer, 'weight'):
        prune.remove(layer, 'weight')  # Remove the pruning reparameterization from 'weight' if desired


In [36]:
class GPT2WithPruning(nn.Module):
    def __init__(self, config):
        super(GPT2WithPruning, self).__init__()
        self.model = GPT2FromScratch(config)
        # Prune the final linear layer (head)
        prune_layer(self.model.head, amount=0.2)  # Example: pruning 20% of the weights
        
        # Prune other layers if needed
        for block in self.model.blocks:
            prune_layer(block.self_attn.out_proj, amount=0.2)
            prune_layer(block.linear1, amount=0.2)
            prune_layer(block.linear2, amount=0.2)

    def forward(self, x):
        logits = self.model(x)
        return {'logits': logits}  # Return a dictionary with logits

    def generate(self, input_ids, max_length=50, **kwargs):
        output = input_ids
        for _ in range(max_length):
            logits = self.forward(output)['logits']
            next_token = torch.argmax(logits[:, -1], dim=-1).unsqueeze(-1)
            output = torch.cat((output, next_token), dim=1)
        return output

## Tokenizer

In [25]:
# Create a tokenizer from scratch with custom vocab
tokenizer = Tokenizer(models.WordLevel(vocab=custom_vocab_dict, unk_token="[UNK]"))
base_tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-1Layer-21M")
# Set up pre-tokenizer, normalizer, and decoder (as used in most tokenizers)
tokenizer.normalizer = normalizers.Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.WordPiece()

# Save the tokenizer to a file
tokenizer.save("custom_tokenizer.json")

# Load this tokenizer into a PreTrainedTokenizerFast
custom_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="custom_tokenizer.json",
    model_max_length=base_tokenizer.model_max_length
)

# Add special tokens if needed
custom_tokenizer.add_special_tokens({'additional_special_tokens': ['<sos>', '<eos>']})
custom_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Save the custom tokenizer
custom_tokenizer.save_pretrained("custom_tokenizer")

# Reload and print the vocabulary size to confirm
custom_tokenizer = AutoTokenizer.from_pretrained("custom_tokenizer")
print(f"Custom tokenizer vocabulary size: {custom_tokenizer.vocab_size}")

tokenizer_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Custom tokenizer vocabulary size: 9997




## Tokenize the dataset

In [26]:
# Tokenization function for HuggingFace dataset
def tokenize_function(examples):
    return custom_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=config['BLOCK_SIZE'])

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

## Loading the dataset

In [27]:
# Convert tokenized dataset to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids'])
val_dataset.set_format(type='torch', columns=['input_ids'])

train_loader = data.DataLoader(train_dataset, batch_size=config['BATCH_SIZE'], shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=config['BATCH_SIZE'], shuffle=False)

In [28]:
print(len(custom_tokenizer))

10000


# Training the model

## Initializing the model

In [29]:
# model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1Layer-21M")

In [30]:
# model = Transformer21MFinalSingleLayer(device=config['DEVICE'])

In [37]:
model = GPT2WithPruning(config)

In [38]:
model = model.to(config['DEVICE'])

In [39]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params}")

Total Parameters: 3034352


## Optimizer and Loss Function

In [40]:
optimizer = torch.optim.Adam(model.parameters(), lr=config['LR'])
loss_fn = torch.nn.CrossEntropyLoss()

## Evaluation function for validation set

In [41]:
@torch.no_grad()
def eval_model(training_model: torch.nn.Module, val_loader: torch.utils.data.DataLoader):
    training_model.eval()
    losses = torch.zeros(config['EVAL_ITER'])
    for k in range(config['EVAL_ITER']):
        batch = next(iter(val_loader))  # Get the batch as a single value
        s_val = batch['input_ids'].to(config['DEVICE'])  # Access 'input_ids' from the batch
        t_val = s_val[:, 1:].clone()  # Shift for language model prediction
        s_val = s_val[:, :-1]  # Remove last token from source
        
        # Forward pass through the model
        val_output = training_model(s_val)
        val_logits = val_output['logits']  # Access logits from the model's output
        
        # Reshape logits and targets
        val_logits = val_logits.view(s_val.size(0) * s_val.size(1), config['VOCAB_SIZE'])
        t_val = t_val.view(s_val.size(0) * s_val.size(1))
        
        # Compute the loss
        losses[k] = torch.nn.functional.cross_entropy(val_logits, t_val).item()
    
    training_model.train()
    return losses.mean()

## Training function 

In [42]:
def train_model(model, train_loader, val_loader, optimizer, config, loss_fn):
    """
    Trains the model and logs the training and validation losses, with progress tracking using tqdm.
    """

    best_val_loss = float('inf')
    patience_counter = 0
    
    try:
        for epoch in range(config['EPOCHS']):
            model.train()
            epoch_loss = 0

            epoch_progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{config['EPOCHS']}: ", leave=False)
            
            for b_idx, batch in enumerate(epoch_progress):
                sources = batch['input_ids'].to(config['DEVICE'])
                targets = sources[:, 1:].clone().to(config['DEVICE'])  # Shift for language model prediction
                sources = sources[:, :-1]  # Remove last token from source
                logits = model(sources)['logits']  # Access logits from the model output

                # Get the actual batch size and sequence length
                batch_size = sources.size(0)
                seq_length = sources.size(1)
                
                # Reshape logits and targets
                logits = logits.view(batch_size * seq_length, config['VOCAB_SIZE'])
                targets = targets.view(batch_size * seq_length)
                
                loss = loss_fn(logits, targets)
                wandb.log({"loss": loss.item()})
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                avg_loss = epoch_loss / (b_idx + 1)
                epoch_progress.set_postfix(training_loss=avg_loss)

            avg_epoch_loss = epoch_loss / len(train_loader)
            print(f"Epoch {epoch+1}/{config['EPOCHS']} completed with average training loss: {avg_epoch_loss}")
            
            val_loss = eval_model(model, val_loader)
            print(f"Validation loss after {epoch+1} epochs: {val_loss}")
            wandb.log({"val_loss": val_loss})

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                print(f"New best validation loss: {val_loss}.")
            else:
                patience_counter += 1
                print(f"No improvement in validation loss. Patience counter: {patience_counter}")
                
            if patience_counter >= config['PATIENCE']:
                print("Early stopping triggered.")
                break
            
    except KeyboardInterrupt:
        print("Training interrupted.")
    print("Training completed.")


## Running the training loop

In [43]:
if not load_model:
    train_model(model, train_loader, val_loader, optimizer, config, loss_fn)

                                                                                           

Epoch 1/5 completed with average training loss: 3.9226291582107544
Validation loss after 1 epochs: 3.618668794631958
New best validation loss: 3.618668794631958.


                                                                                           

Epoch 2/5 completed with average training loss: 3.531200269126892
Validation loss after 2 epochs: 3.5424044132232666
New best validation loss: 3.5424044132232666.


                                                                                           

Epoch 3/5 completed with average training loss: 3.483588570213318
Validation loss after 3 epochs: 3.5197248458862305
New best validation loss: 3.5197248458862305.


                                                                                           

Epoch 4/5 completed with average training loss: 3.4600314918518067
Validation loss after 4 epochs: 3.4948320388793945
New best validation loss: 3.4948320388793945.


                                                                                           

Epoch 5/5 completed with average training loss: 3.444957460308075
Validation loss after 5 epochs: 3.4903998374938965
New best validation loss: 3.4903998374938965.
Training completed.


## Saving the model

In [44]:
model_req_path = config['WORKING_DIR']+'/model'

In [45]:
if not load_model:
    if not os.path.exists(model_req_path):
        os.mkdir(model_req_path)
        
    torch.save({'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
                }, model_req_path+'/'+config['MODEL_NAME']+'.pt')

    print("Model saved!")

Model saved!


## Loading the model

In [46]:
if load_model and os.path.exists(model_req_path):
    checkpoint = torch.load(model_req_path+'/'+config['MODEL_NAME']+'.pt', weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print("Loaded the model!")
elif not os.path.exists(model_req_path):
    print("Model directory not found! Please check the path.")

# Testing the model

## Functions for generating and scoring the outputs

In [47]:
def prepare_input(text, tokenizer, device, block_size=128):
    # Tokenize and encode the input text
    inputs = tokenizer(
        text, return_tensors="pt", padding="max_length",
        truncation=True, max_length=block_size
    )
    # Move input tensors to the appropriate device
    return {key: val.to(device) for key, val in inputs.items()}

In [48]:
def generate_text(model, tokenizer, input_text, config):
    # Prepare the input
    inputs = prepare_input(input_text, tokenizer, config["DEVICE"], config["BLOCK_SIZE"])
    
    # Generate text
    output_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens=config['MAX_OUT_TOKENS'],  # max tokens to generate
        pad_token_id=tokenizer.pad_token_id,  # Ensure proper padding
        eos_token_id=tokenizer.eos_token_id,  # End generation on EOS
        do_sample=True,  # Enable sampling for variety in output
        temperature=0.7,  # Adjust temperature for randomness in sampling
        top_k=50  # Limit to top-k tokens to avoid unlikely predictions
    )
    
    # Decode the generated IDs to text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

In [49]:
def evaluate_text_gemini(generated_text):
    # Use the generative model directly for evaluation
    model2 = gemini_ai.GenerativeModel("gemini-1.5-flash")
    
    # Generate content and get the response
    response = model2.generate_content(generated_text, request_options=RequestOptions(retry=retry.Retry(initial=10, multiplier=2, maximum=60, timeout=300)))
    
    # Check if response contains valid content
    if response.candidates:
        eval_response = response.candidates[0].content.parts[0].text  # Get the text of the first candidate
        return eval_response
    else:
        print("No valid candidates in the response. Check the generated text or API settings.")
        return None

## Lists of input prompts (generated by ChatGPT)

In [50]:
input_texts_list = [
    "In a bustling city filled with secrets, a shadow loomed.",
    "High in the mountains, a lone traveler braved the storm.",
    "Beneath the waves, in a hidden underwater kingdom, life thrived.",
    "It was a quiet night until the distant howls broke the silence.",
    "In a world where dragons flew free, danger was never far.",
    "On a small farm, a young girl discovered a mysterious egg.",
    "In a town where everyone whispered, the new stranger caused a stir.",
    "Under the light of the full moon, something magical began to stir.",
    "In a time of peace, a hidden evil began to rise.",
    "Amidst the golden sands of the desert, a lost caravan wandered.",
    "In the heart of the enchanted forest, a hidden village thrived.",
    "Aboard a ship sailing unknown seas, the crew faced a peculiar sight.",
    "In a world where animals spoke, a young boy sought adventure.",
    "Deep within the icy tundra, an ancient secret lay buried.",
    "On a stormy night, a stranger knocked at the castle door.",
    "In a village plagued by mysteries, a young detective took charge.",
    "Across the galaxy, explorers marveled at a new world.",
    "Underneath the quiet streets, a hidden society had formed.",
    "Long ago, a powerful wizard disappeared without a trace.",
    "At the edge of the world, a brave crew faced the unknown.",
    "In a city that never slept, two souls crossed paths unexpectedly.",
    "In a school for magical creatures, new students arrived.",
    "Hidden in the clouds, a floating kingdom kept its secrets.",
    "Far in the distant future, humanity encountered its first alien.",
    "Under a blanket of stars, two friends made a promise.",
    "In a library of forgotten books, a mysterious journal appeared.",
    "Aboard a train that never stopped, secrets unraveled slowly.",
    "Amid a sea of stars, a lone spaceship drifted in silence.",
    "In a kingdom of snow and ice, a prophecy was foretold.",
    "Deep in the jungle, explorers discovered a glowing stone.",
    "In a village under a curse, a hero was born.",
    "At the dawn of time, the first humans encountered magic.",
    "On a distant moon, an outpost signaled for help.",
    "In a quiet town, every night brought new mysteries.",
    "Beneath the great pyramids, an ancient secret was uncovered.",
    "In a world without color, a single red flower bloomed.",
    "On a ship lost at sea, whispers of an island spread.",
    "In the middle of nowhere, a door to another world appeared.",
    "In a castle of mirrors, reflections began to act strangely.",
    "On the edge of a cliff, a young prince made a fateful decision.",
    "In a forest where time stood still, a visitor arrived.",
    "In a kingdom ruled by animals, a lion declared his rule.",
    "In a world where wishes came true, a girl wished for more time.",
    "In a school where shadows came alive, mysteries abounded.",
    "In a library that seemed endless, a strange book was found.",
    "On a small island, villagers began to notice odd happenings.",
    "In the great desert, a treasure was hidden for centuries.",
    "In a city beneath the earth, a new ruler emerged.",
    "In a world divided by seasons, an eternal summer began.",
    "At the top of a mountain, a temple held the key to truth.",
    "In a town where time rewound each day, mysteries deepened.",
    "On a snowy peak, two climbers discovered an ancient statue.",
    "In a mansion where paintings moved, a mystery unraveled.",
    "At the crossroads of realms, two adventurers met.",
    "In a school hidden in the woods, every student had a secret.",
    "In a town of endless rain, hope was a rare sight.",
    "On a train bound for nowhere, strange passengers arrived.",
    "In a forest where trees whispered, a path emerged.",
    "In a world where stars guided destiny, a comet foretold change.",
    "In a lonely tower, a forgotten sorceress waited.",
    "At the edge of a lake, the reflection showed another world.",
    "In a world beneath the clouds, legends of the sky spread.",
    "In a kingdom of night, a lone warrior sought dawn.",
    "In a garden of eternal flowers, time stood still.",
    "On an island that disappeared each night, a story began.",
    "In a city that glittered like gold, shadows lurked.",
    "In a land where dreams came alive, a nightmare was born.",
    "On the longest night, a hero's journey began.",
    "In a kingdom lost to time, an old legend resurfaced.",
    "In a forest cloaked in fog, paths led nowhere.",
    "In a small shop, a mysterious item granted wishes.",
    "On the day the sun didn't rise, fear spread.",
    "In a town with no maps, wanderers were welcome.",
    "Under a sky of falling stars, two souls met.",
    "In a house with endless rooms, a mystery unraveled.",
    "In a town where no one aged, secrets were kept.",
    "In the middle of the ocean, a floating castle appeared.",
    "At the heart of the desert, a lone tree bloomed.",
    "In a world ruled by music, silence was feared.",
    "On the night of the festival, a strange guest arrived.",
    "In a land where darkness ruled, a light began to shine.",
    "In a city where everyone wore masks, truths hid.",
    "In a world of whispers, silence was a power.",
    "On the eve of battle, a hero was forged.",
    "In a castle of glass, a kingdom looked on.",
    "In the kingdom of echoes, a voice was heard.",
    "In a forest where dreams came true, nightmares hid.",
    "In a town with endless winters, a new day dawned.",
    "In a realm where seasons changed daily, stories grew.",
    "Under the gaze of ancient gods, mortals lived.",
    "In a world frozen in time, a clock began to tick.",
    "In a library of the lost, an old tale was read.",
    "In a meadow under starlight, two friends found magic.",
    "In a city where clocks ran backward, futures changed.",
    "In a kingdom ruled by children, a new game began.",
    "In a land where the moon never rose, stars told tales.",
    "In the far north, where the aurora danced, legends lived.",
    "At the edge of eternity, two lovers met.",
    "In a village of music, silence brought fear.",
    "In a world where memories could be traded, one boy remembered."
]

## Prompts for scoring the output by Gemini

In [51]:
step_1_static = (
    "The following exercise, the student is given the beginning of a story. The student needs to complete it into a full story. "
    "The exercise tests the student’s language abilities and creativity. The symbol *** marks the separator between the "
    "prescribed beginning and the student’s completion: "
)

step_2 = (
    "Please provide your general assessment about the part written by the student (the one after the *** symbol). "
    "Only give the ratings without description and overall could be omitted"
    "Do not give explainations for the ratings"
    "Give them in one single line, separated by semi-colon"
    "Is it grammatically correct? Is it consistent with the beginning of the story? Pay special attention to whether the "
    "student manages to complete the sentence which is split in the middle by the separator ***."
)

step_3 = (
    "Now, grade the student’s completion in terms of grammar, creativity, consistency with the story’s beginning and "
    "whether the plot makes sense. Moreover, please provide your best guess of what the age of the student might be, "
    "as reflected from the completion. Choose from possible age groups: A: 3 or under. B: 4-5. C: 6-7. D: 8-9. E: 10-12. F: 13-16. "
    "e.g. Grammar: 8/10, Creativity: 7/10, Consistency: 7/10, Age group: E (10-12)"
)

## Getting the scores

In [52]:
pattern = r"Grammar: (\d+)/10; Consistency: (\d+)/10; Creativity: (\d+)/10; Age group: (.+)"
score_list = []
count = 0

In [53]:
if not load_df:
    for input_text in input_texts_list:
        output_text = generate_text(model, custom_tokenizer, input_text, config)
        dynamic_part = f"{input_text} Story begins here:***  {''.join(output_text)}. *** The story ends here"
        final_prompt = f"{step_1_static}{dynamic_part}\n{step_2}\n{step_3}"
        gemini_generated_response = evaluate_text_gemini(final_prompt)
        gemini_generated_response = gemini_generated_response.strip()
        count += 1
    
        # print(f"{input_text}; {gemini_generated_response}")
    
        match = re.search(pattern, gemini_generated_response)
        if match:
            grammar, consistency, creativity, age_group = match.groups()
            score_list.append([input_text, int(grammar), int(consistency), int(creativity), age_group])
        else:
            score_list.append([input_text, 0, 0, 0, "DNF"])
        print(f"Number of prompts appended: {count}")

Number of prompts appended: 1
Number of prompts appended: 2
Number of prompts appended: 3
Number of prompts appended: 4
Number of prompts appended: 5
Number of prompts appended: 6
Number of prompts appended: 7
Number of prompts appended: 8
Number of prompts appended: 9
Number of prompts appended: 10
Number of prompts appended: 11
Number of prompts appended: 12
Number of prompts appended: 13
Number of prompts appended: 14
Number of prompts appended: 15
Number of prompts appended: 16
Number of prompts appended: 17
Number of prompts appended: 18
Number of prompts appended: 19
Number of prompts appended: 20
Number of prompts appended: 21
Number of prompts appended: 22
Number of prompts appended: 23
Number of prompts appended: 24
Number of prompts appended: 25
Number of prompts appended: 26
Number of prompts appended: 27
Number of prompts appended: 28
Number of prompts appended: 29
Number of prompts appended: 30
Number of prompts appended: 31
Number of prompts appended: 32
Number of prompts

## Putting them in Pandas dataframe

In [54]:
df = pd.DataFrame(score_list, columns=["Input Prompt", "Grammar", "Creativity", "Consistency", "Age Group"])

In [55]:
df

Unnamed: 0,Input Prompt,Grammar,Creativity,Consistency,Age Group
0,"In a bustling city filled with secrets, a shad...",0,0,0,A (3 or under)
1,"High in the mountains, a lone traveler braved ...",0,0,0,A (3 or under)
2,"Beneath the waves, in a hidden underwater king...",3,2,2,C
3,It was a quiet night until the distant howls b...,5,3,4,D (8-9)
4,"In a world where dragons flew free, danger was...",0,0,0,DNF
...,...,...,...,...,...
95,"In a land where the moon never rose, stars tol...",0,0,0,A (3 or under)
96,"In the far north, where the aurora danced, leg...",0,0,0,A (3 or under)
97,"At the edge of eternity, two lovers met.",0,0,0,A (3 or under)
98,"In a village of music, silence brought fear.",0,0,0,A (3 or under)


## Saving the Pandas Dataframe

In [56]:
result_req_path = config['WORKING_DIR']+'/result'

In [57]:
if not load_df:
    if not os.path.exists(result_req_path):
        os.mkdir(result_req_path)
        
    df.to_pickle(result_req_path+'/'+'rating_df_'+config['MODEL_NAME']+'.pkl')

    print("Dataframe saved!")

Dataframe saved!


## Loading the Pandas Dataframe

In [58]:
if load_df and os.path.exists(config['DF_PATH']):
    df = pd.read_pickle(config['DF_PATH'])
    print("Loaded dataframe!")
elif not os.path.exists(config['DF_PATH']):
    print("Result directory not found! Please check the path.")

Result directory not found! Please check the path.


In [59]:
df

Unnamed: 0,Input Prompt,Grammar,Creativity,Consistency,Age Group
0,"In a bustling city filled with secrets, a shad...",0,0,0,A (3 or under)
1,"High in the mountains, a lone traveler braved ...",0,0,0,A (3 or under)
2,"Beneath the waves, in a hidden underwater king...",3,2,2,C
3,It was a quiet night until the distant howls b...,5,3,4,D (8-9)
4,"In a world where dragons flew free, danger was...",0,0,0,DNF
...,...,...,...,...,...
95,"In a land where the moon never rose, stars tol...",0,0,0,A (3 or under)
96,"In the far north, where the aurora danced, leg...",0,0,0,A (3 or under)
97,"At the edge of eternity, two lovers met.",0,0,0,A (3 or under)
98,"In a village of music, silence brought fear.",0,0,0,A (3 or under)
