In [1]:
import os
import random
from argparse import ArgumentParser
from tqdm import tqdm
import json

import torch

from lima_dataset import load_lima_dataset
from utils import (
    read_yaml,
    get_model_config,
    get_tokenizer_config,
    get_split_config,
    get_dataset_config,
    get_generation_config,
)
from model import (
    load_model,
    load_tokenizer,
    generate,
    compute_metrics,
)

In [2]:
# Generation run settings (tokenizer, model, dataset split, sampling) all come
# from this single YAML file; the path is relative to the notebook's working dir.
config_path = "./configs/generate_config_llama.yaml"
config = read_yaml(config_path)

In [3]:
# Resolve the tokenizer identity and options from the config, then build it.
tokenizer_settings = get_tokenizer_config(config)
tokenizer_name, tokenizer_path, tokenizer_config = tokenizer_settings

tokenizer = load_tokenizer(
    tokenizer_name=tokenizer_name,
    tokenizer_path=tokenizer_path,
    tokenizer_config=tokenizer_config,
)

# Echo the resolved settings as the cell output so the run records them.
(tokenizer_name, tokenizer_path, tokenizer_config)

('llama2',
 'meta-llama/Llama-2-7b-hf',
 {'special_token_kwargs': {'pad_token': 'eos_token',
   'additional_tokens': ['EOT_TOKEN']}})

In [4]:
# get_split_config yields a description plus a triple of per-split configs.
# Only the third entry is used here; the first two are discarded
# (presumably train/validation — verify against utils.get_split_config).
dataset_desc, split_configs = get_split_config(config)
_, _, test_split_config = split_configs

# Break the chosen split config into the path, sub-split size and extra kwargs.
dataset_location = get_dataset_config(test_split_config)
test_dataset_path, test_sub_split_size, test_dataset_config = dataset_location

# Load the split under the literal name "test"; remaining options are forwarded
# unchanged as keyword arguments.
dataset = load_lima_dataset(
    test_dataset_path,
    "test",
    test_sub_split_size,
    **test_dataset_config,
)

In [5]:
# Resolve model identity, checkpoint paths and loading options from the config.
model_name, model_path, base_model_path, model_config = get_model_config(config)

# Pass tokenizer-derived values along with the model options: the pad token id
# and the tokenizer's total length (presumably used by load_model to size the
# embedding matrix — confirm in model.load_model).
model_config.update(
    pad_token_id=tokenizer.pad_token_id,
    tokenizer_length=len(tokenizer),
)

# Show the final options as the cell output.
model_config

{'force_download': False,
 'device_map': 'cuda:0',
 'bnb_config': {'load_in_4bit': True,
  'bnb_4bit_quant_type': 'nf4',
  'bnb_4bit_compute_dtype': 'float16',
  'bnb_4bit_use_double_quant': False},
 'pad_token_id': 2,
 'tokenizer_length': 32001}

In [6]:
# Build the model from the resolved name, checkpoint paths and options.
# All arguments are passed by keyword, exactly as resolved in the previous cell.
model = load_model(
    model_string=model_name, model_path=model_path,
    base_model_path=base_model_path, model_config=model_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
# Display the loaded model's module tree as the cell output (sanity check of the
# architecture that was actually instantiated).
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

In [8]:
# Extract the generation settings from the loaded YAML config.
generation_config = get_generation_config(config)
# Echo the settings as the cell output so the run records them.
generation_config

{'max_length': 500,
 'top_p': 0.85,
 'temperature': 0.5,
 'num_beams': 1,
 'top_k': None,
 'do_sample': True,
 'repetition_penalty': 1.2}

In [9]:
# dataset[0]['conversations'][0]

In [10]:
from transformers import GenerationConfig

In [13]:
# Build a GenerationConfig object from the settings returned by
# get_generation_config (every key is forwarded as a keyword argument).
gen = GenerationConfig(**generation_config)
# Copy the tokenizer's pad token id onto the config object.
# NOTE(review): the generation cell passes the `generation_config` dict to
# generate(), not `gen`, so within the visible notebook `gen` appears unused —
# confirm whether generate() should receive this object instead.
gen.pad_token_id = tokenizer.pad_token_id

In [None]:
# Destination of the streamed JSON array, one object per dataset sample.
# NOTE(review): absolute, user-specific path — consider moving it into the YAML
# config or deriving it from a configurable output directory.
output_file = "/home/hmankodi/instruct_tuning/FineTune-Llama2-LIMA/generated_outs/2_lima_finetuned_generated_outputs.json"

# Generate a completion for the first conversation turn of every sample and
# stream the results to `output_file` as a JSON array. A failure on one sample
# is reported and skipped; the resulting file is valid JSON regardless of which
# samples fail.
try:
    # Create the destination directory up front so a missing folder does not
    # abort the run before any generation happens.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("[\n")  # Start of JSON array
        success = False  # True once at least one entry has been written

        with tqdm(total=len(dataset), desc="Generating Outputs", unit="sample") as pbar:
            for idx, sample in enumerate(dataset):
                try:
                    # Only the first conversation turn is used as the prompt.
                    prompt = sample["conversations"][0]
                    decoded_text = generate(
                        model,
                        tokenizer,
                        prompt_samples=prompt,
                        generation_config=generation_config,
                        use_encode=True,
                    )

                    output_entry = {
                        "index": idx,
                        "original_prompt": prompt,
                        "generated_output": decoded_text,
                    }

                    # Serialize to a string first: if the entry cannot be
                    # encoded, nothing has been written yet, so the file never
                    # contains a half-written object.
                    serialized_entry = json.dumps(
                        output_entry, indent=4, ensure_ascii=False
                    )

                    # Emit the separator *before* every entry except the first
                    # one written. Keying the comma on the dataset index leaves
                    # a trailing comma (invalid JSON) when the last sample(s)
                    # fail; keying it on "something was already written" does
                    # not.
                    separator = ",\n" if success else ""
                    f.write(separator + serialized_entry)

                    # Push the entry to disk now so a crash hours into the run
                    # does not lose the buffered results.
                    f.flush()

                    success = True

                except Exception as e:
                    # Best-effort: report the failing sample and keep going.
                    print(f"Error generating output for index {idx}: {e}")

                pbar.update(1)  # Advance once per sample, successful or not

        f.write("\n]")  # End of JSON array

    if not success:
        print(
            "No outputs were successfully generated. Please check your model and dataset."
        )
    else:
        print(f"Saved generated outputs dynamically to {output_file}")

except Exception as e:
    print(f"Fatal error: Unable to write to file {output_file}. Error: {e}")

Generating Outputs:   0%|          | 0/300 [00:00<?, ?sample/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Generating Outputs: 100%|██████████| 300/300 [2:22:12<00:00, 28.44s/sample]  

Saved generated outputs dynamically to /home/hmankodi/instruct_tuning/FineTune-Llama2-LIMA/generated_outs/lima_finetuned_generated_outputs.json



