# LoRA Model Merging and Testing
Script that merges trained LoRA weights into a base LLaMA model and saves the merged model, with optional verification of generation capabilities. Includes error handling and device detection.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os

def merge_lora_with_base(
    base_model_name="NousResearch/Llama-2-7b-hf",
    lora_path="/content/adapter",
    save_path="merged_model",
    device="auto",
    dtype=torch.float16,
):
    """Merge a LoRA adapter into its base model and save the result.

    Loads the base causal LM and its tokenizer, attaches the LoRA adapter
    found at ``lora_path``, folds the adapter weights into the base weights
    with ``merge_and_unload()``, and writes the merged model (safetensors
    format) together with the tokenizer to ``save_path``.

    Args:
        base_model_name: Hub id or local path of the base model.
        lora_path: Path of the trained LoRA adapter.
        save_path: Directory to write the merged model and tokenizer to;
            created if it does not exist.
        device: ``"cuda"``, ``"cpu"`` or ``"auto"``. ``"auto"`` picks
            ``"cuda"`` when ``torch.cuda.is_available()`` and ``"cpu"``
            otherwise. Only ``"cuda"`` enables ``device_map="auto"``.
        dtype: Torch dtype the base model weights are loaded in.

    Returns:
        A ``(model, tokenizer)`` tuple holding the merged model and the
        base model's tokenizer.
    """
    # Resolve the "auto" placeholder so the device_map decision below is made
    # against a concrete device name.
    if device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Computed once so the base model and the adapter are loaded consistently.
    device_map = "auto" if device == "cuda" else None

    print(f"Loading base model: {base_model_name}")

    # NOTE(review): trust_remote_code=True lets the model repository supply
    # its own Python code — only use with repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=dtype,
        device_map=device_map,
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True
    )

    print(f"Loading LoRA adapter: {lora_path}")

    # Wrap the base model with the adapter weights.
    model = PeftModel.from_pretrained(
        model,
        lora_path,
        device_map=device_map,
    )

    print("Merging LoRA weights with base model...")
    model = model.merge_and_unload()

    # Create save directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    print(f"Saving merged model to: {save_path}")

    model.save_pretrained(
        save_path,
        safe_serialization=True,  # Use safetensors format
    )

    # The tokenizer is saved next to the weights so save_path can be loaded
    # on its own.
    tokenizer.save_pretrained(save_path)

    print("Model merging and saving completed successfully!")

    return model, tokenizer

if __name__ == "__main__":
    # Model identifiers and paths — edit these three for your own setup.
    BASE_MODEL = "NousResearch/Llama-2-7b-hf"
    LORA_PATH = "/content/adapter"
    SAVE_PATH = "merged_llama2_lora"

    # Prefer the GPU when torch can see one; otherwise fall back to the CPU.
    if torch.cuda.is_available():
        DEVICE = "cuda"
    else:
        DEVICE = "cpu"
    print(f"Using device: {DEVICE}")

    try:
        # Arguments follow the signature order:
        # base_model_name, lora_path, save_path, device.
        merged_model, tokenizer = merge_lora_with_base(
            BASE_MODEL,
            LORA_PATH,
            SAVE_PATH,
            DEVICE,
        )
    except Exception as e:
        # Report the failure, then re-raise so the traceback is not lost.
        print(f"An error occurred: {e!s}")
        raise

Using device: cuda
Loading base model: NousResearch/Llama-2-7b-hf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Loading LoRA adapter: /content/adapter
Merging LoRA weights with base model...
Saving merged model to: merged_llama2_lora
Model merging and saving completed successfully!


# Text Generation Class
Class that loads a LLaMA model and runs inference with it, supporting various generation parameters and device configurations. It can return one or more generated sequences per prompt, with customizable sampling settings.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class TextGenerator:
    """Load a causal language model with its tokenizer and generate text."""

    def __init__(
        self,
        model_path,
        device="cuda" if torch.cuda.is_available() else "cpu",
        load_in_8bit=False
    ):
        """Load the model and tokenizer stored at ``model_path``.

        Args:
            model_path: Path or hub id passed to ``from_pretrained`` for both
                the model and the tokenizer.
            device: ``"cuda"`` or ``"cpu"``. The default is evaluated once,
                when the class is defined. ``"cuda"`` selects
                ``device_map="auto"`` and float16 weights; anything else
                selects no device map and float32 weights.
            load_in_8bit: Forwarded unchanged to ``from_pretrained``.
        """
        print(f"Loading model from {model_path}")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto" if device == "cuda" else None,
            load_in_8bit=load_in_8bit,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.device = device

    def generate(
        self,
        prompt,
        max_length=100,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1,
        do_sample=True,
        repetition_penalty=1.2
    ):
        """
        Generate text based on the input prompt.

        Args:
            prompt (str): Input text
            max_length (int): Forwarded to ``model.generate`` as ``max_length``
                (in Hugging Face transformers this limit counts the prompt
                tokens as well as the newly generated ones)
            temperature (float): Sampling temperature; higher values increase
                randomness
            top_p (float): Nucleus sampling parameter (0.0-1.0)
            top_k (int): Top-k sampling parameter
            num_return_sequences (int): Number of sequences to generate
            do_sample (bool): If False, uses greedy decoding
            repetition_penalty (float): Penalty for repeating tokens

        Returns:
            list[str]: One decoded string per generated sequence, with
            special tokens stripped.
        """
        # Encode the input prompt and move the tensors to the configured device
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        # Fall back to EOS only when the tokenizer has no pad token at all.
        # A truthiness test would wrongly discard a valid pad token id of 0.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        # Inference only, so gradient tracking is disabled
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                num_return_sequences=num_return_sequences,
                do_sample=do_sample,
                repetition_penalty=repetition_penalty,
                pad_token_id=pad_token_id
            )

        # Decode and return the generated text
        generated_texts = [
            self.tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        return generated_texts


In [None]:
# Load the merged checkpoint once and keep the generator for reuse.
generator = TextGenerator("/content/merged_llama2_lora")

# Sample one completion and keep the first returned sequence.
response = generator.generate(
    prompt="what do you know about large language models?",
    max_length=100,   # length cap forwarded to generate()
    temperature=0.5,  # moderate sampling randomness
)[0]

print(response)

Loading model from /content/merged_llama2_lora


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Write a story about AI: The End of Humanity
Ще немає попередньої версії цього сторіжка.
Використання цілого числа для верхнього індексу вказано, щоб уникнути проблеми з перевіркою номерів версій на малих платформах.
Повна назва "Український міф про «


In [None]:
# Ask for a concise answer; the first returned sequence is kept.
response = generator.generate(
    prompt="what are large language models? Give answer to the point",
    max_length=200,   # length cap forwarded to generate()
    temperature=0.1,  # low temperature: little sampling randomness
)[0]

print(response)

what are large language models? Give answer to the point.
Large Language Models (LLMs) are a type of deep learning model that is designed specifically for natural language processing tasks such as text generation, sentiment analysis and machine translation. These models have been trained on massive amounts of unstructured data, including web texts, books, articles and other sources in order to develop their ability to understand and generate human-like language. LLMs typically consist of several layers of interconnected neural networks with millions or even billions of parameters. The most advanced LLMs can process up to 20 trillion words worth of training data before being deployed into real world applications. Once activated, these powerful machines can quickly analyze vast quantities of textual information and produce accurate results within seconds or minutes depending upon complexity level required by user input requesting assistance from this technology platform solution provider