#1 : SETUP & LIBRARIES

In [1]:
import os
import gc
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import drive

# Mount Google Drive to save the final model
drive.mount('/content/drive')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Current compute device: {device}")

Mounted at /content/drive
Current compute device: cuda


# 2 : Merging method's (SLERP) LOGIC

In [2]:
# This cell contains the mathematical implementation of SLERP.

def lerp(t: float, v0: np.ndarray, v1: np.ndarray) -> np.ndarray:
    """
    Linear Interpolation (LERP).

    Used as a fallback when vectors are colinear (pointing in same direction).
    Formula: (1 - t) * v0 + t * v1

    Args:
        t (float): Interpolation factor [0, 1].
        v0 (np.ndarray): Starting vector.
        v1 (np.ndarray): Target vector.
    """
    return (1 - t) * v0 + t * v1

def normalize(v: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    """
    Normalizes a vector to unit length.

    In high-dimensional spaces (like LLM weights), magnitude represents
    significance. Normalizing allows us to compare 'directions' of features
    independent of their strength.

    Args:
        v (np.ndarray): Input vector.
        eps (float): Epsilon value to prevent division by zero errors.
    """
    norm_v = np.linalg.norm(v)
    if norm_v > eps:
        v = v / norm_v
    return v

def slerp(
    t: float,
    v0: torch.Tensor,
    v1: torch.Tensor,
    dot_threshold: float = 0.9995,
    eps: float = 1e-8
) -> torch.Tensor:
    """
    Spherical Linear Interpolation (SLERP).

    Unlike LERP, which cuts a straight line through the sphere (shortening vectors),
    SLERP follows the curvature of the sphere. This preserves the 'magnitude'
    (variance) of the weights, which is crucial for maintaining LLM stability.

    Args:
        t (float): The mixing factor.
                   0.0 = 100% Model A, 1.0 = 100% Model B.
        v0 (torch.Tensor): Weights from Model A.
        v1 (torch.Tensor): Weights from Model B.
        dot_threshold (float): Threshold to decide when vectors are parallel.
                               If dot product > 0.9995, we switch to LERP for stability.
        eps (float): Epsilon for numerical stability.

    Returns:
        torch.Tensor: The merged weight tensor.
    """

    # 1. Convert PyTorch Tensors to NumPy
    # We move to CPU and convert to float32 (numpy default) for high-precision math.
    # LLM weights are often Float16/Bfloat16, but interpolation requires Float32 to avoid rounding errors destroying the model.
    v0_np = v0.detach().cpu().float().numpy()
    v1_np = v1.detach().cpu().float().numpy()

    # 2. Copy vectors to preserve original magnitudes for the final reconstruction
    v0_copy = np.copy(v0_np)
    v1_copy = np.copy(v1_np)

    # 3. Normalize vectors to unit sphere to calculate angles
    v0_np = normalize(v0_np, eps)
    v1_np = normalize(v1_np, eps)

    # 4. Calculate the Dot Product (Cosine of angle theta)
    # This tells us how "close" the two models are in feature space.
    dot = np.sum(v0_np * v1_np)

    # 5. Handle Alignment (Colinearity)
    # If the vectors are almost identical (dot close to 1), SLERP calculation
    # involves division by sin(0), which is unstable. We fallback to LERP.
    if np.abs(dot) > dot_threshold:
        res = lerp(t, v0_copy, v1_copy)
        return torch.from_numpy(res).to(v0.dtype).to(v0.device)

    # 6. Calculate Angles (Theta)
    # arccos(dot) gives us the angle between the two vectors
    theta_0 = np.arccos(dot)
    sin_theta_0 = np.sin(theta_0)

    # Calculate the angle at the interpolation point t
    theta_t = theta_0 * t
    sin_theta_t = np.sin(theta_t)

    # 7. Apply the SLERP Formula
    # factor0 scales the contribution of v0 based on angle
    # factor1 scales the contribution of v1 based on angle
    s0 = np.sin(theta_0 - theta_t) / sin_theta_0
    s1 = sin_theta_t / sin_theta_0

    # Reconstruct the vector using the original magnitudes (v0_copy, v1_copy)
    res = s0 * v0_copy + s1 * v1_copy

    # 8. Convert back to PyTorch Tensor with original dtype/device
    return torch.from_numpy(res).to(v0.dtype).to(v0.device)

# 3 : THE MERGE EXECUTION

In [3]:
def merge_models(model_path_a, model_path_b, output_path, merge_ratio=0.5):
    """
    Loads two models, applies SLERP layer-by-layer, and saves the result.
    """
    print(f"Loading Model A: {model_path_a}...")
    # Loading map_location='cpu' to spare GPU VRAM for calculations if needed
    # Note: Loading two 7B models requires ~28GB CPU RAM.
    model_a = AutoModelForCausalLM.from_pretrained(
        model_path_a,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="cpu"
    )

    print(f"Loading Model B: {model_path_b}...")
    model_b = AutoModelForCausalLM.from_pretrained(
        model_path_b,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="cpu"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path_a)

    print(f"Starting SLERP Merge with t={merge_ratio}...")

    # We iterate over the state dictionary of Model A
    # The assumption is Model A and B have identical architectures (layer names)
    model_a_dict = model_a.state_dict()
    model_b_dict = model_b.state_dict()

    merged_dict = {}

    for key in model_a_dict.keys():
        # Skip keys that might track metrics like 'num_batches_tracked'
        if key not in model_b_dict:
            print(f"Skipping {key}: Not found in Model B")
            continue

        tensor_a = model_a_dict[key]
        tensor_b = model_b_dict[key]

        # Check shapes
        if tensor_a.shape != tensor_b.shape:
            # Sometimes vocab sizes differ slightly, we usually take the larger one
            # or raise an error. For this specific merge, we assume compatibility.
            print(f"Shape mismatch at {key}: {tensor_a.shape} vs {tensor_b.shape}. Skipping.")
            merged_dict[key] = tensor_a
            continue

        # Apply SLERP to weight tensors
        # We skip integers (like bias terms usually are floats, but position IDs are ints)
        if "int" in str(tensor_a.dtype):
            merged_dict[key] = tensor_a
        else:
            # The core magic happens here
            merged_dict[key] = slerp(merge_ratio, tensor_a, tensor_b)

        # Garbage collection to free up RAM during the loop
        if key in model_b_dict:
            del model_b_dict[key]

    # Clear memory of Model B completely
    del model_b
    gc.collect()

    print("Merge complete. Saving model...")

    # Load the merged weights back into Model A structure
    model_a.load_state_dict(merged_dict)

    # Save to Drive
    model_a.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print(f"Model successfully saved to: {output_path}")

# 4 : CONFIGURATION & RUN

In [4]:
# Configuration
MODEL_1_URL = "teknium/OpenHermes-2.5-Mistral-7B"
MODEL_2_URL = "Gryphe/MythoMist-7b"
MERGED_NAME = "MyMergedModel01-7B"

# Where to save on Google Drive
OUTPUT_DIR = f"/content/drive/MyDrive/{MERGED_NAME}"

# The SLERP Factor (t)
# 0.5 means equal mix.
# 0.7 means 30% Model A, 70% Model B.
MERGE_RATIO = 0.5

# Run the Merge
# This will download the models, merge them, and upload to your Drive.
try:
    merge_models(MODEL_1_URL, MODEL_2_URL, OUTPUT_DIR, MERGE_RATIO)
except RuntimeError as e:
    print(f"Error: {e}")
    print("If you ran out of RAM (OOM), try using a High-RAM runtime.")

Loading Model A: teknium/OpenHermes-2.5-Mistral-7B...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

Loading Model B: Gryphe/MythoMist-7b...


config.json:   0%|          | 0.00/600 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Starting SLERP Merge with t=0.5...
Shape mismatch at model.embed_tokens.weight: torch.Size([32002, 4096]) vs torch.Size([32000, 4096]). Skipping.
Shape mismatch at lm_head.weight: torch.Size([32002, 4096]) vs torch.Size([32000, 4096]). Skipping.
Merge complete. Saving model...
Model successfully saved to: /content/drive/MyDrive/MyMergedModel01-7B


#5 : TESTING THE MERGED MODEL

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to our saved model on Drive
saved_model_path = "/content/drive/MyDrive/MyMergedModel01-7B"

print(f"Loading merged model from {saved_model_path}...")

# Load Model & Tokenizer
# We use device_map="auto" to put it on GPU for inference
model = AutoModelForCausalLM.from_pretrained(
    saved_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

# Test Prompt
prompt = "Explain the concept of quantum entanglement to a 5-year-old."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

# Apply Chat Template (OpenHermes uses ChatML usually)
# If the tokenizer doesn't have a chat_template, we format manually
if tokenizer.chat_template:
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
else:
    # Fallback manual formatting
    input_text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

print("Generating response...")
outputs = model.generate(
    inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

print("-" * 50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print("-" * 50)

Loading merged model from /content/drive/MyDrive/MyMergedModel01-7B...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating response...
--------------------------------------------------
system
You are a helpful assistant. 
 user
Explain the concept of quantum entanglement to a 5-year-old. 

Imagine you have two toys, let's say a teddy bear and a toy car. They are friends, and no matter how far apart they go, they always know what the other one is doing. Now, quantum entanglement is like this, but for tiny particles that we can't see with our eyes. These particles can be very far away from each other, but they still know what the other one is doing, even if we try to hide information from one of them. It's like they are connected by a special invisible thread that can't be broken, even if we move them to different ends of the universe. This is a very mysterious and fascinating thing that happens in the world of tiny particles.
--------------------------------------------------
