In [1]:
# @title 1. Install Dependencies
!pip install -q torch transformers accelerate safetensors huggingface_hub

In [2]:
# @title 2. Import Libraries & Mount Drive
import torch
import os
import gc
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer
from safetensors.torch import save_file
from tqdm import tqdm

# Mount Google Drive
drive.mount('/content/drive')

# Authenticate with Hugging Face (Required for Mistral Base)
from huggingface_hub import notebook_login
print("Please enter your Hugging Face Write Token (required for accessing Mistral Base):")
notebook_login()

Mounted at /content/drive
Please enter your Hugging Face Write Token (required for accessing Mistral Base):


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# @title 3. Define TIES Merge Function (Optimized for usage)
def ties_merge_weights(base_w, model_ws, density=0.5, lam=1.0):
    """
    Optimized TIES merge for single layer weights.
    """
    # 1. Calculate Deltas
    deltas = [w - base_w for w in model_ws]
    stacked_deltas = torch.stack(deltas, dim=0)

    # 2. Sparsify (Trim)
    # Calculate threshold for top k% magnitude
    k = int(stacked_deltas.numel() * density)
    if k < 1: return base_w # Skip if density is too low

    magnitude = stacked_deltas.abs()
    # torch.kthvalue is slow on large tensors, simple sort is often faster for 1D
    # flattening effectively puts all params in one list
    sorted_mag, _ = torch.sort(magnitude.flatten())
    threshold = sorted_mag[-k] # The value at the cutoff index

    mask = magnitude >= threshold
    sparse_deltas = stacked_deltas * mask

    # 3. Elect (Sign Consensus)
    signs = torch.sign(sparse_deltas)
    summed_signs = torch.sum(signs, dim=0)
    consensus_sign = torch.sign(summed_signs)

    # Filter disagreements
    consensus_mask = (signs == consensus_sign)
    filtered_deltas = sparse_deltas * consensus_mask

    # 4. Merge
    summed_deltas = torch.sum(filtered_deltas, dim=0)
    num_contributors = torch.sum(consensus_mask, dim=0)
    num_contributors = torch.clamp(num_contributors, min=1.0)

    final_delta = summed_deltas / num_contributors

    # 5. Apply
    return base_w + (lam * final_delta)

In [4]:
# @title 4. Execute Layer-wise Merge

# --- HELPER FUNCTION FOR VOCAB MISMATCH ---
def align_and_pad(base_t, model_ts):
    """
    Checks if tensors have different vocabulary sizes (e.g., 32002 vs 32000).
    If so, pads the smaller ones with zeros to match the largest.
    """
    # 1. Check shapes
    shapes = [base_t.shape[0]] + [t.shape[0] for t in model_ts]
    max_vocab = max(shapes)

    # If all shapes are the same, return as is
    if all(s == max_vocab for s in shapes):
        return base_t, model_ts

    print(f"  [Auto-Fix] Resizing vocab from {shapes} to {max_vocab}...")

    # 2. Define padding function
    def pad_tensor(t, target_len):
        current_len = t.shape[0]
        if current_len == target_len:
            return t

        # Calculate how many rows to add
        diff = target_len - current_len

        # Create zeros of the same data type and on the same device
        # We assume the tensor is 2D [vocab, hidden_dim] (like embed_tokens or lm_head)
        zeros = torch.zeros((diff, t.shape[1]), dtype=t.dtype, device=t.device)

        # Concatenate along the vocab dimension (dim 0)
        return torch.cat([t, zeros], dim=0)

    # 3. Apply padding
    base_t = pad_tensor(base_t, max_vocab)
    model_ts = [pad_tensor(t, max_vocab) for t in model_ts]

    return base_t, model_ts

# --- MAIN EXECUTION ---
import os
import gc
from tqdm import tqdm

# Configuration
base_model_name = "mistralai/Mistral-7B-v0.1"
model_paths = [
    "teknium/OpenHermes-2.5-Mistral-7B",
    "Gryphe/MythoMist-7b"
]
merged_model_name = "MyMergedModel02-7B"
save_path = f"/content/drive/MyDrive/{merged_model_name}"
os.makedirs(save_path, exist_ok=True)

# TIES Parameters
density = 0.5
lam = 1.0

print(f"Starting TIES merge (Robust Mode)...")

# 1. Load Base Model Structure (CPU)
print("Loading Base Model structure...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="cpu",
    trust_remote_code=True
)

# 2. Save Tokenizer from the LARGER model (OpenHermes) to ensure special tokens exist
print("Saving tokenizer from OpenHermes (to keep special tokens)...")
tokenizer = AutoTokenizer.from_pretrained(model_paths[0])
tokenizer.save_pretrained(save_path)

base_state_dict = base_model.state_dict()
keys = list(base_state_dict.keys())
new_state_dict = {}

# 3. Load Source Models (CPU)
print("Loading Source Models...")
model_1 = AutoModelForCausalLM.from_pretrained(model_paths[0], torch_dtype=torch.float16, device_map="cpu")
model_2 = AutoModelForCausalLM.from_pretrained(model_paths[1], torch_dtype=torch.float16, device_map="cpu")

print("Merging layers...")

for key in tqdm(keys):
    # Retrieve weights
    base_w = base_state_dict[key]

    # Check if this key exists in the fine-tunes (sometimes structures vary slightly)
    if key not in model_1.state_dict() or key not in model_2.state_dict():
        print(f"Skipping key {key} (missing in one of the models)")
        new_state_dict[key] = base_w
        continue

    w1 = model_1.state_dict()[key]
    w2 = model_2.state_dict()[key]

    # Skip non-float parameters (like tracking stats)
    if base_w.dtype not in [torch.float16, torch.float32, torch.bfloat16]:
        new_state_dict[key] = base_w
        continue

    # --- FIX: ALIGN VOCAB SIZES ---
    # This detects if w1 is 32002 and base is 32000, and pads base/w2 with zeros
    base_w, [w1, w2] = align_and_pad(base_w, [w1, w2])

    # Apply TIES
    try:
        # Move to GPU for calculation
        res = ties_merge_weights(
            base_w.to("cuda"),
            [w1.to("cuda"), w2.to("cuda")],
            density=density,
            lam=lam
        )
        new_state_dict[key] = res.to("cpu") # Move back to CPU to save RAM
    except RuntimeError as e:
        print(f"GPU Error on {key}, falling back to CPU: {e}")
        res = ties_merge_weights(base_w, [w1, w2], density=density, lam=lam)
        new_state_dict[key] = res

    # Explicitly delete tensors to free memory immediately
    del base_w, w1, w2
    torch.cuda.empty_cache()

# Cleanup
del model_1
del model_2
del base_model
gc.collect()

# Save
print(f"Saving merged model to {save_path}...")
save_file(new_state_dict, os.path.join(save_path, "model.safetensors"))

# Save Config
print("Saving config...")
base_config = AutoModelForCausalLM.from_pretrained(base_model_name).config
# Update config vocab size to match the new larger size
base_config.vocab_size = new_state_dict["lm_head.weight"].shape[0]
base_config.save_pretrained(save_path)

print("Merge Complete! Model saved to Google Drive.")

Starting TIES merge (Robust Mode)...
Loading Base Model structure...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Saving tokenizer from OpenHermes (to keep special tokens)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Loading Source Models...


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/600 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Merging layers...


  0%|          | 0/291 [00:00<?, ?it/s]

  [Auto-Fix] Resizing vocab from [32000, 32002, 32000] to 32002...


 99%|█████████▉| 288/291 [00:41<00:00,  6.20it/s]

  [Auto-Fix] Resizing vocab from [32000, 32002, 32000] to 32002...


100%|██████████| 291/291 [00:41<00:00,  6.97it/s]


Saving merged model to /content/drive/MyDrive/MyMergedModel02-7B...
Saving config...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Merge Complete! Model saved to Google Drive.


In [5]:
# @title: 5. Testing the Merged Model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to our saved model on Drive
saved_model_path = "/content/drive/MyDrive/MyMergedModel02-7B"

print(f"Loading merged model from {saved_model_path}...")

# Load Model & Tokenizer
# We use device_map="auto" to put it on GPU for inference
model = AutoModelForCausalLM.from_pretrained(
    saved_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

# Test Prompt
prompt = "Explain the concept of quantum entanglement to a 5-year-old."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

# Apply Chat Template (OpenHermes uses ChatML usually)
# If the tokenizer doesn't have a chat_template, we format manually
if tokenizer.chat_template:
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
else:
    # Fallback manual formatting
    input_text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

print("Generating response...")
outputs = model.generate(
    inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

print("-" * 50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print("-" * 50)

Loading merged model from /content/drive/MyDrive/MyMergedModel02-7B...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating response...
--------------------------------------------------
system
You are a helpful assistant. 
 user
Explain the concept of quantum entanglement to a 5-year-old. 

Imagine you have two best toy friends, let's call them Toy A and Toy B. Even when they are far away from each other, they still share a secret bond. They can feel what the other is feeling without actually talking to each other. This special connection is called quantum entanglement. In the world of tiny things called particles, this bond can exist between two particles. When something happens to one particle, the other one reacts to it instantly, no matter how far they are. Just like your toy friends, they know they are connected and share a secret bond, even without seeing each other. This is a very mysterious and special thing that only happens in the world of very, very small things.
--------------------------------------------------
