In [None]:
import os
from copy import deepcopy

# huggingface_hub resolves HF_ENDPOINT when it is first imported, so the mirror
# must be configured BEFORE transformers (which imports huggingface_hub) is
# loaded; setting it after the imports has no effect on the download endpoint.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from transformers import AutoConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint to load
model_name = "mistralai/Mistral-7B-v0.1"

# Hub cache under the current user's home directory rather than one specific
# account's home, so the notebook also runs for other users / machines.
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")

origin_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    # device_map="auto",
    torch_dtype=torch.float16,
)

# Config of the loaded model; this is the model's own config object, not a copy,
# so the edits below are visible through origin_model.config as well.
config = origin_model.config

# Name of the merged model (also used as the output directory name below)
config._name_or_path = "Mistral-7B*2-48layers"

# Output directory of the merged model
save_dir = f"./save_model/{config._name_or_path}"
os.makedirs(save_dir, exist_ok=True)

# Independent deep copy of the model; it serves as the second merge source and
# is only read by the cells below.
copy_model = deepcopy(origin_model)

In [None]:
# Load the tokenizer of the original checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Write the tokenizer files into the merged model's directory
tokenizer.save_pretrained(save_directory=save_dir)

In [None]:
# Manual merge of origin_model (A) and copy_model (B) into a 48-layer stack (C).

# This cell replaces origin_model's layers in place, so it has to start from the
# original 32-layer stack. Running it again on an already merged model would
# average layers that are themselves merge results.
if len(origin_model.model.layers) != 32:
    raise RuntimeError(
        f"origin_model has {len(origin_model.model.layers)} layers, expected 32; "
        "reload the model before merging again"
    )

# Holds the merged layers
new_layers = []

# C0~C15 = A0~A15
new_layers.extend(deepcopy(origin_model.model.layers[:16]))

# C16~C31 = A16~A31 + B0~B15, element-wise average of corresponding layers
for i in range(16, 32):
    new_layer = deepcopy(origin_model.model.layers[i])
    # Build B's state dict once per layer instead of once per parameter.
    b_state = copy_model.model.layers[i - 16].state_dict()
    for param_name, param_tensor in new_layer.named_parameters():
        # The sum is a fresh tensor, so B's weights are never modified.
        param_tensor.data.copy_((param_tensor.data + b_state[param_name]) * 0.5)
    new_layers.append(new_layer)

# C32~C47 = B16~B31
new_layers.extend(deepcopy(copy_model.model.layers[16:]))

# NOTE(review): recent transformers releases appear to index the generation KV
# cache by each attention module's `layer_idx`. Copied layers keep the index of
# their source position, so duplicates would share a cache slot. Renumber by
# position where the attribute exists (no-op otherwise) - confirm against the
# installed transformers version.
for position, layer in enumerate(new_layers):
    for submodule in layer.modules():
        if hasattr(submodule, "layer_idx"):
            submodule.layer_idx = position

# Install the merged stack. origin_model.model was already dereferenced above,
# so origin_model cannot be a DataParallel wrapper at this point.
origin_model.model.layers = torch.nn.ModuleList(new_layers)

# Keep the config in sync with the new depth
config.num_hidden_layers = len(new_layers)


In [None]:
# Merge method
def merge_models(origin_model, copy_model, n, s, m):
    """Stack the decoder layers of two models into one model with ``s`` layers.

    With A = ``origin_model.model.layers`` and B = ``copy_model.model.layers``
    the merged stack C is laid out as:

    * ``C[0 : n-m]`` = copies of ``A[0 : n-m]``
    * ``C[n-m : n]`` = ``(A[n-m : n] + B[0 : m]) * 0.5`` per parameter
    * ``C[n : s]``   = copies of ``B[m : s-n+m]``

    Args:
        origin_model: Model whose ``model.layers`` is replaced in place by the
            merged stack. If it has a ``config.num_hidden_layers`` it is updated
            to the new depth so a saved config matches the saved weights.
        copy_model: Second source model; it is only read, never modified.
        n: Original number of layers in the origin model.
        s: Total number of layers desired in the new model.
        m: Number of layers to be merged (averaged).

    Returns:
        ``origin_model`` itself (the same object), now holding ``s`` layers.

    Raises:
        ValueError: if ``m``/``n``/``s`` do not describe a valid layout for the
            given models (slicing would otherwise silently yield fewer than
            ``s`` layers).
    """
    origin_layers = origin_model.model.layers
    copy_layers = copy_model.model.layers

    if not 0 <= m <= n:
        raise ValueError(f"m must satisfy 0 <= m <= n, got m={m}, n={n}")
    if n > len(origin_layers):
        raise ValueError(
            f"n={n} exceeds the {len(origin_layers)} layers of origin_model"
        )
    if s < n:
        raise ValueError(f"s must be at least n, got s={s}, n={n}")
    if s - n + m > len(copy_layers):
        raise ValueError(
            f"s={s} needs copy_model layers up to index {s - n + m - 1}, "
            f"but copy_model has only {len(copy_layers)} layers"
        )

    # Holds the merged layers
    new_layers = []

    # Step 1: Copy the initial layers from the origin model as is
    # C0~C(n-m-1) = A0~A(n-m-1)
    new_layers.extend(deepcopy(origin_layers[:n - m]))

    # Step 2: Merge the overlapping layers by averaging
    # C(n-m)~C(n-1) = A(n-m)~A(n-1) + B0~B(m-1), element-wise average
    for offset in range(m):
        new_layer = deepcopy(origin_layers[n - m + offset])
        # Build the source state dict once per layer, not once per parameter.
        source_state = copy_layers[offset].state_dict()
        for param_name, param_tensor in new_layer.named_parameters():
            # The sum is a fresh tensor, so copy_model's weights stay untouched.
            param_tensor.data.copy_(
                (param_tensor.data + source_state[param_name]) * 0.5
            )
        new_layers.append(new_layer)

    # Step 3: Add the remaining layers from the copy model
    # Cn~C(s-1) = Bm~B(s-n+m-1)
    new_layers.extend(deepcopy(copy_layers[m:s - n + m]))

    # NOTE(review): recent transformers releases appear to index the generation
    # KV cache by each attention module's `layer_idx`. Copied layers keep the
    # index of their source position, so duplicates would share a cache slot.
    # Renumber by position where the attribute exists (no-op otherwise) -
    # confirm against the installed transformers version.
    for position, layer in enumerate(new_layers):
        for submodule in layer.modules():
            if hasattr(submodule, "layer_idx"):
                submodule.layer_idx = position

    # Replace the layers in the original model with the new merged layers
    origin_model.model.layers = torch.nn.ModuleList(new_layers)

    # Keep the config's depth in sync with the new stack.
    model_config = getattr(origin_model, "config", None)
    if model_config is not None and hasattr(model_config, "num_hidden_layers"):
        model_config.num_hidden_layers = len(new_layers)

    return origin_model

# Define the parameters
n = 32  # Original number of layers in the origin model
s = 48  # Total number of layers desired in the new model
m = 8   # Number of layers to be merged

# merge_models replaces origin_model's layers in place. If origin_model was
# already merged (by an earlier cell, or by a previous run of this cell), merging
# again would average layers that are themselves merge results. copy_model is
# only ever read, so it still holds the original stack: restore from it first.
# This keeps the cell idempotent at the cost of one extra copy of the layers.
if len(origin_model.model.layers) != n:
    print(
        f"origin_model has {len(origin_model.model.layers)} layers, expected {n}; "
        "restoring the original layers from copy_model before merging"
    )
    origin_model.model.layers = deepcopy(copy_model.model.layers)

# Merge the models (merged_model is origin_model itself, modified in place)
merged_model = merge_models(origin_model, copy_model, n, s, m)

# Keep the config in sync with the merged depth so that the saved config
# describes the same number of layers as the saved weights.
merged_model.config.num_hidden_layers = len(merged_model.model.layers)

# Save the merged model. merge_models already dereferenced origin_model.model,
# so the model cannot be a DataParallel wrapper here.
merged_model.save_pretrained(save_dir)

print(f"新模型已保存到：{save_dir}; 新模型隐藏层个数：{len(origin_model.model.layers)}")

In [None]:
# Walk over every submodule of the model (all nesting levels) and print
# its qualified name followed by the module itself
for module_name, submodule in origin_model.named_modules():
    print(module_name, submodule)

In [None]:
# Pick one layer to inspect
layer_index = 5  # indices are 0-based, so 5 selects the 6th layer

# Print the parameters of one layer
def print_layer_parameters(model, layer_index):
    """Print every state-dict entry of one layer of ``model``.

    For each entry of ``model.model.layers[layer_index].state_dict()`` two lines
    are printed: the entry name with the tensor size, then the tensor itself.

    Args:
        model: Object exposing its layers as ``model.model.layers``.
        layer_index: Index into ``model.model.layers``.

    Returns:
        None; the output goes to stdout.
    """
    layer = model.model.layers[layer_index]
    # state_dict() builds a new mapping on every call, so build it once and
    # iterate its items instead of rebuilding it for every lookup.
    for entry_name, entry_tensor in layer.state_dict().items():
        print(entry_name, "\t", entry_tensor.size())
        print(entry_tensor)

# Print a heading, then dump every state-dict entry of the layer selected by layer_index.
print("特定层的参数权重：")
print_layer_parameters(origin_model, layer_index)

In [None]:
# Model test 1: beam-search generation from a short prompt

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
origin_model.to(device)

# Prompt text (change as needed), encoded to token ids on the model's device
input_text = "这是一个例子文本"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate a continuation; the generate() arguments (max_length, num_beams, ...)
# can be tuned to change the result
generated_ids = origin_model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

# Decode the first returned sequence back into text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("输入文本:", input_text)
print("生成文本:", generated_text)


In [None]:
# Model test 2: generation from an English prompt with default decoding settings

text = "Hi, my name is "

# Tokenize the prompt and move the resulting tensors to the model's device
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate up to 64 new tokens and print the decoded first sequence
outputs = origin_model.generate(max_new_tokens=64, **inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# TODO: evaluate the merged model on MMLU
