In [1]:
import os
import torch
from tqdm import tqdm
import bitsandbytes as bnb
from copy import deepcopy

In [2]:
ckpt = torch.load("../checkpoints/llama2/Llama-2-7b/consolidated.00.pth",map_location='cpu')

In [3]:
wo_list = []
for key,val in ckpt.items():
    if key.endswith("w1.weight"):
        wo_list += [val.to("cuda", torch.float32)]

In [18]:
rank = 512
range_anchor = 2
w_base = torch.nn.Parameter(torch.empty_like(wo_list[0])).cuda()
torch.nn.init.xavier_normal_(w_base)
w_lora_list = []
for ii in range(range_anchor):
    lora1 = torch.nn.Parameter(torch.empty_like(wo_list[0][:,:rank]),requires_grad=True)
    lora2 = torch.nn.Parameter(torch.empty_like(wo_list[0][:rank,:]),requires_grad=True)
    torch.nn.init.xavier_normal_(lora1)
    torch.nn.init.xavier_normal_(lora2)
    w_lora_list +=[(lora1.cuda(), lora2.cuda())]
print(w_base.shape, w_lora_list[0][0].shape, w_lora_list[0][1].shape)

torch.Size([11008, 4096]) torch.Size([11008, 512]) torch.Size([512, 4096])


In [19]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD([w_lora_list[ii][0] for ii in range(range_anchor)]+[w_lora_list[ii][1] for ii in range(range_anchor)]+[w_base], lr=100000)

pbar = tqdm([ll for ll in range(500)], desc='Training', leave=True)
for epoch in pbar:
    optimizer.zero_grad()           # Zero the gradients
    loss = 0
    for idx, ww in enumerate(wo_list[:range_anchor]):
        w_approx = w_base + w_lora_list[idx][0] @ w_lora_list[idx][1]
        loss += criterion(w_approx, ww)  # Calculate loss
    # print(loss.item())
    loss.backward()
    pbar.set_description(f'Training - Epoch {epoch+1}, Loss: {loss.item():.2e}, w:{w_base[0,0].item():.2e} grad:{w_base.grad[0,0].item():.2e}')
    # print(w_base.grad[0,0])
    # print(w_base[0,0])
    optimizer.step()
print(loss)

Training - Epoch 500, Loss: 1.41e-04, w:1.92e-02 grad:9.90e-11: 100%|██████████| 500/500 [00:19<00:00, 25.10it/s] 

tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>)





torch.Size([512, 4096])

In [None]:
rank = 256
range_anchor = 2
# def low_rank_equivalent(base, target):
#     delta = target.clone().detach() - base.clone().detach()
#     u,s,v = torch.svd(delta.to(torch.float32))
#     k = rank
#     u_topk, s_topk, v_topk = u[:, :k], s[:k], v[:, :k]

#     lora_b = torch.mm(u_topk, torch.diag(s_topk.sqrt())).to(target.dtype)
#     lora_a = torch.mm(torch.diag(s_topk.sqrt()), v_topk.t()).to(target.dtype)
#     return lora_a, lora_b
# w_base = torch.nn.Parameter(wo_list[0].data.clone()).cuda()
# w_lora_list = []
# for ii in tqdm([kk for kk in range(1,32)]):
#     lora_a, lora_b = low_rank_equivalent(w_base.data, wo_list[ii])
#     lora1 = torch.nn.Parameter(lora_b.data.clone(),requires_grad=True)
#     lora2 = torch.nn.Parameter(lora_a.data.clone(),requires_grad=True)
#     w_lora_list +=[(lora1.cuda(), lora2.cuda())]
# lora1 = torch.nn.Parameter(torch.empty_like(wo_list[0][:,:rank]),requires_grad=True)
# lora2 = torch.nn.Parameter(torch.empty_like(wo_list[0][:rank,:]),requires_grad=True)
# torch.nn.init.xavier_normal_(lora1)
# torch.nn.init.xavier_normal_(lora2)
# w_lora_list = [(lora1.cuda(), lora2.cuda())] + w_lora_list


w_base = torch.nn.Parameter(torch.empty_like(wo_list[0])).cuda()
torch.nn.init.xavier_normal_(w_base)
w_lora_list = []
for ii in range(range_anchor):
    lora1 = torch.nn.Parameter(torch.empty_like(wo_list[0][:,:rank]),requires_grad=True)
    lora2 = torch.nn.Parameter(torch.empty_like(wo_list[0][:rank,:]),requires_grad=True)
    torch.nn.init.xavier_normal_(lora1)
    torch.nn.init.xavier_normal_(lora2)
    w_lora_list +=[(lora1.cuda(), lora2.cuda())]


weight = bnb.nn.Params4bit(
                wo_list[0].data.clone().cpu(), 
                requires_grad=False,
                quant_type='nf4',
)

In [None]:
weight.to("cuda")
weight_nf4 = bnb.functional.dequantize_4bit(weight, weight.quant_state)
((wo_list[0].data.clone()-weight_nf4)**2).mean()

In [None]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD([w_lora_list[ii][0] for ii in range(range_anchor)]+[w_lora_list[ii][1] for ii in range(range_anchor)]+[w_base], lr=100000)

In [None]:
pbar = tqdm([ll for ll in range(500)], desc='Training', leave=True)
for epoch in pbar:
    optimizer.zero_grad()           # Zero the gradients
    loss = 0
    for idx, ww in enumerate(wo_list[:range_anchor]):
        w_approx = w_base + w_lora_list[idx][0] @ w_lora_list[idx][1]
        loss += criterion(w_approx, ww)  # Calculate loss
    # print(loss.item())
    loss.backward()
    pbar.set_description(f'Training - Epoch {epoch+1}, Loss: {loss.item():.2e}, w:{w_base[0,0].item():.2e} grad:{w_base.grad[0,0].item():.2e}')
    # print(w_base.grad[0,0])
    # print(w_base[0,0])
    optimizer.step()
print(loss)

In [None]:
w_base.std(), w_base.mean()

In [None]:
idx=0
(w_lora_list[idx][0] @ w_lora_list[idx][1]).std(),(w_lora_list[idx][0] @ w_lora_list[idx][1]).mean()

In [None]:
def low_rank_equivalent(base, target):
    delta = target.clone().detach() - base.clone().detach()
    u,s,v = torch.svd(delta.to(torch.float32))
    k = 1024
    u_topk, s_topk, v_topk = u[:, :k], s[:k], v[:, :k]

    lora_b = torch.mm(u_topk, torch.diag(s_topk.sqrt())).to(target.dtype)
    lora_a = torch.mm(torch.diag(s_topk.sqrt()), v_topk.t()).to(target.dtype)
    return base + lora_b@lora_a
delta = 0
for ww in tqdm(wo_list[1:]):
    new_ww = low_rank_equivalent(wo_list[0],ww)
    delta += ((ww-new_ww)**2).mean()
print(delta)

In [None]:
ckpt_new = deepcopy(ckpt)

In [23]:
for ending_name in ["wq.weight","wk.weight","wv.weight","wo.weight"]:
    wo_list = []
    w_name_list = []
    for key,val in ckpt.items():
        if key.endswith(ending_name):
            wo_list += [val.to("cuda", torch.float32)]
            w_name_list += [key]

    rank = 512
    range_anchor = 2
    print(f"(In a group) full params:{4096*4096*range_anchor}, retained params:{4096*rank*2*range_anchor+4096}, reduced:{(4096*rank*2*range_anchor+4096)/(4096*4096*range_anchor):.3f}")

    for group_idx in range(0,32,range_anchor):
        w_base = torch.nn.Parameter(torch.empty_like(wo_list[group_idx])).cuda()
        torch.nn.init.xavier_normal_(w_base)
        w_lora_list = []
        for ii in range(range_anchor):
            lora1 = torch.nn.Parameter(torch.empty_like(wo_list[0][:,:rank]),requires_grad=True)
            lora2 = torch.nn.Parameter(torch.empty_like(wo_list[0][:rank,:]),requires_grad=True)
            torch.nn.init.xavier_normal_(lora1)
            torch.nn.init.xavier_normal_(lora2)
            w_lora_list +=[(lora1.cuda(), lora2.cuda())]

        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.SGD([w_lora_list[ii][0] for ii in range(range_anchor)]+[w_lora_list[ii][1] for ii in range(range_anchor)]+[w_base], lr=500000)

        pbar = tqdm([ll for ll in range(1000)], desc=f'Training group_idx={group_idx}', leave=True)
        for epoch in pbar:
            optimizer.zero_grad()           # Zero the gradients
            loss = 0
            names = w_name_list[group_idx:group_idx+range_anchor]
            for idx, ww in enumerate(wo_list[group_idx:group_idx+range_anchor]):
                w_approx = w_base + w_lora_list[idx][0] @ w_lora_list[idx][1]
                loss += criterion(w_approx, ww)  # Calculate loss
            loss.backward()
            pbar.set_description(f'Training - Epoch {epoch+1}, Loss: {loss.item():.2e}, w:{w_base[0,0].item():.2e} grad:{w_base.grad[0,0].item():.2e}')
            optimizer.step()
        for layer_idx, name in enumerate(names):
            ckpt_new[name] = w_base.to(ckpt_new[name].dtype)
            ckpt_new[name.replace("weight","lora_a.weight")] = w_lora_list[layer_idx][1]
            ckpt_new[name.replace("weight","lora_b.weight")] = w_lora_list[layer_idx][0]
        print(group_idx,loss, names)
        print("-"*40)

(In a group) full params:33554432, retained params:8392704, reduced:0.250


Training - Epoch 1000, Loss: 1.04e-05, w:-4.78e-03 grad:-2.79e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.96it/s]


0 tensor(1.0439e-05, device='cuda:0', grad_fn=<AddBackward0>) ['layers.0.attention.wq.weight', 'layers.1.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.16e-04, w:-1.88e-03 grad:-1.39e-13: 100%|██████████| 1000/1000 [00:15<00:00, 63.35it/s]


2 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.2.attention.wq.weight', 'layers.3.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.38e-04, w:-1.24e-02 grad:3.34e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.69it/s]


4 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.4.attention.wq.weight', 'layers.5.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.15e-04, w:2.88e-03 grad:2.84e-13: 100%|██████████| 1000/1000 [00:15<00:00, 63.94it/s]


6 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.6.attention.wq.weight', 'layers.7.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.29e-04, w:-1.13e-02 grad:5.96e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.70it/s]


8 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.8.attention.wq.weight', 'layers.9.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.09e-04, w:4.73e-03 grad:6.75e-13: 100%|██████████| 1000/1000 [00:15<00:00, 63.60it/s]


10 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.10.attention.wq.weight', 'layers.11.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.21e-04, w:3.03e-03 grad:-1.46e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.80it/s]


12 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.12.attention.wq.weight', 'layers.13.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.15e-04, w:-1.35e-02 grad:-5.21e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.60it/s]


14 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.14.attention.wq.weight', 'layers.15.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.24e-04, w:1.60e-02 grad:1.23e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.81it/s]


16 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.16.attention.wq.weight', 'layers.17.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.32e-04, w:-1.82e-03 grad:-3.59e-13: 100%|██████████| 1000/1000 [00:15<00:00, 63.81it/s]


18 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.18.attention.wq.weight', 'layers.19.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.27e-04, w:-1.09e-02 grad:-3.25e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.58it/s]


20 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.20.attention.wq.weight', 'layers.21.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.45e-04, w:8.33e-03 grad:-4.21e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.73it/s]


22 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.22.attention.wq.weight', 'layers.23.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.32e-04, w:9.93e-03 grad:-2.29e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.80it/s]


24 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.24.attention.wq.weight', 'layers.25.attention.wq.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.41e-04, w:-9.19e-06 grad:2.07e-12: 100%|██████████| 1000/1000 [00:15<00:00, 63.53it/s]


26 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.26.attention.wq.weight', 'layers.27.attention.wq.weight']
----------------------------------------


Training - Epoch 276, Loss: 1.31e-04, w:2.33e-04 grad:-5.93e-12:  28%|██▊       | 276/1000 [00:04<00:11, 64.79it/s] 


KeyboardInterrupt: 

In [None]:
ckpt_new.keys()

In [None]:
torch.save(ckpt_new, "../checkpoints/effiLLaMA2/consolidated.00.pth")

In [None]:
print(ckpt_new["layers.30.attention.wq.weight"]==ckpt_new["layers.31.attention.wq.weight"])
print(ckpt_new["layers.10.attention.wk.weight"]==ckpt_new["layers.11.attention.wk.weight"])
print(ckpt_new["layers.2.attention.wv.weight"]==ckpt_new["layers.3.attention.wv.weight"])

In [None]:
ckpt_new["layers.31.attention.wq.lora_b.weight"].shape, ckpt_new["layers.31.attention.wq.lora_a.weight"].shape, 

In [None]:
ckpt_new["layers.3.attention.wk.weight"]==ckpt_new["layers.2.attention.wk.weight"]

In [None]:
model.llma.layers[31].attention.wq.weight
model.llma.layers[2].attention.wk.weight

In [28]:
ckpt_new = deepcopy(ckpt)
for ending_name in ["feed_forward.w1.weight"]:
    wo_list = []
    w_name_list = []
    for key,val in ckpt.items():
        if key.endswith(ending_name):
            wo_list += [val.to("cuda", torch.float32)]
            w_name_list += [key]

    rank = 512
    range_anchor = 2
    print(f"(In a group) full params:{4096*4096*range_anchor}, retained params:{4096*rank*2*range_anchor+4096}, reduced:{(4096*rank*2*range_anchor+4096)/(4096*4096*range_anchor):.3f}")

    for group_idx in range(0,32,range_anchor):
        w_base = torch.nn.Parameter(torch.empty_like(wo_list[group_idx])).cuda()
        torch.nn.init.xavier_normal_(w_base)
        w_lora_list = []
        for ii in range(range_anchor):
            lora1 = torch.nn.Parameter(torch.empty_like(wo_list[0][:,:rank]),requires_grad=True)
            lora2 = torch.nn.Parameter(torch.empty_like(wo_list[0][:rank,:]),requires_grad=True)
            torch.nn.init.xavier_normal_(lora1)
            torch.nn.init.xavier_normal_(lora2)
            w_lora_list +=[(lora1.cuda(), lora2.cuda())]

        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.SGD([w_lora_list[ii][0] for ii in range(range_anchor)]+[w_lora_list[ii][1] for ii in range(range_anchor)]+[w_base], lr=500000)

        pbar = tqdm([ll for ll in range(1000)], desc=f'Training group_idx={group_idx}', leave=True)
        for epoch in pbar:
            optimizer.zero_grad()           # Zero the gradients
            loss = 0
            names = w_name_list[group_idx:group_idx+range_anchor]
            for idx, ww in enumerate(wo_list[group_idx:group_idx+range_anchor]):
                w_approx = w_base + w_lora_list[idx][0] @ w_lora_list[idx][1]
                loss += criterion(w_approx, ww)  # Calculate loss
            loss.backward()
            pbar.set_description(f'Training - Epoch {epoch+1}, Loss: {loss.item():.2e}, w:{w_base[0,0].item():.2e} grad:{w_base.grad[0,0].item():.2e}')
            optimizer.step()
        for layer_idx, name in enumerate(names):
            ckpt_new[name] = w_base.to(ckpt_new[name].dtype)
            ckpt_new[name.replace("weight","lora_a.weight")] = w_lora_list[layer_idx][1]
            ckpt_new[name.replace("weight","lora_b.weight")] = w_lora_list[layer_idx][0]
        print(group_idx,loss, names)
        print("-"*40)

(In a group) full params:33554432, retained params:8392704, reduced:0.250


Training - Epoch 1000, Loss: 1.06e-04, w:2.51e-02 grad:-1.46e-12: 100%|██████████| 1000/1000 [00:19<00:00, 51.93it/s]


0 tensor(0.0001, device='cuda:0', grad_fn=<AddBackward0>) ['layers.0.feed_forward.w1.weight', 'layers.1.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.56e-04, w:4.45e-03 grad:-3.44e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.21it/s]


2 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.2.feed_forward.w1.weight', 'layers.3.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.74e-04, w:-6.25e-03 grad:-2.65e-13: 100%|██████████| 1000/1000 [00:19<00:00, 52.27it/s]


4 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.4.feed_forward.w1.weight', 'layers.5.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.73e-04, w:6.24e-03 grad:2.51e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.31it/s]


6 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.6.feed_forward.w1.weight', 'layers.7.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.63e-04, w:-6.13e-03 grad:-1.57e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.29it/s]


8 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.8.feed_forward.w1.weight', 'layers.9.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.57e-04, w:-3.85e-02 grad:1.04e-11: 100%|██████████| 1000/1000 [00:19<00:00, 52.20it/s]


10 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.10.feed_forward.w1.weight', 'layers.11.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.56e-04, w:1.68e-02 grad:-8.35e-15: 100%|██████████| 1000/1000 [00:19<00:00, 52.26it/s]


12 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.12.feed_forward.w1.weight', 'layers.13.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.58e-04, w:-1.96e-03 grad:7.29e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.22it/s]


14 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.14.feed_forward.w1.weight', 'layers.15.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.64e-04, w:-1.70e-02 grad:-5.14e-13: 100%|██████████| 1000/1000 [00:19<00:00, 52.39it/s]


16 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.16.feed_forward.w1.weight', 'layers.17.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.74e-04, w:-3.51e-03 grad:4.62e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.32it/s]


18 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.18.feed_forward.w1.weight', 'layers.19.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.78e-04, w:1.50e-03 grad:1.60e-11: 100%|██████████| 1000/1000 [00:19<00:00, 52.27it/s]


20 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.20.feed_forward.w1.weight', 'layers.21.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.83e-04, w:-4.94e-03 grad:-4.23e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.12it/s]


22 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.22.feed_forward.w1.weight', 'layers.23.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.88e-04, w:-1.99e-02 grad:6.67e-13: 100%|██████████| 1000/1000 [00:19<00:00, 52.14it/s]


24 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.24.feed_forward.w1.weight', 'layers.25.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.94e-04, w:2.58e-02 grad:-1.84e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.27it/s]


26 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.26.feed_forward.w1.weight', 'layers.27.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 1.95e-04, w:-9.67e-03 grad:1.06e-11: 100%|██████████| 1000/1000 [00:19<00:00, 52.35it/s]


28 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.28.feed_forward.w1.weight', 'layers.29.feed_forward.w1.weight']
----------------------------------------


Training - Epoch 1000, Loss: 2.07e-04, w:-1.69e-02 grad:4.30e-12: 100%|██████████| 1000/1000 [00:19<00:00, 52.21it/s]

30 tensor(0.0002, device='cuda:0', grad_fn=<AddBackward0>) ['layers.30.feed_forward.w1.weight', 'layers.31.feed_forward.w1.weight']
----------------------------------------



