In [1]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Execute the tokenizer notebook inside this kernel; it is expected to leave a
# `tokenizer` object in the namespace, which the rest of this notebook uses.
%run 1.tokenizer.ipynb

# Display the tokenizer to confirm it is in scope.
tokenizer

<__main__.Tokenizer at 0x7f5920c74280>

In [2]:
# Execute the dataset notebook inside this kernel; `get_loader`, used below,
# is presumably defined there -- TODO confirm.
%run 2.dataset.ipynb


def f(data):
    """Turn a list of raw samples into a batch of prompt token ids.

    Each sample's 'text' field is tokenized with the notebook-level
    `tokenizer`: a BOS token is added, no EOS token is added, sequences are
    truncated to at most 32 tokens and padded on the left. `device` is
    forwarded to the tokenizer and only the 'input_ids' entry is returned.
    """
    texts = [sample['text'] for sample in data]

    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=32,
        device=device,
        add_bos_token=True,
        add_eos_token=False,
        padding_side='left',
    )

    return encoded['input_ids']


# Build the prompt loader; `f` is handed over as the batch-processing callback
# (presumably used as the collate function -- confirm in 2.dataset.ipynb).
loader = get_loader(f, negative_label=False, with_answer=False)

# Show the loader length together with one sample batch.
len(loader), next(iter(loader))

(125000,
 tensor([[ 0, 11,  9,  7, 10, 13,  9,  8,  7,  5, 17],
         [ 0, 10,  3, 11,  5, 15,  9,  6,  4,  6, 17],
         [ 2,  2,  0, 11, 12, 14,  4,  7, 12, 10, 17],
         [ 0, 10,  3,  7,  7, 14, 10,  9, 11,  5, 17],
         [ 0,  6, 11, 12,  5, 16,  5,  4, 12, 11, 17],
         [ 0,  5,  7, 12,  9, 13,  8, 11,  8,  5, 17],
         [ 0, 12,  7,  6,  5, 13, 11,  7,  5,  6, 17],
         [ 2,  0, 10,  5,  8, 11, 16,  5,  7, 12, 17],
         [ 2,  0, 10,  3,  6, 14,  9,  3,  9,  6, 17],
         [ 0,  7,  3,  8,  7, 15,  6,  7, 12,  4, 17],
         [ 0,  8,  8,  5,  3, 14,  9,  8,  5,  4, 17],
         [ 0,  7, 12,  5, 10, 14,  8,  6,  6,  5, 17],
         [ 0,  4, 11,  4,  4, 16,  5,  7, 10, 12, 17],
         [ 0,  6,  4, 10,  6, 15, 10,  9,  7,  9, 17],
         [ 0,  8,  7,  7, 11, 13,  7, 10,  3, 12, 17],
         [ 0,  5, 11, 10, 10, 16,  7,  9, 12,  8, 17]], device='cuda:0'))

In [3]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification
from trl.trainer.utils import disable_dropout_in_model

# Policy that is trained in this notebook, loaded from 'model/ppo', and the
# reference policy loaded from 'model/actor' (used in get_data for the
# log-probability difference penalty).
model_actor = AutoModelForCausalLM.from_pretrained('model/ppo').to(device)
model_actor_ref = AutoModelForCausalLM.from_pretrained('model/actor').to(
    device)

# Two copies of the same 'model/critic' checkpoint with a single output label:
# `model_critic` is optimized below, `model_critic_ref` is only read in
# get_data to produce the reward.
model_critic = AutoModelForSequenceClassification.from_pretrained(
    'model/critic', num_labels=1).to(device)
model_critic_ref = AutoModelForSequenceClassification.from_pretrained(
    'model/critic', num_labels=1).to(device)

# Clear the default eos/pad ids on the generation config; every generate()
# call in this notebook passes them explicitly.
model_actor.generation_config.eos_token_id = None
model_actor.generation_config.pad_token_id = None

# Disable dropout in all four models.
for i in [model_actor, model_actor_ref, model_critic, model_critic_ref]:
    disable_dropout_in_model(i)

# A single optimizer updates the actor and the critic together.
optimizer = torch.optim.AdamW(list(model_actor.parameters()) +
                              list(model_critic.parameters()),
                              lr=5e-6)

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


In [4]:
def get_value(critic, question, answer, shift=True):
    """Score every position of `question + answer` with the critic.

    Pad tokens (tokenizer.pad_token_id) are masked out of attention, replaced
    by token id 0, and skipped when numbering positions.

    Args:
        critic: model exposing a `gpt_neox` backbone and a `score` head.
        question: prompt token ids, [b, lens_q].
        answer: generated token ids, [b, lens_a].
        shift: when True, keep only the scores that line up with the answer
            tokens (starting at the last prompt position, dropping the final
            position) and squeeze the trailing dimension, giving [b, lens_a].
            When False, return the score head output for every position
            without squeezing.

    Returns:
        The value tensor described above.
    """
    sequence = torch.cat((question, answer), dim=1)
    is_token = sequence != tokenizer.pad_token_id

    # Positions count real tokens only, so left padding does not shift them.
    positions = is_token.cumsum(1) - is_token.long()
    sequence = sequence.masked_fill(~is_token, 0)

    # [b, lens, hidden]
    hidden = critic.gpt_neox(input_ids=sequence,
                             attention_mask=is_token,
                             position_ids=positions).last_hidden_state

    # Score head output; the trailing dimension is squeezed only when shifting.
    scores = critic.score(hidden)

    if not shift:
        return scores

    prompt_len = question.shape[1]
    return scores[:, prompt_len - 1:-1].squeeze(-1)


# Smoke test on random token ids: a length-5 prompt and a length-15 answer
# yield one value per answer position.
get_value(model_critic,
          torch.randint(0, 10, [2, 5]).to(device),
          torch.randint(0, 10, [2, 15]).to(device)).shape

torch.Size([2, 15])

In [5]:
def get_logprob(actor, question, answer, temperature=0.7):
    """Log-probability the actor assigns to each token of `answer`.

    Pad tokens (tokenizer.pad_token_id) are masked out of attention, replaced
    by token id 0, and skipped when numbering positions.

    Args:
        actor: causal language model whose output exposes `.logits`.
        question: prompt token ids, [b, lens_q].
        answer: generated token ids, [b, lens_a].
        temperature: divisor applied to the logits before the softmax.
            Defaults to 0.7, the value that was previously hard-coded.
            NOTE(review): this should match the temperature used when the
            answer was sampled -- confirm against the generation settings.

    Returns:
        Tensor [b, lens_a] with the log-probability of every answer token.
    """
    input_ids = torch.cat((question, answer), 1)
    attention_mask = input_ids != tokenizer.pad_token_id
    # Positions count real tokens only, so left padding does not shift them.
    position_ids = attention_mask.cumsum(1) - attention_mask.long()
    input_ids = torch.masked_fill(input_ids, ~attention_mask, 0)

    logits = actor(input_ids=input_ids,
                   attention_mask=attention_mask,
                   position_ids=position_ids).logits

    # The logit at position t predicts token t + 1, so the answer tokens are
    # predicted by the slice starting at the last prompt position. Divide out
    # of place so the model's own output tensor is not modified through a view.
    logits = logits[:, question.shape[1] - 1:-1] / temperature

    logprob = logits.log_softmax(dim=-1)
    return logprob.gather(2, answer.unsqueeze(-1)).squeeze(-1)


# Smoke test on random token ids: a length-5 prompt and a length-15 answer
# yield one log-probability per answer token.
get_logprob(model_actor,
            torch.randint(0, 10, [2, 5]).to(device),
            torch.randint(0, 10, [2, 15]).to(device)).shape

torch.Size([2, 15])

In [6]:
def get_advantage(value, reward_kl, gamma=1.0, lam=0.95):
    """Generalized advantage estimation over the time axis (dim 1).

    For every step t, walking backwards from the last one:
        delta_t = reward_kl[:, t] + gamma * value[:, t + 1] - value[:, t]
        adv_t   = delta_t + gamma * lam * adv_{t + 1}
    where the value after the final step is taken to be 0.

    Args:
        value: per-step value estimates, [b, steps].
        reward_kl: per-step rewards, [b, steps].
        gamma: discount factor. Defaults to 1.0 (the previous implicit value).
        lam: GAE lambda. Defaults to 0.95 (the previous hard-coded value).

    Returns:
        Tensor [b, steps] of advantages. A zero-length time axis yields an
        empty tensor with the shape of `value` instead of raising.
    """
    steps = value.shape[1]
    if steps == 0:
        # torch.stack cannot stack an empty list.
        return value.new_zeros(value.shape)

    advantage = []
    last = 0
    for i in reversed(range(steps)):
        # No bootstrap value exists beyond the final step.
        value_next = value[:, i + 1] if i < steps - 1 else 0.0

        delta = reward_kl[:, i] + gamma * value_next - value[:, i]
        last = delta + gamma * lam * last

        advantage.append(last)

    # Collected back-to-front; reverse to restore chronological order.
    return torch.stack(advantage[::-1], dim=1)


# Smoke test: the advantage tensor has the same shape as the inputs.
get_advantage(torch.randn(4, 25), torch.randn(4, 25)).shape

torch.Size([4, 25])

In [7]:
from trl.trainer.utils import first_true_indices


@torch.no_grad()
def get_data(question):
    """Sample answers for a batch of prompts and build the PPO training targets.

    Uses the notebook-level models: `model_actor` generates the answers and
    supplies the old log-probabilities, `model_actor_ref` supplies the
    reference log-probabilities, `model_critic` supplies the old values and
    `model_critic_ref` supplies the scalar reward.

    Args:
        question: left-padded prompt token ids, [b, lens_q].

    Returns:
        Tuple (question, answer, ends, prob_old, value_old, advantage,
        returns) where `ends` is a Python list holding, per row, the index of
        the first pad token in `answer`, and the tensors are [b, lens_a].
    """
    #====answer====
    answer = model_actor.generate(
        input_ids=question,
        attention_mask=(question != tokenizer.pad_token_id).long(),
        min_length=-1,
        max_length=50,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        top_k=0.0,
        top_p=1.0,
        do_sample=True)

    # Keep only the generated continuation.
    answer = answer[:, question.shape[1]:]

    # Index of the first pad token in every answer. The reward placement
    # below handles the case where this equals the row length.
    ends = first_true_indices(answer == tokenizer.pad_token_id).tolist()

    #====prob,value====
    prob_old = get_logprob(model_actor, question, answer)
    prob_ref = get_logprob(model_actor_ref, question, answer)
    value_old = get_value(model_critic, question, answer)
    # The reward may be read from the very last token, so the reference values
    # must not be shifted: shifting would cut that last position off.
    value_ref = get_value(model_critic_ref, question, answer, shift=False)

    # From `end` on, overwrite log-probabilities with the placeholder 1.0 (a
    # real log-probability is never positive) and zero the values after `end`.
    for i, end in enumerate(ends):
        prob_old[i, end:] = 1.0
        prob_ref[i, end:] = 1.0
        value_old[i, end + 1:] = 0.0

    #====reward====
    # The reward of a sample is the reference critic's value at the last
    # non-pad token of the answer. Stacking keeps the computation on the
    # tensors' device instead of round-tripping through Python scalars.
    prompt_len = question.shape[1]
    reward = torch.stack([
        value_ref[i, end + prompt_len - 1] for i, end in enumerate(ends)
    ]).reshape(-1).float().to(device)

    #====advantage====
    # Per-token penalty on the log-probability gap to the reference policy.
    reward_kl = -0.05 * (prob_old - prob_ref)

    # Add the scalar reward at index `end`; when no pad token was found
    # (`end` equals the answer length) use the last position instead.
    for i, end in enumerate(ends):
        if end == len(answer[i]):
            end = -1

        reward_kl[i, end] += reward[i]

    advantage = get_advantage(value_old, reward_kl)
    returns = advantage + value_old

    # Standardize the advantages using only the positions before `end`, for
    # numerical stability.
    select = torch.cat([adv[:end] for adv, end in zip(advantage, ends)])
    advantage = (advantage - select.mean()) / (select.var() + 1e-8)**0.5

    # Zero the advantages from `end` on.
    for i, end in enumerate(ends):
        advantage[i, end:] = 0

    return question, answer, ends, prob_old, value_old, advantage, returns


# Smoke test: build the PPO training tuple for one batch of prompts.
get_data(next(iter(loader)))

(tensor([[ 0,  5,  3,  4,  7, 15,  7, 11,  9,  7, 17],
         [ 0,  9, 11, 12,  7, 13,  7,  4, 12,  5, 17],
         [ 0, 12, 11,  6, 10, 16, 12,  7,  5,  3, 17],
         [ 0,  6,  6,  9,  7, 13, 12, 11,  6,  3, 17],
         [ 0,  8,  8,  4, 10, 15,  9,  8,  7,  8, 17],
         [ 0,  9,  4, 11,  5, 15, 11,  7, 10,  6, 17],
         [ 0,  4,  6,  8,  3, 14,  6,  7,  3,  8, 17],
         [ 0,  7,  6,  5,  6, 15,  6,  7,  3,  8, 17],
         [ 0,  6, 10,  6,  9, 13,  7, 10, 11,  3, 17],
         [ 2,  0,  7,  8,  8,  7, 15,  8, 10,  6, 17],
         [ 2,  0,  6,  6,  3,  4, 14,  8, 10,  3, 17],
         [ 0,  6,  8, 11,  6, 16, 10,  7,  4,  5, 17],
         [ 2,  0,  9, 11,  8, 12, 14,  5, 11, 10, 17],
         [ 0, 12, 11,  4,  7, 15,  6, 12, 10, 12, 17],
         [ 0,  4,  9,  8,  3, 14, 10,  3,  5, 12, 17],
         [ 0,  5, 10, 12,  4, 16, 10, 11,  4,  5, 17]], device='cuda:0'),
 tensor([[12, 12,  9,  4, 11,  6,  9,  1,  2],
         [ 4,  3,  3, 11,  9,  1,  2,  2,  2],
       

In [8]:
def train(question, answer, ends, prob_old, value_old, advantage, returns):
    """Run four PPO update steps on one rollout batch.

    Every step recomputes log-probabilities and values with the current
    `model_actor` / `model_critic`, forms the clipped value loss (value clip
    +-0.2) and the clipped policy loss (ratio clip [0.8, 1.2]), and applies
    one step of the notebook-level `optimizer` on
    `policy_loss + 0.05 * value_loss`.

    Args:
        question, answer, ends, prob_old, value_old, advantage, returns:
            the tuple produced by `get_data`.
    """
    for _epoch in range(4):
        # Re-evaluate the batch with the current weights.
        logprob = get_logprob(model_actor, question, answer)
        value = get_value(model_critic, question, answer)

        # Apply the same post-`end` masking that get_data applied.
        for row, stop in enumerate(ends):
            logprob[row, stop:] = 1.0
            value[row, stop + 1:] = 0

        # Critic: worst case of the unclipped and clipped squared errors.
        value_clipped = torch.clamp(value, value_old - 0.2, value_old + 0.2)
        critic_loss = torch.max((value - returns)**2,
                                (value_clipped - returns)**2)

        # Actor: worst case of the unclipped and clipped surrogate.
        ratio = (logprob - prob_old).exp()
        actor_loss = torch.max(-advantage * ratio,
                               -advantage * torch.clamp(ratio, 0.8, 1.2))

        # Keep positions up to and including `end`, then average.
        # NOTE(review): for the actor loss, index `end` always carries zero
        # advantage, so it only enlarges the denominator -- confirm intended.
        critic_loss = torch.cat(
            [per_row[:stop + 1]
             for per_row, stop in zip(critic_loss, ends)]).mean()
        actor_loss = torch.cat(
            [per_row[:stop + 1]
             for per_row, stop in zip(actor_loss, ends)]).mean()

        total_loss = actor_loss + 0.05 * critic_loss
        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()


# Smoke test: one rollout followed by one training call.
train(*get_data(next(iter(loader))))

In [9]:
# Main PPO loop: one rollout plus one training call per iteration, a sample
# generation printed every 200 iterations and a checkpoint every 2000. The
# iteration count is effectively "until interrupted".
# NOTE(review): `next(iter(loader))` builds a fresh iterator on every step;
# whether that yields a new batch each time depends on how the loader
# shuffles -- confirm in 2.dataset.ipynb.
for i in range(1_0000_0000):
    train(*get_data(next(iter(loader))))

    if i % 200 == 0:
        print(i)
        # Take the first prompt of a batch as the evaluation sample.
        input_ids = next(iter(loader))[0:1]

        # Pass the attention mask explicitly, as get_data does, because the
        # prompts are left-padded.
        gen = model_actor.generate(
            input_ids=input_ids,
            attention_mask=(input_ids != tokenizer.pad_token_id).long(),
            min_length=-1,
            max_length=50,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            top_k=0.0,
            top_p=1.0,
            do_sample=True)

        question = tokenizer.decode(input_ids[0])

        # Recover the arithmetic expression between the BOS marker and '='.
        answer = question
        answer = answer[answer.index(tokenizer.bos_token) +
                        len(tokenizer.bos_token):]
        answer = answer[:answer.index('=')]
        # NOTE(review): eval() executes the decoded text as Python. That is
        # acceptable only while the dataset is generated locally; switch to a
        # restricted arithmetic parser if prompts can come from elsewhere.
        answer = int(eval(answer))
        gen = tokenizer.decode(gen[0, input_ids.shape[1]:])

        print({'question': question, 'answer': answer, 'gen': gen})

    if (i + 1) % 2000 == 0:
        model_actor.save_pretrained('model/ppo_copy_%d' % i)

model_actor.save_pretrained('model/ppo_copy')

0
{'question': 'B3393+1360=', 'answer': 4753, 'gen': '4753E'}
200
{'question': 'B4380-4033=', 'answer': 347, 'gen': '347E'}
400
{'question': 'B7429-1701=', 'answer': 5728, 'gen': '5728E'}
600
{'question': 'B6549/853=', 'answer': 7, 'gen': '7E'}
800
{'question': 'B2196-5373=', 'answer': -3177, 'gen': '-3177E'}
1000
{'question': 'B9824*6189=', 'answer': 60800736, 'gen': '59860376E'}
1200
{'question': 'B2742+3678=', 'answer': 6420, 'gen': '6420E'}
1400
{'question': 'B3055*8870=', 'answer': 27097850, 'gen': '26817850E'}
1600
{'question': 'B6325*932=', 'answer': 5894900, 'gen': '5907200E'}
1800
{'question': 'B3414/5155=', 'answer': 0, 'gen': '0E'}
2000
{'question': 'B3655/6885=', 'answer': 0, 'gen': '0E'}
2200
{'question': 'B9518/9036=', 'answer': 1, 'gen': '1E'}
2400
{'question': 'B92*3833=', 'answer': 352636, 'gen': '355196E'}
2600
{'question': 'B9242*7937=', 'answer': 73353754, 'gen': '74113734E'}
2800
{'question': 'B4557/2072=', 'answer': 2, 'gen': '2E'}
3000
{'question': 'B5571/4104=',

KeyboardInterrupt: 