In [1]:
import torch
import random
import os

# Point the HF_ENDPOINT environment variable at the hf-mirror.com mirror.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Run on the GPU whenever torch reports CUDA as available, otherwise on CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Number of dataset rows sampled for each batch.
b = 4

from transformers import AutoTokenizer

# Load the tokenizer published under the 'gpt2' name.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Use token id 0 as the padding id; the repr printed below reports it as
# pad_token '!'.
tokenizer.pad_token_id = 0

# Display the tokenizer configuration.
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '!'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [2]:
from datasets import load_dataset

# Load the 'train' split of the b-mc2/sql-create-context dataset.
dataset = load_dataset('b-mc2/sql-create-context', split='train')


def f(data):
    """Turn one dataset row into token ids for the prompt and for the answer.

    The prompt is the text 'context: <context> question: <question> answer: '
    built from the row's 'context' and 'question' fields. Both the prompt and
    the row's 'answer' are encoded with the module-level ``tokenizer`` without
    adding special tokens.

    Args:
        data: mapping with string fields 'context', 'question' and 'answer'.

    Returns:
        dict with 'question' (prompt token ids) and 'answer' (answer token ids).
    """
    prompt = ''.join([
        'context: ',
        data['context'],
        ' question: ',
        data['question'],
        ' answer: ',
    ])

    def encode(text):
        # No BOS/EOS here; EOS is appended later when batches are built.
        return tokenizer.encode(text, add_special_tokens=False)

    return {'question': encode(prompt), 'answer': encode(data['answer'])}


# Tokenise every row with f. The 'context' column is dropped, and 'question'
# and 'answer' are replaced by the token-id lists that f returns.
dataset = dataset.map(f, remove_columns=['context'])

# NOTE: f is rebound here from the tokenising function to a length predicate.
# Keep only rows whose prompt and answer together are shorter than 500 tokens.
f = lambda data: len(data['question']) + len(data['answer']) < 500
dataset = dataset.filter(f)

# Show the dataset summary and the first processed row.
dataset, dataset[0]

Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

Filter:   0%|          | 0/78577 [00:00<?, ? examples/s]

(Dataset({
     features: ['question', 'answer'],
     num_rows: 78577
 }),
 {'question': [22866,
   25,
   29244,
   6158,
   43679,
   1182,
   357,
   496,
   17828,
   7156,
   1137,
   8,
   1808,
   25,
   1374,
   867,
   6665,
   286,
   262,
   13346,
   389,
   4697,
   621,
   7265,
   5633,
   3280,
   25,
   220],
  'answer': [46506,
   327,
   28270,
   7,
   28104,
   16034,
   1182,
   33411,
   2479,
   1875,
   7265]})

In [3]:
def get_data():
    """Sample ``b`` rows and build one padded chosen + rejected batch.

    Rows ``0 .. b-1`` hold the chosen sequences (prompt + answer + EOS) and
    rows ``b .. 2b-1`` hold the rejected sequences, which are the bare
    prompts. Everything is padded with ``tokenizer.pad`` and moved to
    ``device``.

    Returns:
        The padded batch with 'input_ids', 'attention_mask' and an extra
        'answer_mask': a copy of the attention mask in which the prompt
        positions of every row are zeroed.
    """
    samples = random.choices(dataset, k=b)
    prompts = [row['question'] for row in samples]
    replies = [row['answer'] for row in samples]

    # Chosen sequence: prompt, then the reference answer, then EOS.
    chosen = []
    for prompt, reply in zip(prompts, replies):
        chosen.append(prompt + reply + [tokenizer.eos_token_id])

    # Rejected is deliberately defined as empty: just the prompt, no answer.
    rejected = list(prompts)

    features = [{'input_ids': ids} for ids in chosen + rejected]
    batch = tokenizer.pad(features, return_tensors='pt').to(device)

    # Start from the attention mask, then blank out the prompt part of each
    # chosen row and of its rejected counterpart b rows further down.
    answer_mask = batch['attention_mask'].clone()
    for row, prompt in enumerate(prompts):
        prompt_len = len(prompt)
        answer_mask[row, :prompt_len] = 0
        answer_mask[row + b, :prompt_len] = 0
    batch['answer_mask'] = answer_mask

    return batch


# Build one batch and display it as a quick sanity check.
get_data()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[22866,    25, 29244,  6158, 43679,  3084,    62,  3672,    62,  1270,
           357,   268, 48108, 17828,  7156,  1137,    11,  1524,   569, 31315,
          1503,     8,  1808,    25,  1867,   318,   262, 20753,   329,   262,
          1524, 39874,    30,  3280,    25,   220, 46506, 35683,     7,   268,
         48108,     8, 16034,  3084,    62,  3672,    62,  1270, 33411,  1524,
           796,   366,  8929,  1044,     1, 50256,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [22866,    25, 29244,  6158, 43679,  3084,    62,  2075,  3720,  3312,
            16,    62,    21,   357, 27729,   834, 17618, 17828,  7156,  1137,
            11,  2137,   569, 31315,  1503,     8,  1808,    25,  1867,   318,
           262,  4511,  2298,  1271,   329,  2137,   836,  2318,   527,    30,
          3280,    25,   220, 46506, 25882,     7, 27729,   834, 17618,     8,
         16034

In [4]:
from transformers import AutoModelForCausalLM

# Policy model; its parameters are the ones handed to the optimizer below.
model_actor = AutoModelForCausalLM.from_pretrained('model/actor').to(device)
# Second copy loaded from the same 'model/actor' checkpoint. The training loop
# only evaluates it under torch.no_grad(), as the reference model.
model_actor_ref = AutoModelForCausalLM.from_pretrained('model/actor').to(
    device)

In [5]:
def get_prob_diff(actor, input_ids, attention_mask, answer_mask):
    """Return log p(chosen answer) - log p(rejected answer) for each pair.

    The batch must hold the chosen sequences in its first half and the
    matching rejected sequences, in the same order, in its second half.

    Args:
        actor: model called as ``actor(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits`` of shape (batch, seq, vocab).
        input_ids: (batch, seq) token ids.
        attention_mask: (batch, seq) mask forwarded unchanged to ``actor``.
        answer_mask: (batch, seq) mask that is non-zero only on the answer
            tokens whose log-probabilities should be summed.

    Returns:
        Tensor of shape (batch // 2,). Larger values mean the chosen answer
        is more likely than the rejected one.
    """
    logits = actor(input_ids=input_ids, attention_mask=attention_mask).logits

    # Shift by one so the logits at position t are scored against token t + 1.
    targets = input_ids[:, 1:]
    answer_mask = answer_mask[:, 1:]
    logits = logits[:, :-1]

    # Joint probability is a product, so work in log space. log_softmax is
    # used instead of log(softmax + eps): the eps variant floors every token
    # at log(1e-8) and removes the gradient for very unlikely tokens.
    log_prob = torch.log_softmax(logits, dim=2)

    # Pick the log-probability assigned to each actual next token.
    token_log_prob = log_prob.gather(2, index=targets.unsqueeze(2)).squeeze(2)

    # Sum over the answer tokens only to get each sequence's joint log-prob.
    seq_log_prob = (token_log_prob * answer_mask).sum(1)

    # First half = chosen, second half = rejected. Deriving the split from
    # the batch keeps the function independent of any global batch size.
    half = seq_log_prob.shape[0] // 2
    return seq_log_prob[:half] - seq_log_prob[half:]


# Evaluate the policy model on one freshly sampled batch as a sanity check.
get_prob_diff(model_actor, **get_data())

tensor([-5.9515, -9.2023, -6.3256, -6.8593], device='cuda:0',
       grad_fn=<SubBackward0>)

In [6]:
# Only the policy model is optimised; model_actor_ref is never updated.
optimizer = torch.optim.Adam(model_actor.parameters(), lr=1e-5)

for i in range(8000):
    data = get_data()

    # Chosen-minus-rejected log-probabilities under both models.
    prob_diff = get_prob_diff(model_actor, **data)
    with torch.no_grad():
        prob_diff_ref = get_prob_diff(model_actor_ref, **data)

    # The reference term carries no gradient, so it acts as a constant offset;
    # when reading the objective it can be ignored. 0.1 scales the margin.
    loss = 0.1 * (prob_diff - prob_diff_ref)

    # logsigmoid computes log(sigmoid(x)) stably. The previous
    # log(sigmoid(x) + 1e-8) saturated for very negative margins, capping the
    # loss and removing the gradient exactly when the model was most wrong.
    loss = torch.nn.functional.logsigmoid(loss).mean()

    # Negate: the value above should be maximised, while a loss is minimised.
    loss = -loss

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 200 == 0:
        print(i, loss.item())

        # Sample one row and let the current policy answer its prompt.
        data = random.choice(dataset)
        question = torch.LongTensor(data['question']).unsqueeze(0).to(device)

        # Pure sampling: top_k=0 and top_p=1.0 leave the distribution
        # unfiltered. top_k is an integer option, hence 0 rather than 0.0.
        gen = model_actor.generate(input_ids=question,
                                   min_length=-1,
                                   max_length=question.shape[1] + 50,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id,
                                   top_k=0,
                                   top_p=1.0,
                                   do_sample=True)
        # Drop the prompt so only the newly generated tokens are printed.
        gen = gen[0, question.shape[1]:]
        print(tokenizer.decode(data['answer']))
        print(tokenizer.decode(gen))

model_actor.save_pretrained('model/dpo')

0 0.6931471824645996
SELECT gpu‡ FROM table_name_31 WHERE application = "cushaw"
SELECT gpu‡ FROM table_name_31 WHERE application = "cushawn"<|endoftext|>
200 0.4377593994140625
SELECT field FROM table_name_63 WHERE date = "july 12"
SELECT field FROM table_name_63 WHERE date = "july 12"<|endoftext|>
400 0.5559206008911133
SELECT production_code__order_they_were_made___number FROM table_2342078_2 WHERE episode__number = 13
SELECT COUNT(production_code__order_they_were_made___number) FROM table_2342078_2 WHERE episode__number = 13<|endoftext|>
600 0.45148441195487976
SELECT home_team FROM table_name_70 WHERE away_team = "melbourne"
SELECT home_team FROM table_name_70 WHERE away_team = "melbourne"<|endoftext|>
800 0.4183880388736725
SELECT AVG(game) FROM table_name_77 WHERE record = "30-22"
SELECT SUM(game) FROM table_name_77 WHERE record = "30-22"<|endoftext|>
1000 0.647190272808075
SELECT type FROM table_name_87 WHERE location = "brazil" AND entered_service = "1976/1997"
SELECT type FRO