In [1]:
# PPO + GPT2, 中文情感分析

import time
import random

import torch
from rich import print
from tqdm import tqdm
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from trl.gpt2 import GPT2HeadWithValueModel
from trl.ppo import PPOTrainer

from iTrainingLogger import iSummaryWriter

In [2]:
# !pip install rich

In [3]:
# Metrics logger for this run; the training loop calls add_scalar() and
# record() on it once per PPO epoch.
writer = iSummaryWriter(
    log_path='./logs',
    log_name='PPO-Sentiment-Zh',
)
# Run configuration. The whole dict is forwarded to PPOTrainer via **config,
# so key names must match what the trainer looks up; a misspelled key is not
# reported and the trainer presumably falls back to its own default.
config = {
    'model_name': 'uer/gpt2-chinese-cluecorpussmall',  # checkpoint for policy, reference model and tokenizer
    'steps': 5000,             # total samples; number of PPO epochs = ceil(steps / batch_size)
    'batch_size': 32,          # prompts sampled (and responses generated) per PPO epoch
    'forward_batch_size': 16,
    # NOTE(review): legacy TRL's PPOTrainer reads 'ppo_epochs' (plural) — confirm
    # against the vendored trl/ppo.py and rename this key if so.
    'ppo_epoch': 4,
    'lr': 1.41e-5,
    'init_kl_coef': 0.2,       # was misspelled 'init_kl_ceof'
    'target': 6,
    'horizon': 10000,
    'gamma': 1,
    'lam': 0.95,
    'cliprange': .2,
    'cliprange_value': .2,
    'vf_coef': .1,
    'gen_len': 16              # max_new_tokens passed to generate() for each response
}

In [4]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
# `device` is used for the GPT-2 models and tensors; `pipe_device` is the
# integer form passed to the transformers pipeline (0 = first GPU, -1 = CPU).
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
pipe_device = -1 if not torch.cuda.is_available() else 0

In [5]:
# Seed prompts; the training loop draws one at random for every sample in a
# batch and lets the policy continue it.
prompts = [
    "刚收到货，感觉",
    "这部电影",
    "说实话，真的很",
    "这次购物总的来说体验很",
]

In [6]:
# Sentiment classification model. Its pipeline output (label + score) is
# turned into the PPO reward in the training loop.
senti_model = AutoModelForSequenceClassification.from_pretrained(
    'uer/roberta-base-finetuned-jd-binary-chinese'
)
senti_tokenizer = AutoTokenizer.from_pretrained(
    'uer/roberta-base-finetuned-jd-binary-chinese'
)
sentiment_pipe = pipeline(
    'sentiment-analysis',
    model=senti_model,
    tokenizer=senti_tokenizer,
    device=pipe_device,
)

In [7]:
# Text generation models.
# The tokenizer reuses its pad token as the EOS token.
gpt2_tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
gpt2_tokenizer.eos_token = gpt2_tokenizer.pad_token

# Two copies of the same checkpoint: `gpt2_model` is handed to PPOTrainer as
# the model to optimise and `gpt2_model_ref` as the reference model.
gpt2_model = GPT2HeadWithValueModel.from_pretrained(config['model_name'])
gpt2_model_ref = GPT2HeadWithValueModel.from_pretrained(config['model_name'])

gpt2_model.to(device)
gpt2_model_ref.to(device)

Some weights of GPT2HeadWithValueModel were not initialized from the model checkpoint at uer/gpt2-chinese-cluecorpussmall and are newly initialized: ['v_head.summary.weight', 'v_head.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2HeadWithValueModel were not initialized from the model checkpoint at uer/gpt2-chinese-cluecorpussmall and are newly initialized: ['v_head.summary.weight', 'v_head.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2HeadWithValueModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropou

In [8]:
# Keyword arguments forwarded to gpt2_model.generate() for every response.
# Sampling is enabled with top_k=0 and top_p=1.0, and padding uses the
# tokenizer's EOS token id.
gen_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=gpt2_tokenizer.eos_token_id,
)

In [9]:
# RL trainer
# Number of PPO updates needed to consume `steps` samples at `batch_size`
# samples per update: ceil(steps / batch_size), via integer ceiling division.
total_ppo_epochs = int(-(-config['steps'] // config['batch_size']))
ppo_trainer = PPOTrainer(gpt2_model, gpt2_model_ref, gpt2_tokenizer, **config)

def _sentiment_rewards(pipe_outputs):
    """Map sentiment-pipeline outputs to positive-sentiment scores.

    Args:
        pipe_outputs: iterable of dicts with a 'label' and a 'score' entry,
            as returned by `sentiment_pipe`.

    Returns:
        list of floats: `score` when the label is the positive class,
        `1 - score` when it is the negative class.

    Raises:
        ValueError: if a label is neither of the two expected classes.
    """
    rewards = []
    for output in pipe_outputs:
        if output['label'] == 'positive (stars 4 and 5)':
            rewards.append(output['score'])
        elif output['label'] == 'negative (stars 1, 2 and 3)':
            rewards.append(1 - output['score'])
        else:
            raise ValueError(f"错误的推理结果{output['label']}.")
    return rewards


# Main PPO loop: sample prompts -> generate continuations -> score them with
# the sentiment model -> run one PPO optimisation step -> log metrics.
for epoch in tqdm(range(total_ppo_epochs)):
    logs, timing = dict(), dict()
    t0 = time.time()

    # Sample a batch of prompts and tokenize them.
    batch = {
        'tokens': [],
        'query': []
    }
    for _ in range(config['batch_size']):
        random_prompt = random.choice(prompts)
        batch['tokens'].append(gpt2_tokenizer.encode(random_prompt))
        batch['query'].append(random_prompt)
    query_tensors = [torch.tensor(ids).long().to(device) for ids in batch['tokens']]

    # Generate one continuation per prompt.
    t = time.time()
    gen_len = config['gen_len']
    response_tensors = []
    for query in query_tensors:
        response = gpt2_model.generate(query.unsqueeze(dim = 0),
                                       max_new_tokens = gen_len, **gen_kwargs)
        # generate() returns prompt + continuation. Slice from the prompt length
        # instead of taking the last `gen_len` tokens, so prompt tokens never
        # leak into the response if generation stops before `gen_len` tokens
        # (e.g. on an end-of-sequence token).
        response_tensors.append(response[0, query.shape[0]:])
    batch['response'] = [gpt2_tokenizer.decode(r) for r in response_tensors]
    timing['time/get_response'] = time.time() - t

    # Compute sentiment scores; the positive-sentiment score of prompt +
    # continuation is used as the reward for the generation.
    t = time.time()
    texts = [q + r for q, r in zip(batch['query'], batch['response'])]
    pipe_outputs = sentiment_pipe(texts)
    rewards = torch.tensor(_sentiment_rewards(pipe_outputs)).to(device)
    timing['time/get_sentiment_preds'] = time.time() - t

    # PPO update
    t = time.time()
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    timing['time/optimization'] = time.time() - t
    timing['time/epoch'] = time.time() - t0

    # logging
    logs.update(timing)
    logs.update(stats)
    logs['env/reward_mean'] = torch.mean(rewards).cpu().numpy()
    logs['env/reward_std'] = torch.std(rewards).cpu().numpy()
    logs['env/reward_dist'] = rewards.cpu().numpy()
    print(f"epoch {epoch} mean-reward: {logs['env/reward_mean']}")

    # Show up to 5 distinct generated texts (sampled without replacement so the
    # same text is not printed twice).
    print('Random Sample 5 text(s) of model output:')
    for i, text in enumerate(random.sample(texts, k = min(5, len(texts))), start = 1):
        print(f'{i}. {text}')

    writer.add_scalar('train/reward', logs['env/reward_mean'], epoch)
    for k, v in timing.items():
        writer.add_scalar(k, v, epoch)
    writer.add_scalar('ppo/loss/policy', stats['ppo/loss/policy'], epoch)
    writer.add_scalar('ppo/loss/value', stats['ppo/loss/value'], epoch)
    writer.add_scalar('ppo/policy/entropy', stats['ppo/policy/entropy'], epoch)
    writer.add_scalar('ppo/policy/policykl', stats['ppo/policy/policykl'], epoch)

    writer.record()
    

  0%|                                                                            | 0/157 [00:00<?, ?it/s]

  0%|                                                                            | 0/157 [04:13<?, ?it/s]


TypeError: unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'