In [1]:

from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from unsloth import is_bfloat16_supported
import torch
torch.cuda.empty_cache()
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower
import os
os.environ["WANDB_API_KEY"] = "e6f89d31c37067a4256ed983874a83c77e0a8d71"
os.environ["WANDB_PROJECT"] = "GRPO-ds_r1_7b_sentiment_single_day"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/jzyoung/.cache/modelscope/hub/models/unsloth/deepseek-r1-distill-qwen-7b-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

model.print_trainable_parameters() 

INFO 03-08 12:07:14 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.2: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    NVIDIA GeForce RTX 4080. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading /home/jzyoung/.cache/modelscope/hub/models/unsloth/deepseek-r1-distill-qwen-7b-unsloth-bnb-4bit with actual GPU utilization = 64.19%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 15.99 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 4.36 GB. Also swap space = 2 GB.
INFO 03-08 12:08:17 config.py:549] This model supports multiple t



INFO 03-08 12:08:19 loader.py:1089] Loading weights with BitsAndBytes quantization.  May take a while ...


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-08 12:08:25 model_runner.py:1115] Loading model weights took 8.0151 GB
INFO 03-08 12:08:25 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-08 12:08:26 worker.py:267] Memory profiling takes 1.00 seconds
INFO 03-08 12:08:26 worker.py:267] the current vLLM instance can use total_gpu_memory (15.99GiB) x gpu_memory_utilization (0.64) = 10.26GiB
INFO 03-08 12:08:26 worker.py:267] model weights take 8.02GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.06GiB; the rest of the memory reserved for KV Cache is 1.14GiB.
INFO 03-08 12:08:26 executor_base.py:111] # cuda blocks: 1336, # CPU blocks: 2340
INFO 03-08 12:08:26 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 10.44x
INFO 03-08 12:08:27 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error oc

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:10<00:00,  2.48it/s]

INFO 03-08 12:08:38 model_runner.py:1562] Graph capturing finished in 11 secs, took 0.60 GiB
INFO 03-08 12:08:38 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 12.45 seconds



Unsloth 2025.3.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


trainable params: 40,370,176 || all params: 7,655,986,688 || trainable%: 0.5273


In [3]:
from datasets import load_dataset, Dataset

data_path = './dataset/sentiment_single_day.json'
sentiments = {'超级差':0, '很差':1, '差':2, '一般':3, '好':4, '很好':5, '超级好':6}
earning_rate = {'0-35%':0, '35%-70%':1, '70%-100%':2}
step_counter = 0
# data = load_dataset('json', data_files=data_path)
# print(data)

# 计算情绪差值来做奖励
def calc_sentiment_reward(senti1, senti2) -> float:
    if senti1 not in sentiments:
        return 0
    reward = 0
    if senti1 == senti2:
        print(f"@#" * 20 + "GOT SAME!!!" + "#@" * 20)
        reward = 2
    elif senti1 in ('超级差', '很差')and senti2 in ('超级差', '很差'):
        reward = 1 
    elif senti1 in ('很好', '超级好') and senti2 in ('很好', '超级好'):
        reward = 1
    else:
        diffs = abs(sentiments[senti1] - sentiments[senti2]) - 1
        reward = - (diffs // 2)
    
    print(f"sentiment #{senti1}# to #{senti2}# reward: {reward}")
    return reward

def earning_rate_reward(new_answer, original_answer) -> float:
    if new_answer not in earning_rate:
        return 0
    original_val = float(original_answer.split('%')[0])
    original_sec = ''
    if original_val < 35:
        original_sec = '0-35%'
    elif original_val < 70:
        original_sec = '35%-70%'
    else:
        original_sec = '70%-100%'

    reward = 0
    if new_answer == original_sec:
        reward = 1
    if abs(earning_rate[new_answer] - earning_rate[original_sec]) > 1:
        reward = -1

    print(f"earning_rate response answer #{new_answer}# compared to original answer {original_answer} in #{original_sec}#, reward: {reward}")
    return reward


def extract_xml_answer(text: str) -> str:
    # print("extract answer from text:", text)
    answer = text.split("</thinking>")[-1]
    answer = answer.split("<answer>")[-1]
    answer = answer.split("</answer>")[0].strip()

    answer = answer.split("boxed{")[-1]

    
    answer = answer.split('{')[-1]
    answer = answer.split('答案:')[-1]
    answer = answer.split('答案：')[-1]
    answer = answer.split('=')[-1]

    last = min(len(answer), 7)
    answer = answer[-last:]
    if answer.find('超级好') != -1:
        answer = '超级好'
    elif answer.find('超级差') != -1:
        answer = '超级差'
    elif answer.find('很好') != -1:
        answer = '很好'
    elif answer.find('很差') != -1:
        answer = '很差'
    elif answer.find('一般') != -1:
        answer = '一般'
    elif answer.find('好') != -1:
        answer = '好'
    elif answer.find('差') != -1:
        answer = '差'
    else:
        answer = answer.strip("})*#].。”")
    
    # print(f"extracted answer: {answer} \n", )
    return answer

def extract_earning_rate_answer(text: str) -> str:
    # print("extract answer from text:", text)
    answer = text.split("</thinking>")[-1]
    answer = answer.split("<answer>")[-1]
    answer = answer.split("</answer>")[0].strip()
    last = min(len(text), 15)
    answer = answer[-last:]

    answer = answer.split("boxed{")[-1]
    if answer.find('0-35%') != -1 or answer.find('0-35\%') != -1 or answer.find('0%-35%') != -1 or answer.find('0 - 35%') != -1:
      return '0-35%'
    if answer.find('35%-70%') != -1 or answer.find('35\%-70\%') != -1 or answer.find('35% - 70%') != -1:
      return '35%-70%'
    if answer.find('70%-100%') != -1 or answer.find('70\%-100\%') != -1 or answer.find('70% - 100%') != -1:
      return '70%-100%'
    return answer

def extract_labeled_answer(text: str) -> str:
    answer = text.split("（")[0]
    answer = answer.split("(")[0]
    return answer.strip()

def get_formatted_data(data_path) -> Dataset:
    data = load_dataset('json', data_files=data_path)[ "train"]
    # print(data)
    data = data.map(lambda x: { 
        'prompt': [
            {'role': 'user', 'content': x['instruction'] + x['input']}
        ],
        'answer': extract_labeled_answer(x['output'])
    })
    return data


# thinking长度奖励
def think_reward(text: str) -> float:
    if '</think>' not in text:
        return 0
    reward = 0
    think = text.split('<think>')[-1]
    think = think.split('</think>')[0]
    if len(think) > 500:
        reward += 0.15
    return reward

def sentiment_correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    request = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    global step_counter
    step_counter += 1
    print(f"\n{'='*50}\n第 {step_counter} 步训练\n{'='*50}")
    # print(f'len of label answer: {len(answer)}, len of completions: {len(completions)}， len of response: {len(responses)}, len of extracted_responses: {len(extracted_responses)}')
    print('-'*20, f"Question:\n{request}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [calc_sentiment_reward(r, a) for r, a in zip(extracted_responses, answer)]

def earning_rate_correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    request = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    global step_counter
    step_counter += 1
    print(f"\n{'='*50}\n第 {step_counter} 步训练\n{'='*50}")
    # print(f'len of label answer: {len(answer)}, len of completions: {len(completions)}， len of response: {len(responses)}, len of extracted_responses: {len(extracted_responses)}')
    print('-'*20, f"Question:\n{request}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [earning_rate_reward(r, a) for r, a in zip(extracted_responses, answer)]

def length_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    return [think_reward(r) for r in responses]


dataset = get_formatted_data(data_path)



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/693 [00:00<?, ? examples/s]

In [4]:
from trl import GRPOConfig, GRPOTrainer
import wandb

training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-5,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = 512,
    max_completion_length = 2000,
    num_train_epochs = 3, # Set to 1 for a full training run
    # max_steps = 20,
    save_steps = 100,
    # max_grad_norm = 0.1,
    report_to=["wandb"], # Can use Weights & Biases
    output_dir = "outputs",
)

wandb.init(
        config={
            "model_name": "deepseek-ai/deepseek-r1-distill-qwen-7b",
            "learning_rate": training_args.learning_rate,
            "batch_size": training_args.per_device_train_batch_size,
            "num_epochs": training_args.num_train_epochs
        }
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjiangzyang[0m ([33mjiangzyang-marin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
print("dataset length is ", len(dataset[:]))
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        sentiment_correctness_reward_func,

    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

dataset length is  5


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 693 | Num Epochs = 3 | Total steps = 2,079
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 40,370,176/5,383,329,280 (0.75% trained)



第 1 步训练
-------------------- Question:
请你根据一些当天的市场数据，来预测当天的市场情绪。当天市场数据是：大盘量比0.92， 大盘涨幅0.2， 涨停数量44（1）+4， 跌停数量6+3， 赚钱效应70.00%， 炸板率33.80%， 连板数8（2）， 昨日涨停表现0.22。
市场情绪分类范围是：超级差、很差、差、一般、好、很好、超级好。程度由坏到好。请你一步步的推理分析，最终<answer>里的答案必须是在这个范围内，而且只给出一个答案。
 
Answer:
一般 
Response:
嗯，我现在得根据用户提供的市场数据来预测当天的市场情绪。首先，我得仔细分析这些数据，然后一步步推理分析，最后得出结论。

首先，大盘量比是0.92。量比通常用来判断市场的活跃程度。量比指的是成交量相对于前一天的成交量。一般来说，量比高于1表示成交量放大，市场较为活跃；量比接近1或低于1则相对平静。这里量比是0.92，稍微低于1，说明成交量有所缩小，可能市场活跃度不高。

接下来是大盘涨幅0.2%。这个涨幅很小，说明整体市场走势不大，可能处于一个较稳定的区间，或者刚刚有轻微的上涨但随后可能受到限制。

再来看涨停和跌停的情况。当天涨停数量是44，比昨天多了4个，跌停数量是6，比昨天多了3个。涨停和跌停的数量都增加，尤其是涨停数量增长，说明有较多的投资者在积极尝试做多，但同时也出现了较多的跌停，这可能表示市场情绪分化比较严重，部分投资者在高位获利，而另一部分投资者则可能在低位被套，导致市场出现震荡甚至下跌。

赚钱效应是70%，这个数字相对较高，说明有很多投资者在当天交易中赚到了钱，尤其是涨停板上的资金流入较多。同时，亏损率或亏损金额可能较小，因为亏损比例通常和赚钱比例互补，但具体数据没给出，所以需要进一步分析。

炸板率是33.8%。炸板率指的是当天的涨停板股票中有多少比例会在之后的交易时间内跌落，也就是被打开。这个率较高，说明有较多的高位股票被轮番下跌，可能市场情绪比较谨慎，或者有较大的波动。

连板数是8个，其中连板数是指连续涨停的次数，这里是8个。连板数较多可能意味着市场有一定的热点持续性，但这也可能表示市场情绪较为集中，可能在少数几个板块或概念股上有活跃的交易，整体市场可能呈现出两极分化，有利好也有利空，或者市场情绪不稳定。

昨日涨停表

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / sentiment_correctness_reward_func,rewards / length_reward_func
1,-0.0,1.575,1.052378,802.0,0.0,1.5,0.075
2,0.0,0.15,0.0,859.5,0.0,0.0,0.15
3,0.0,-0.1,0.5,957.0,0.000774,-0.25,0.15
4,0.0,0.6125,0.925,677.75,0.000719,0.5,0.1125
5,0.0,1.5375,0.925,557.25,0.000574,1.5,0.0375
6,0.0,1.15,1.154701,671.75,0.000702,1.0,0.15
7,0.0,1.1125,1.113085,615.25,0.000652,1.0,0.1125
8,0.0,2.15,0.0,639.25,0.000763,2.0,0.15
9,0.0,2.15,0.0,644.75,0.00066,2.0,0.15
10,0.0,0.15,0.0,807.5,0.000766,0.0,0.15



第 2 步训练
-------------------- Question:
请你根据一些当天的市场数据，来预测当天的市场情绪。当天市场数据是：大盘量比0.86， 大盘涨幅-0.32， 涨停数量62（3）+6， 跌停数量25+2， 赚钱效应41.50%， 炸板率26.60%， 连板数12(1)， 昨日涨停表现-0.76。
市场情绪分类范围是：超级差、很差、差、一般、好、很好、超级好。程度由坏到好。请你一步步的推理分析，最终<answer>里的答案必须是在这个范围内，而且只给出一个答案。
 
Answer:
一般 
Response:
今天我要根据提供的市场数据来分析当天的市场情绪。首先，我需要了解每个指标的意义以及它们如何反映市场情绪。

1. **大盘量比**：当前为0.86，这低于1。量比小于1通常表示市场在当前价格中出现了一些卖方力量，可能 indicate a bearish bias，所以是一个坏的信号。

2. **大盘涨幅**：为-0.32%，这是一个负数，说明整体市场下跌。跌幅大于0，显示市场整体表现疲软，Bad。

3. **涨停数量**：62个涨停，其中有3个是新高，这显示有相对积极的买盘。6个涨停板也显示有一定的市场活跃度，但需要结合其他因素来看。

4. **跌停数量**：25个跌停，其中2个是新低，显示有相当多的股票在下跌，卖压较大，Bad。

5. **赚钱效应**：41.50%，为正数，但接近40%。这个指标通常表示 Hull moving averageEMA50/200交叉，显示市场的赚钱效应较好，Good。

6. **炸板率**：26.60%，高于一般水平，说明高连板个股的高概率被炸板，显示市场情绪较为分化，Good但不一定好。

7. **连板数**：12个，其中1个是新高，连板数较少，说明市场情绪较为谨慎，Good但不一定好。

8. **昨日涨停表现**：为-0.76%，连续两天涨停的结果显示市场情绪仍然不佳。

综合以上因素，市场量比和单日涨幅是负面信号，但赚钱效应和部分跌停显示部分投资者可能正在获利了结或有新的资金流入。然而，赚钱效应为正值，显示出有一定的好信号。

尽管赚钱效应为正值，但结合量比和单日涨幅来看，整体市场情绪偏向负面，尤其是在连续两天的负面数据下，显示出市场较为悲观。因此，最终市场情绪应归类为“差”。

TrainOutput(global_step=2079, training_loss=0.00036563305926854886, metrics={'train_runtime': 90943.5456, 'train_samples_per_second': 0.023, 'train_steps_per_second': 0.023, 'total_flos': 0.0, 'train_loss': 0.00036563305926854886})

In [6]:
model.save_lora("grpo_saved_lora_sentiment_single_day")

wandb.finish()

0,1
train/completion_length,▂▄▄▂▁▄▅▄▆▄▁█▃▅▄▂▆▅▃▆▃▅▄▄▅▄▅▅▅▅▄▂▆▄▅▆▇▃▅▁
train/epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▇▇▇▇█████
train/global_step,▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
train/grad_norm,▆▇▁▆▇▆▁▆▆▁▆▆▆▆▅█▁▇▁█▆▁▁▅▁▁▇▆▆▁▅▆▇▇▆▆▁▁▇▆
train/kl,▁▁▁▁▁▃▃▃▃▃█▄▇▄▄▆▆▆▇▇▆▇▅▆▅▅▄▄▅▄▄▆▅▅▅▄▆▅▅▄
train/learning_rate,▃▅▅▅▆███████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▄▄▃▃▃▃▂▂▁▁▁▁▁
train/loss,▁▁▂▂▇▅▃▃▃▅▇▆▇█▇▆▅█▇▇▇▆▇▇█▅▆▅▇▆▆▅▅▅▆▅▅▅▅▃
train/reward,▃▄▃▃▃▁▄▄▄▂▃▄▃▄▇▃▃▅▃▃▂▇▆▃▃▃▃▅▃▄▇▃▄▃▃█▄▄▃▅
train/reward_std,█▄▇▂█▁▄▇▁▇▁█▁▇▁▁▁▇▁▁▁▁█▇▁▁▁▇▁█▇▇█▇█▁▇▁▄▇
train/rewards/length_reward_func,█████████████████▁█████████▁███▁███████▁

0,1
total_flos,0.0
train/completion_length,826.0
train/epoch,3.0
train/global_step,2079.0
train/grad_norm,0.09334
train/kl,0.00739
train/learning_rate,0.0
train/loss,0.0003
train/reward,0.9
train/reward_std,0.95743


In [7]:

prompts = dataset[-30:]['prompt']
answers = dataset[-30:]['answer']
text = tokenizer.apply_chat_template(prompts, tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.7,
    top_p = 0.95,
    max_tokens = 4096,
)
base_output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = None,
)
base_responses = [extract_xml_answer(o.outputs[0].text) for o in base_output]
print(base_responses)

def accuracy(responses, answers):
    return sum([1 for r, a in zip(responses, answers) if r == a]) / len(responses)

print("base模型准确率是：", accuracy(base_responses, answers))

Processed prompts:   0%|          | 0/30 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 30/30 [01:07<00:00,  2.25s/it, est. speed input: 70.99 toks/s, output: 321.17 toks/s]

['好', '差', '差', '一般', '差', '情绪较为分化', '差', '好', '好', '差', '差', '差', '好', '04个，增加了', '一般', '好', '一般', '差', '差', '差', '一般', '差', '一般', '差', '好', '差', '差', '一般', '差', '差']
base模型准确率是： 0.06666666666666667





In [8]:

text = tokenizer.apply_chat_template(prompts, tokenize = False, add_generation_prompt = True)

ft_output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora_sentiment_single_day"),
)
fintuning_responses = [extract_xml_answer(o.outputs[0].text) for o in ft_output]
print(fintuning_responses)

def accuracy(responses, answers):
    return sum([1 for r, a in zip(responses, answers) if r == a]) / len(responses)

print("ft模型准确率是：", accuracy(fintuning_responses, answers))


Processed prompts: 100%|██████████| 30/30 [00:54<00:00,  1.82s/it, est. speed input: 87.73 toks/s, output: 399.67 toks/s]

['好', '好', '差', '好', '差', '很好', '一般', '好', '好', '好', '一般', '一般', '好', '差', '好', '一般', '好', '一般', '一般', '很差', '一般', '一般', '一般', '一般', '一般', '很差', '一般', '一般', '一般', '差']
ft模型准确率是： 0.3





In [9]:
eval_path = './dataset/sentiment_eval_single_day.json'
eval_dataset = get_formatted_data(eval_path)

eval_prompts = eval_dataset['prompt']
eval_answers = eval_dataset['answer']
eval_text = tokenizer.apply_chat_template(eval_prompts, tokenize = False, add_generation_prompt = True)

eval_output = model.fast_generate(
    eval_text,
    sampling_params = sampling_params,
    lora_request = None,
)

eval_responses = [extract_xml_answer(o.outputs[0].text) for o in eval_output]
print(eval_responses)

print("base模型准确率是：", accuracy(eval_responses, eval_answers))

eval_text = tokenizer.apply_chat_template(eval_prompts, tokenize = False, add_generation_prompt = True)

eval_output = model.fast_generate(
    eval_text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora_sentiment_single_day"),
)

eval_responses = [extract_xml_answer(o.outputs[0].text) for o in eval_output]
print(eval_responses)

print("ft模型准确率是：", accuracy(eval_responses, eval_answers))

Processed prompts:   0%|          | 0/42 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 42/42 [00:57<00:00,  1.36s/it, est. speed input: 114.96 toks/s, output: 520.66 toks/s]


['一般', '差', '差', '差', '差', '差', '一般', '差', '差', '差', '好', '差', '一般', '好', '好', '差', '差', '一般', '好', '差', '差', '好', '好', '好', '好', '好', '一般', '差', '好', '差', '一般', '差', '好', '差', '差', '一般', '好', '差', '一般', '合。综合分析', '要进一步确认', '微市场情绪微']
base模型准确率是： 0.16666666666666666


Processed prompts: 100%|██████████| 42/42 [00:58<00:00,  1.40s/it, est. speed input: 112.12 toks/s, output: 508.30 toks/s]

['一般', '差', '一般', '差', '一般', '一般', '一般', '一般', '差', '一般', '好', '差', '好', '一般', '好', '一般', '一般', '一般', '好', '一般', '好', '好', '很好', '好', '一般', '一般', '差', '很好', '好', '差', '好', '一般', '一般', '好', '好', '一般', '好', '比、good', '情绪相对活跃', '好', '好', '一般']
ft模型准确率是： 0.35714285714285715



