In [4]:

!pip install transformers torch datasets openpyxl 

Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Installing collected packages: xxhash, datasets
Successfully installed datasets-3.2.0 xxhash-3.5.0
[0m

In [None]:
import os
# Make sure the folder that receives the exported evaluation spreadsheets exists;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs('eval_results', exist_ok=True)

## Load dataset

In [2]:
import datasets
# Load the intent-understanding benchmark by its dataset identifier.
dataset_dict = datasets.load_dataset(
    'klaylouis1932/OpenFinData-Intent-Understanding-Intruct'
)

# Evaluate on the 'test' split as a DataFrame. The `id` column mirrors the frame
# index so predictions can be merged back onto their source rows by key.
test_dataset = dataset_dict['test'].to_pandas().assign(id=lambda frame: frame.index)

  from .autonotebook import tqdm as notebook_tqdm


## Qwen2.5-7B-Instruct

In [3]:
# Instruction text of the first test row, used for a single smoke-test generation.
question = test_dataset['instruction'].iat[0]

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the baseline (not fine-tuned) checkpoint.
model_name = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Single-turn chat: fixed system prompt followed by the sample instruction.
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": question},
]

# Render the chat into one prompt string, then tokenize it as a batch of one
# and move the tensors to the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)

# generate() returns prompt + continuation; drop the prompt tokens so that only
# the newly generated part is decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


A


In [5]:
%%time
from collections import defaultdict
import time

# Column-oriented accumulator: one list per output column ('id', 'pred_answer'),
# converted to a DataFrame in a later cell.
prediction_results = defaultdict(list)

# `n` counts processed rows (1-based) and drives the progress messages.
for n, (_, row) in enumerate(test_dataset.iterrows(), start=1):
    question = row['instruction']  # rewritten question / instruction prompt
    idx = row['id']
    prediction_results['id'].append(idx)

    try:
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": question}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # generate() returns prompt + continuation; keep only the continuation.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as exc:
        # Best effort: record a missing prediction and continue with the next row.
        # Catching Exception (instead of a bare `except:`) lets KeyboardInterrupt
        # and SystemExit stop the run, and the cause is reported instead of hidden.
        print(f'Error occurred for question: {question}')
        print(f'  cause: {exc!r}')
        prediction = None

    prediction_results['pred_answer'].append(prediction)

    if n % 15 == 0:
        print(f'Progress({n}/{len(test_dataset)})...')

Progress(15/75)...
Progress(30/75)...
Progress(45/75)...
Progress(60/75)...
Progress(75/75)...
CPU times: user 6.18 s, sys: 644 μs, total: 6.18 s
Wall time: 6.17 s


In [6]:
import numpy as np
import pandas as pd

# Turn the collected lists into a frame and attach each prediction to its
# reference row by `id` (left join keeps every test row).
pred = pd.DataFrame(prediction_results)
pred_result_df = pd.merge(
    test_dataset[['id', 'instruction', 'output']],
    pred,
    how='left',
    on='id',
)

# 1 when the prediction equals the reference answer exactly, otherwise 0.
pred_result_df['exact_match'] = (
    pred_result_df['pred_answer'].eq(pred_result_df['output']).astype(int)
)


In [7]:
# Fraction of test rows whose prediction matches the reference answer exactly.
accuracy = pred_result_df['exact_match'].mean()
# The label names the model evaluated in this section (the baseline checkpoint);
# the previous text referred to a different model.
print(f'Accuracy with Qwen2.5-7B-Instruct is : {accuracy*100}%')

Accuracy with Finetuned ChatGLM3 is : 85.33333333333334%


In [8]:
# Rows whose prediction is longer than one character, i.e. not a bare option letter.
pred_result_df.loc[pred_result_df['pred_answer'].str.len().gt(1)]

Unnamed: 0,id,instruction,output,pred_answer,exact_match


In [9]:
# Display the per-row predictions next to the reference answers.
pred_result_df

Unnamed: 0,id,instruction,output,pred_answer,exact_match
0,0,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 商业银行可以吗\n\n请从以...,A,A,1
1,1,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 什么板块可以布局\n\n请从...,A,A,1
2,2,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 看好哪些赛道\n\n请从以下...,A,A,1
3,3,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 基建在近期会创新高吗？\n\...,A,A,1
4,4,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 明年医药板块还能投资吗？\n...,A,A,1
...,...,...,...,...,...
70,70,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 密码锁了多久开\n\n请从以...,D,D,1
71,71,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 新股申购什么时候才能正常申购...,D,D,1
72,72,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 如何更改风险设定值\n\n请...,D,D,1
73,73,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 客户基本信息在哪里看\n\n...,D,D,1


In [9]:

# Short tag for this run; it becomes part of the exported file name.
# Note that this rebinds `model_name`, which previously held the checkpoint id.
model_name = 'qwen25-7b'
pred_result_df.to_excel(
    f'eval_results/intent_understanding_{model_name}.xlsx',
    index=False,
)

## Finetuned Qwen2.5-7B-Instruct

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Fine-tuned checkpoint, loaded from a local path.
# Alternative from HF: model_name = "klaylouis1932/qwen25-7b-lora-ft-fin"
model_name = "models/qwen25_7b_lora_sft"

# Rebinds `model` and `tokenizer`, so the cells below evaluate this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")


Loading checkpoint shards: 100%|██████████| 9/9 [00:35<00:00,  3.99s/it]


In [5]:
# Instruction text of the first test row (the evaluation loop reassigns `question`).
question = test_dataset['instruction'].iat[0]

In [6]:
%%time
from collections import defaultdict
import time

# Column-oriented accumulator: one list per output column ('id', 'pred_answer'),
# converted to a DataFrame in a later cell.
prediction_results = defaultdict(list)

# `n` counts processed rows (1-based) and drives the progress messages.
for n, (_, row) in enumerate(test_dataset.iterrows(), start=1):
    question = row['instruction']  # rewritten question / instruction prompt
    idx = row['id']
    prediction_results['id'].append(idx)

    try:
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": question}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # generate() returns prompt + continuation; keep only the continuation.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as exc:
        # Best effort: record a missing prediction and continue with the next row.
        # Catching Exception (instead of a bare `except:`) lets KeyboardInterrupt
        # and SystemExit stop the run, and the cause is reported instead of hidden.
        print(f'Error occurred for question: {question}')
        print(f'  cause: {exc!r}')
        prediction = None

    prediction_results['pred_answer'].append(prediction)

    if n % 15 == 0:
        print(f'Progress({n}/{len(test_dataset)})...')

Progress(15/75)...
Progress(30/75)...
Progress(45/75)...
Progress(60/75)...
Progress(75/75)...
CPU times: user 6.52 s, sys: 215 ms, total: 6.73 s
Wall time: 6.73 s


In [7]:
import numpy as np
import pandas as pd

# Turn the collected lists into a frame and attach each prediction to its
# reference row by `id` (left join keeps every test row).
pred = pd.DataFrame(prediction_results)
pred_result_df = pd.merge(
    test_dataset[['id', 'instruction', 'output']],
    pred,
    how='left',
    on='id',
)

# 1 when the prediction equals the reference answer exactly, otherwise 0.
pred_result_df['exact_match'] = (
    pred_result_df['pred_answer'].eq(pred_result_df['output']).astype(int)
)


In [8]:
# Fraction of test rows whose prediction matches the reference answer exactly.
accuracy = pred_result_df['exact_match'].mean()
# The label names the model evaluated in this section (the fine-tuned Qwen
# checkpoint); the previous text referred to a different model.
print(f'Accuracy with Finetuned Qwen2.5-7B-Instruct is : {accuracy*100}%')

Accuracy with Finetuned ChatGLM3 is : 88.0%


In [9]:
# Rows whose prediction is longer than one character, i.e. not a bare option letter.
pred_result_df.loc[pred_result_df['pred_answer'].str.len().gt(1)]

Unnamed: 0,id,instruction,output,pred_answer,exact_match


In [10]:
# Display the per-row predictions next to the reference answers.
pred_result_df

Unnamed: 0,id,instruction,output,pred_answer,exact_match
0,0,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 商业银行可以吗\n\n请从以...,A,A,1
1,1,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 什么板块可以布局\n\n请从...,A,A,1
2,2,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 看好哪些赛道\n\n请从以下...,A,A,1
3,3,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 基建在近期会创新高吗？\n\...,A,C,0
4,4,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 明年医药板块还能投资吗？\n...,A,A,1
...,...,...,...,...,...
70,70,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 密码锁了多久开\n\n请从以...,D,D,1
71,71,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 新股申购什么时候才能正常申购...,D,D,1
72,72,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 如何更改风险设定值\n\n请...,D,D,1
73,73,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 客户基本信息在哪里看\n\n...,D,D,1


In [11]:

# Short tag for this run; it becomes part of the exported file name.
# Note that this rebinds `model_name`, which previously held the checkpoint path.
model_name = 'qwen25-7b_finetuned'
pred_result_df.to_excel(
    f'eval_results/intent_understanding_{model_name}.xlsx',
    index=False,
)