In [28]:
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [29]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [30]:
!pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

## Load dataset

In [31]:
import datasets

# Load the dataset by name; the returned mapping is indexed by split below.
dataset_dict = datasets.load_dataset('klaylouis1932/OpenFinData-Intent-Understanding-Intruct')

# Convert the 'test' split to pandas and expose the row index as an explicit
# 'id' column, which later cells use as the merge key for predictions.
test_dataset = dataset_dict['test'].to_pandas().assign(id=lambda frame: frame.index)

## Qwen2.5-0.5B-Instruct

In [32]:
# First test-set instruction, used as the prompt for the single-example run below.
question = test_dataset['instruction'].iat[0]

In [33]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load tokenizer and causal-LM weights for the same checkpoint; dtype and
# device placement are left to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Two-turn chat: fixed system prompt followed by the question under test.
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=question),
]

# Render the chat as a prompt string (tokenize=False) with the generation
# prompt appended, then tokenize it as a batch of one on the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)
# Drop the leading prompt tokens from each returned sequence so that only the
# newly generated continuation is decoded.
generated_ids = [
    sequence[prompt_ids.shape[0]:]
    for prompt_ids, sequence in zip(model_inputs["input_ids"], generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

A


In [37]:
%%time
from collections import defaultdict
import time

prediction_results = defaultdict(list)

n = 0
for i, row in test_dataset.iterrows():
    # question = row['question'] # original question
    question = row['instruction'] # rewrite question/instruction
    answer = row['output']
    idx = row['id']
    prediction_results['id'].append(idx)
    
    try:
        # prediction = llm_claude35.invoke_model(question)
        # prompt = "Give me a short introduction to large language model."
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": question}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except:
        print(f'Error occurred for question: {question}')
        prediction = None
    
    prediction_results['pred_answer'].append(prediction)
    # time.sleep(10)
    
    n += 1
    if n%15 == 0:
        print(f'Progress({n}/{len(test_dataset)})...')
    # if n>=2:
    #     break

Progress(15/75)...
Progress(30/75)...
Progress(45/75)...
Progress(60/75)...
Progress(75/75)...
CPU times: user 6.37 s, sys: 5.54 ms, total: 6.37 s
Wall time: 6.37 s


In [38]:
import numpy as np
import pandas as pd

# Predictions collected by the loop, one row per test example.
pred = pd.DataFrame(prediction_results)

# Left-join predictions onto the reference rows by 'id', then flag rows whose
# predicted string equals the reference output exactly (1) or not (0).
pred_result_df = (
    test_dataset[['id', 'instruction', 'output']]
    .merge(pred, on='id', how='left')
    .assign(exact_match=lambda merged: np.where(merged['pred_answer'] == merged['output'], 1, 0))
)


In [39]:
# Fraction of rows whose prediction matches the reference output exactly.
accuracy = pred_result_df['exact_match'].mean()
# The label names the checkpoint evaluated in this section (Qwen/Qwen2.5-0.5B-Instruct).
print(f'Accuracy with Qwen2.5-0.5B-Instruct is : {accuracy*100}%')

Accuracy with Finetuned ChatGLM3 is : 40.0%


In [40]:
# Sanity check: list any prediction longer than a single character.
pred_result_df.loc[pred_result_df['pred_answer'].str.len().gt(1)]

Unnamed: 0,id,instruction,output,pred_answer,exact_match


In [41]:
# Display the merged results frame as the cell output (the rendering elides the middle rows).
pred_result_df

Unnamed: 0,id,instruction,output,pred_answer,exact_match
0,0,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 商业银行可以吗\n\n请从以...,A,E,0
1,1,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 什么板块可以布局\n\n请从...,A,E,0
2,2,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 看好哪些赛道\n\n请从以下...,A,A,1
3,3,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 基建在近期会创新高吗？\n\...,A,C,0
4,4,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 明年医药板块还能投资吗？\n...,A,A,1
...,...,...,...,...,...
70,70,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 密码锁了多久开\n\n请从以...,D,D,1
71,71,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 新股申购什么时候才能正常申购...,D,D,1
72,72,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 如何更改风险设定值\n\n请...,D,D,1
73,73,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 客户基本信息在哪里看\n\n...,D,D,1


In [12]:
import os

# Create the output directory for the Excel exports. exist_ok=True makes the
# cell safe to re-run (a plain `mkdir` errors once the directory exists), and
# no shell subprocess is spawned.
os.makedirs('eval_results', exist_ok=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:

# Short run tag used in the output filename. Note that this rebinds
# `model_name`, which previously held the Hub checkpoint id.
model_name = 'qwen25-0.5b'
pred_result_df.to_excel(
    f'eval_results/intent_understanding_{model_name}.xlsx',
    index=False,
)

## Pretrained Qwen2.5-0.5B-Instruct (`klaylouis1932/Finance-Research-QWen2.5-0.5B-Instruct`)

In [42]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "klaylouis1932/Finance-Research-QWen2.5-0.5B-Instruct"

# Rebind `tokenizer` and `model` to the second checkpoint; the evaluation loop
# below reads these two names. dtype and device placement are left on "auto".
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")


In [43]:
# First test-set instruction; the evaluation loop below rebinds `question` per row.
question = test_dataset['instruction'].iat[0]

In [78]:
%%time
from collections import defaultdict
import time

prediction_results = defaultdict(list)

n = 0
for i, row in test_dataset.iterrows():
    # question = row['question'] # original question
    question = row['instruction'] # rewrite question/instruction
    answer = row['output']
    idx = row['id']
    prediction_results['id'].append(idx)
    
    try:
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": question}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except:
        print(f'Error occurred for question: {question}')
        prediction = None
    
    prediction_results['pred_answer'].append(prediction)
    # time.sleep(10)
    
    n += 1
    if n%15 == 0:
        print(f'Progress({n}/{len(test_dataset)})...')
    # if n>=2:
    #     break

Progress(15/75)...
Progress(30/75)...
Progress(45/75)...
Progress(60/75)...
Progress(75/75)...
CPU times: user 4.41 s, sys: 3.82 ms, total: 4.42 s
Wall time: 4.41 s


In [79]:
import numpy as np
import pandas as pd

# Predictions collected by the loop, one row per test example.
pred = pd.DataFrame(prediction_results)

# Left-join predictions onto the reference rows by 'id', then flag rows whose
# predicted string equals the reference output exactly (1) or not (0).
pred_result_df = (
    test_dataset[['id', 'instruction', 'output']]
    .merge(pred, on='id', how='left')
    .assign(exact_match=lambda merged: np.where(merged['pred_answer'] == merged['output'], 1, 0))
)


In [80]:
# Exact-match hits divided by the number of evaluated rows.
accuracy = pred_result_df['exact_match'].sum() / pred_result_df.shape[0]
print(f'Accuracy with pretrained Qwen2.5-0.5B is : {accuracy*100}%')

Accuracy with pretrained Qwen2.5-0.5B is : 20.0%


In [81]:
# Sanity check: list any prediction longer than a single character.
pred_result_df.loc[pred_result_df['pred_answer'].str.len().gt(1)]

Unnamed: 0,id,instruction,output,pred_answer,exact_match


In [82]:
# Display the merged results frame as the cell output (the rendering elides the middle rows).
pred_result_df

Unnamed: 0,id,instruction,output,pred_answer,exact_match
0,0,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 商业银行可以吗\n\n请从以...,A,A,1
1,1,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 什么板块可以布局\n\n请从...,A,A,1
2,2,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 看好哪些赛道\n\n请从以下...,A,A,1
3,3,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 基建在近期会创新高吗？\n\...,A,A,1
4,4,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 明年医药板块还能投资吗？\n...,A,A,1
...,...,...,...,...,...
70,70,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 密码锁了多久开\n\n请从以...,D,E,0
71,71,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 新股申购什么时候才能正常申购...,D,A,0
72,72,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 如何更改风险设定值\n\n请...,D,A,0
73,73,作为一个意图情绪助手，请分析以下问句的意图类型。\n\n问句: 客户基本信息在哪里看\n\n...,D,A,0


In [83]:

# Short run tag used in the output filename. Note that this rebinds
# `model_name`, which previously held the Hub checkpoint id.
model_name = 'qwen25-0.5b_pretrained'
pred_result_df.to_excel(
    f'eval_results/intent_understanding_{model_name}.xlsx',
    index=False,
)

## Other: free-form question answered by both checkpoints

In [26]:
# Hard-coded question used as the user turn by the two generation cells in this section.
question = "中船防务一季度报告发布时间"

In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load tokenizer and causal-LM weights for the same checkpoint; dtype and
# device placement are left to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Two-turn chat: fixed system prompt followed by the current `question`.
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=question),
]

# Render the chat as a prompt string (tokenize=False) with the generation
# prompt appended, then tokenize it as a batch of one on the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)
# Drop the leading prompt tokens from each returned sequence so that only the
# newly generated continuation is decoded.
generated_ids = [
    sequence[prompt_ids.shape[0]:]
    for prompt_ids, sequence in zip(model_inputs["input_ids"], generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

中船防务是中国船舶工业集团有限公司的全资子公司，是全国最大的专业防务装备生产制造企业。公司的业务范围涵盖了从武器装备、电子对抗设备到舰艇保障服务等多个领域，为国家军事和民用客户提供全方位的专业化解决方案。

在武器装备方面，中船防务主要负责研制和发展各种类型的海军舰艇、潜艇、导弹、无人机等先进武器装备，满足了国防建设的需求。例如，中船防务研发的“东风-21D”洲际弹道导弹，已经完成了多次实战测试，并于2023年首次对外公开亮相。

在电子对抗设备上，中船防务致力于开发新型电子战设备，提升其对敌方电子信息系统进行有效攻击的能力。此外，公司还提供包括通信系统维护、网络空间防御在内的多种电子对抗产品和服务。

在舰艇保障服务方面，中船防务则专注于为各类海军建设提供高质量的舰艇维修、保养和技术支持服务。公司拥有先进的维修技术和强大的团队实力，能够确保军舰处于良好的工作状态。

总的来说，中船防务凭借其雄厚的实力和丰富的经验，在多个领域都取得了显著的成绩，成为了中国海军现代化建设的重要力量之一。


In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "klaylouis1932/Finance-Research-QWen2.5-0.5B-Instruct"

# Load tokenizer and causal-LM weights for the same checkpoint; dtype and
# device placement are left to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Two-turn chat: fixed system prompt followed by the current `question`.
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=question),
]

# Render the chat as a prompt string (tokenize=False) with the generation
# prompt appended, then tokenize it as a batch of one on the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# max_new_tokens=512 caps the length of the generated continuation.
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
# Drop the leading prompt tokens from each returned sequence so that only the
# newly generated continuation is decoded.
generated_ids = [
    sequence[prompt_ids.shape[0]:]
    for prompt_ids, sequence in zip(model_inputs["input_ids"], generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

根据Wind 数据,2023 年4 月16 日,中船防务发布2023 年第一季度报告。报告期内公司实现营业收入987,508,585.77 元,同比增长31.25%;归属于上市公司股东的净利润148,073,554.42 元,同比下降3.65%;归属于上市公司股东的扣除非经常性损益的净利润138,737,875.46 元,同比下降10.45%。业绩下滑主要是受公司经营环境、市场波动等影响所致。 资料来源:wind,华金证券研究所 【浙商银行】关于为全资子公司提供担保暨关联交易的公告(2023-001) 浙商银行股份有限公司于2023年4月16日召开第四届董事会第十六次会议和第四届监事会第十三次会议审议通过了《关于为全资子公司提供担保暨关联交易的议案》,同意公司为全资子公司浙江浙商银行台州分行提供不超过人民币30亿元的连带责任保证担保,并授权经营管理层在上述额度范围内办理具体担保事宜。具体内容详见公司披露的《关于为全资子公司提供担保暨关联交易的公告》(临2023-001)。截至本公告披露日,公司及控股子公司已无逾期担保事项。 资料来源:公司公告,华金证券研究所 【浙商银行】关于为全资子公司提供担保暨关联交易的进展公告(2023-002) 浙商银行股份有限公司于2023年4月16日召开第四届董事会第十六次会议和第四届监事会第十三次会议审议通过了《关于为全资子公司提供担保暨关联交易的议案》,同意公司为全资子公司浙江浙商银行台州分行提供不超过人民币30亿元的连带责任保证担保,并授权经营管理层在上述额度范围内办理具体担保事宜。具体内容详见公司披露的《关于为全资子公司提供担保暨关联交易的进展公告》(临2023-002)。截至本公告披露日,公司及控股子公司已无逾期担保事项。 资料来源:公司公告,华金证券研究所 【浙商银行】关于控股股东质押股份解除质押的公告(2023-003) 浙商银行股份有限公司于20
