导入情绪分析任务所用数据

In [None]:
def custom_Dataset_of_financial_news_sentiment_classification(from_dir, to_dir):
    """Merge the train/test CSVs of the financial-news sentiment dataset into one JSONL file.

    Reads ``train_data.csv`` and ``test_data.csv`` from ``from_dir``, exposes the
    ``正文`` column as ``text`` and the ``正负面`` column as ``label``, tags each row
    with the ``split`` it came from, drops rows whose text is missing or empty,
    and writes one JSON object per line to
    ``to_dir/Dataset-of-financial-news-sentiment-classification.jsonl``.

    Source: https://github.com/wwwxmu/Dataset-of-financial-news-sentiment-classification
    """
    import os
    import pandas as pd

    def tag_split(raw, split_name):
        # Keep only the three columns the downstream task consumes.
        tagged = raw.assign(split=split_name, text=raw['正文'], label=raw['正负面'])
        return tagged[['split', 'text', 'label']]

    train_raw = pd.read_csv(f"{from_dir}/train_data.csv")
    test_raw = pd.read_csv(f"{from_dir}/test_data.csv")

    # Stack both splits under a fresh 0..n-1 index.
    merged = pd.concat(
        [tag_split(train_raw, 'train'), tag_split(test_raw, 'test')],
        ignore_index=True,
    )

    # Cleaning: discard rows without usable text.
    unusable = merged['text'].isna() | (merged['text'] == '')
    cleaned = merged[~unusable]

    # Persist the cleaned data.
    os.makedirs(to_dir, exist_ok=True)
    out_path = os.path.join(to_dir, 'Dataset-of-financial-news-sentiment-classification.jsonl')
    cleaned.to_json(out_path, orient='records', force_ascii=False, lines=True)


通过qlib获取股票市场数据

In [None]:
def custom_qlib_stock_dataset(from_dir, to_dir):
    """Export daily CSI 300 price/volume features from a local Qlib store to CSV.

    Initialises Qlib with ``from_dir`` as the data provider, queries the
    open/high/low/close/volume/amount/vwap fields for the ``csi300`` market
    between 2016-01-01 and 2020-12-31 at daily frequency, strips the leading
    ``$`` from the column names and writes the result to
    ``to_dir/csi300_stock_feats.csv``.

    Data source: https://github.com/chenditc/investment_data
    """
    import os
    import qlib
    from qlib.data import D

    # Point Qlib at the on-disk data store.
    qlib.init(provider_uri=from_dir)
    fields = ['$open', '$high', '$low', '$close', '$volume', '$amount', '$vwap']
    df = D.features(
        D.instruments(market='csi300'),
        fields,
        start_time='20160101',
        end_time='20201231',
        freq='day',
    )
    df = df.rename(columns=lambda name: name.replace('$', ''))

    os.makedirs(to_dir, exist_ok=True)
    # Qlib documents D.features as returning rows indexed by (instrument,
    # datetime). Writing with index=False alone would discard which stock and
    # which day each row belongs to, so the index is turned into ordinary
    # columns before export.
    df.reset_index().to_csv(os.path.join(to_dir, "csi300_stock_feats.csv"), index=False)

导入新闻摘要任务相关数据

In [None]:
def custom_FinCUGE(from_dir, to_dir):
    """Flatten every split of the FinCUGE dataset into a single JSONL file.

    Loads the dataset found at ``from_dir``, adds a ``split`` column holding the
    name of the split each example came from, concatenates all splits and
    writes them to ``to_dir/FinCUGE.jsonl`` (one JSON object per line).

    Source: https://huggingface.co/datasets/Maciel/FinCUGE-Instruction
    """
    import os
    from datasets import concatenate_datasets, load_dataset

    dataset = load_dataset(from_dir)
    for split_name in dataset:
        # Record the originating split on every example of this split.
        tagged_split = dataset[split_name].map(lambda example: {"split": split_name})
        dataset[split_name] = tagged_split

    all_splits = [dataset[split_name] for split_name in dataset]
    merged = concatenate_datasets(all_splits)

    os.makedirs(to_dir, exist_ok=True)
    out_path = os.path.join(to_dir, "FinCUGE.jsonl")
    merged.to_pandas().to_json(out_path, orient='records', force_ascii=False, lines=True)

def extract_finna_in_FinCUGE_as_sharegpt(from_dir, to_dir):
    """Export the FINNA training examples of FinCUGE as chat-style message lists.

    Reads ``from_dir/FinCUGE.jsonl``, keeps the rows whose ``split`` is
    ``train`` and whose ``task`` is ``FINNA``, and turns each of them into a
    three-turn conversation: ``instruction`` as the system message, ``input``
    as the user message and ``output`` as the assistant message. The result is
    written to ``to_dir/FinCUGE_FINNA_train.jsonl`` with a single ``messages``
    field per line. ``to_dir`` is created if it does not exist.
    """
    import os
    import pandas as pd

    df = pd.read_json(f"{from_dir}/FinCUGE.jsonl", lines=True)

    # Filter first so conversations are only built for rows that are exported.
    finna_train = df[(df['split'] == 'train') & (df['task'] == 'FINNA')]
    conversations = [
        [
            dict(role='system', content=instruction),
            dict(role='user', content=user_input),
            dict(role='assistant', content=output),
        ]
        for instruction, user_input, output in zip(
            finna_train['instruction'], finna_train['input'], finna_train['output']
        )
    ]

    # The sibling exporters create their output directory; do the same here so
    # the function does not fail on a fresh checkout.
    os.makedirs(to_dir, exist_ok=True)
    out_path = os.path.join(to_dir, "FinCUGE_FINNA_train.jsonl")
    pd.DataFrame({'messages': conversations}).to_json(
        out_path, orient='records', force_ascii=False, lines=True
    )

导入文档主题分割评测数据

In [None]:
def custom_CPTS(from_dir, to_dir):
    """Convert the CPTS topic-segmentation data into a single JSONL file.

    Reads ``train.json`` and ``test.json`` from ``from_dir``, tags every
    document with its ``split``, extracts the ``text`` of each entry in
    ``paragraph_list`` into ``sentences``, converts the 0/1 boundary flags in
    ``label_list`` into segment lengths (``masses``) and writes the selected
    columns to ``to_dir/CPTS.jsonl``.

    Source: https://github.com/fjiangAI/CPTS
    """
    import os
    import pandas as pd

    def boundary_to_mass(boundary_list):
        # Walk the flags once; every flag equal to 1 closes a segment whose
        # length is the distance to the previous boundary.
        # NOTE(review): items after the last 1 are not counted — presumably the
        # final item is always flagged as a boundary; confirm against the data.
        masses = []
        previous = -1
        for position, flag in enumerate(boundary_list):
            if flag == 1:
                masses.append(position - previous)
                previous = position
        return masses

    def paragraph_texts(row):
        return [paragraph['text'] for paragraph in row['paragraph_list']]

    train_data = pd.read_json(f"{from_dir}/train.json")
    test_data = pd.read_json(f"{from_dir}/test.json")

    # Merge both splits under a fresh index.
    labelled_parts = [train_data.assign(split='train'), test_data.assign(split='test')]
    df = pd.concat(labelled_parts, ignore_index=True)

    df['sentences'] = df.apply(paragraph_texts, axis=1)
    df['masses'] = df['label_list'].apply(boundary_to_mass)

    # Save.
    os.makedirs(to_dir, exist_ok=True)
    exported_columns = ['id', 'title', 'topic_list', 'sentences', 'masses', 'split', 'label_list']
    out_path = os.path.join(to_dir, 'CPTS.jsonl')
    df[exported_columns].to_json(out_path, orient='records', force_ascii=False, lines=True)



导入RAG评测数据

In [None]:
def custom_FinanceRAG(from_dir, to_dir, llm_config, spliter_config):
    """Translate a FinanceRAG subset to Chinese, then chunk and summarise its corpus.

    Steps:
      1. Read ``queries.jsonl`` and ``corpus.jsonl`` from ``from_dir``.
      2. Translate the ``text`` column of both frames with the chat model and
         store the result in ``text_zh``; write ``queries.json`` and
         ``corpus.json`` to ``to_dir``.
      3. Split every translated document into chunks with the configured
         splitter, summarise each chunk with the chat model (chunks shorter
         than 50 characters are used verbatim) and write ``chunk_sum.json``.

    Args:
        from_dir: Directory containing ``queries.jsonl`` and ``corpus.jsonl``.
        to_dir: Output directory; created if it does not exist.
        llm_config: Mapping with ``base_url``, ``api_key`` and ``model`` for an
            OpenAI-compatible endpoint.
        spliter_config: Keyword arguments forwarded to ``init_spliter``.

    Source: https://huggingface.co/datasets/Linq-AI-Research/FinanceRAG
    """
    import pandas as pd
    import os
    from openai import OpenAI
    from functools import partial
    import sys; sys.path.append("..")
    from utils.spliters import init_spliter
    from minirag.utils import compute_mdhash_id

    client = OpenAI(base_url=llm_config['base_url'], api_key=llm_config["api_key"])
    # Shared sampling settings for every request.
    # NOTE(review): max_tokens=1000 caps each reply; long documents may come
    # back truncated — confirm this is acceptable for the corpus.
    gen_resp = partial(
        client.chat.completions.create,
        model=llm_config['model'],
        temperature=0.1,
        top_p=1,
        max_tokens=1000,
    )

    def translate_text(text):
        """Translate ``text`` into Chinese and return the model's reply."""
        response = gen_resp(messages=[dict(role="system",content="将文字翻译成中文，直接输出翻译结果"),dict(role="user",content=text)])
        return response.choices[0].message.content

    def summarize_text(text):
        """Return the model's summary of ``text``."""
        response = gen_resp(messages=[dict(role="system",content="为以下内容生成摘要，直接输出结果"),dict(role="user",content=text)])
        return response.choices[0].message.content

    query_df = pd.read_json(os.path.join(from_dir, "queries.jsonl"), lines=True)
    corpus_df = pd.read_json(os.path.join(from_dir, "corpus.jsonl"), lines=True)

    # Create the output directory before the (slow) LLM calls so a missing
    # directory cannot discard finished work.
    os.makedirs(to_dir, exist_ok=True)

    # Each translated frame is saved as soon as it is complete.
    query_df["text_zh"] = query_df["text"].apply(translate_text)
    query_df.to_json(os.path.join(to_dir, "queries.json"), orient="records", indent=4, force_ascii=False)
    corpus_df["text_zh"] = corpus_df["text"].apply(translate_text)
    corpus_df.to_json(os.path.join(to_dir, "corpus.json"), orient="records", indent=4, force_ascii=False)

    spliter = init_spliter(**spliter_config)
    chunk_sum_list = []
    for _, row in corpus_df.iterrows():
        sentence_df = spliter.split_text_to_sentences(row['text_zh'])
        sentence_df = spliter.add_buffered_sentences(sentence_df)
        chunk_df = spliter.cluster(sentence_df)
        for _, crow in chunk_df.iterrows():
            chunk_text = crow['chunk']
            # Very short chunks are already summary-sized; skip the LLM call.
            summary = chunk_text if len(chunk_text) < 50 else summarize_text(chunk_text)
            chunk_sum_list.append({
                "doc_id": row["_id"],
                # The id is derived from the summary text, not the chunk text.
                "chunk_id": compute_mdhash_id(summary.strip(), prefix="chunk-"),
                "doc_text": row['text_zh'],
                "chunk_text": chunk_text,
                "chunk_sum_text": summary,
            })
    chunk_sum_df = pd.DataFrame(chunk_sum_list)
    # orient='records' never serialises the index. Passing index=False adds
    # nothing and raises ValueError on pandas releases older than 2.1.
    chunk_sum_df.to_json(os.path.join(to_dir, "chunk_sum.json"), orient='records', force_ascii=False, indent=4)


# Input/output locations for the FinQA subset of FinanceRAG.
from_dir = "resources/data/_raw/FinanceRAG_finqa_bench"
to_dir = "resources/data/finqa"

# OpenAI-compatible endpoint used for translation and summarisation.
llm_config = {
    "model": "judger",
    "base_url": "http://localhost:12235/v1",
    "api_key": "empty",
}

# Splitter settings: method name plus the path of the segmentation model.
spliter_config = {
    "method": "doc_seq_model_spliter",
    "model_path": "resources/open_models/nlp_bert_document-segmentation_chinese-base",
}


为demo准备数据

In [None]:
import pandas as pd
import os

def extract_certain_day_related_news(market_data_path, news_dir, day):
    """Return the news rows dated ``day`` for stocks that have market data on ``day``.

    Args:
        market_data_path: Excel workbook with at least ``datetime`` and
            ``instrument`` columns.
        news_dir: Directory of ``*.json`` news files. The part of each file
            name before the first ``.`` is used as that file's ``instrument``.
        day: Target day, in any form ``pd.to_datetime`` accepts.

    Returns:
        The rows of the concatenated news frame whose ``date`` equals ``day``
        and whose instrument code appears in the market data for that day.

    Raises:
        ValueError: If ``news_dir`` holds no ``.json`` file, or a news file
            name does not start with a numeric stock code.
    """
    # Parse the target day once; it is compared against both frames.
    target_date = pd.to_datetime(day).date()

    df_market = pd.read_excel(market_data_path)  # read the Excel workbook
    df_market['date'] = pd.to_datetime(df_market['datetime']).dt.date  # normalise to plain dates

    # Load every news file, tagging its rows with the code taken from the file name.
    df_news = pd.concat([
        pd.read_json(os.path.join(news_dir, filename))
        .assign(instrument=filename.split('.')[0])
        for filename in os.listdir(news_dir) if filename.endswith(".json")
    ], ignore_index=True)

    # Normalise the news dates the same way as the market dates.
    df_news['date'] = pd.to_datetime(df_news['date']).dt.date

    # Stocks that have market data on the target day. Codes are compared
    # numerically so a workbook that stores `instrument` as text ("000001")
    # matches the same way as one that stores it as a number (1). Market codes
    # that are not numeric become NaN and simply never match.
    codes_on_day = pd.to_numeric(
        df_market.loc[df_market['date'] == target_date, 'instrument'],
        errors='coerce',
    ).unique()

    # News codes come from file names; a non-numeric name is an error.
    news_codes = pd.to_numeric(df_news['instrument'])

    # Keep news dated `day` whose stock is among those traded that day.
    df_filtered_news = df_news[(df_news['date'] == target_date) & news_codes.isin(codes_on_day)]

    return df_filtered_news


# Demo inputs: the Q4-2019 prediction workbook and the per-stock news files.
market_data_path = "demo/2019Q4股票预测.xlsx"
news_dir = "resources/data/CSI300news_chunked_summarized_senti"
day = "2019-10-08"
df_filtered_news = extract_certain_day_related_news(market_data_path, news_dir, day)
# orient="records" never serialises the index. Passing index=False adds
# nothing and raises ValueError on pandas releases older than 2.1, so it is
# left out.
df_filtered_news.to_json(f"demo/{day}_相关新闻片段.json", orient="records", indent=4, force_ascii=False)

In [2]:
import pandas as pd
import requests
from tqdm import tqdm
def add_stock_name(df, stock_code_col, timeout=10):
    """Add a ``name`` column holding the stock name for each code in ``df``.

    Every distinct value of ``df[stock_code_col]`` is looked up once against
    the Eastmoney quote endpoint and the result is mapped back onto all rows.
    ``df`` is modified in place and also returned.

    Args:
        df: Frame to annotate.
        stock_code_col: Name of the column holding stock codes as strings.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``df`` with the extra ``name`` column. A failed lookup yields a string
        starting with ``查询失败``; a code the endpoint has no name for yields
        ``股票代码不存在``.
    """
    url = "https://push2.eastmoney.com/api/qt/stock/get"
    headers = {"User-Agent": "Mozilla/5.0"}

    def get_stock_name(session, stock_code):
        params = {
            # Codes starting with 6 are Shanghai-listed, the rest Shenzhen.
            "secid": f"{'1.' if stock_code.startswith('6') else '0.'}{stock_code}",
            "fields": "f58"  # request only the stock name
        }
        try:
            # Without a timeout a stalled connection would hang the whole run.
            response = session.get(url, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()
            # The payload may carry `"data": null`; treat that like a missing
            # entry instead of failing on None.get(...).
            data = response.json().get("data") or {}
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            return f"查询失败: {e}"
        name = data.get("f58")
        if name is None:
            name = "股票代码不存在"
        return name.replace(" ", "")

    # One session for all lookups so connections to the host are reused.
    with requests.Session() as session:
        stock_name_mapper = {
            instrument: get_stock_name(session, instrument)
            for instrument in tqdm(df[stock_code_col].unique())
        }
    df["name"] = df[stock_code_col].map(stock_name_mapper)
    return df

# Annotate the demo workbook with stock names and write it back in place.
market_data_path = "demo/2019Q4股票预测.xlsx"
df_market_data = add_stock_name(
    # Read codes as text so leading zeros survive.
    pd.read_excel(market_data_path, index_col=0, dtype={"instrument": str}),
    "instrument",
)
df_market_data.to_excel(market_data_path)

100%|██████████| 271/271 [03:34<00:00,  1.26it/s]


In [10]:
import os
import logging
import pandas as pd
from functools import partial
from minirag.prompt import PROMPTS
from minirag import MiniRAG
from minirag.utils import EmbeddingFunc, compute_mdhash_id
from minirag.llm import openai_complete_if_cache, hf_embedding

import sys; sys.path.append("../..")
from utils.rag import prompts

from transformers import AutoTokenizer, AutoModel
import nest_asyncio; nest_asyncio.apply() # 在notebook中使用async所需

def build_index(embed_model_path,rag_root,rag_llm_args, series):
    """Insert every text in ``series`` into a MiniRAG index under ``rag_root``.

    Args:
        embed_model_path: Path of the Hugging Face embedding model; its
            tokenizer is loaded with ``model_max_length=512``.
        rag_root: Directory under which the ``rag_data`` working directory is
            created.
        rag_llm_args: Keyword arguments forwarded to
            ``openai_complete_if_cache`` on every LLM call; must contain
            ``model``.
        series: Object exposing ``.items()``; each value is inserted in turn.
    """
    # Merge the project's prompt overrides into MiniRAG's prompt table.
    PROMPTS.update(prompts)

    # Configure the log level.
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

    embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_path, model_max_length=512)
    embed_model = AutoModel.from_pretrained(embed_model_path)

    working_dir = os.path.join(rag_root, "rag_data")
    os.makedirs(working_dir, exist_ok=True)

    # Embedding dimension and token limit are read from the model's config.
    embedding_func = EmbeddingFunc(
        embedding_dim=embed_model.config.hidden_size,
        max_token_size=embed_model.config.max_position_embeddings,
        func=partial(hf_embedding, embed_model=embed_model, tokenizer=embed_tokenizer),
    )

    rag = MiniRAG(
        working_dir=working_dir,
        llm_model_func=lambda prompt, **kwargs: openai_complete_if_cache(prompt=prompt, **rag_llm_args, **kwargs),
        llm_model_max_token_size=1000,
        llm_model_name=rag_llm_args["model"],
        embedding_func=embedding_func,
    )

    for _, text in series.items():
        rag.insert(text)

rag_llm_args = dict(model="base", base_url="http://localhost:12239/v1",api_key="empty")
embed_model_path = "resources/open_models/bge-large-zh-v1.5"
rag_root = "demo/"
news_chunk_path = "demo/2019-10-08_相关新闻片段.json"
df = pd.read_json(news_chunk_path)
# Text inserted into the index for each chunk: title plus summary.
series = df.apply(lambda row: f"标题：{row['title']}，摘要：{row['summary']}",axis=1)
# Index building is disabled here; uncomment to (re)build it.
# build_index(embed_model_path,rag_root,rag_llm_args,series)
# Give every chunk an id derived from the text built above.
df['chunk_id'] = series.apply(lambda x : compute_mdhash_id(x.strip(), prefix="chunk-"))
# orient="records" never serialises the index. Passing index=False adds
# nothing and raises ValueError on pandas releases older than 2.1, so it is
# left out. The file is overwritten in place with the extra chunk_id column.
df.to_json(news_chunk_path, force_ascii=False, indent=4, orient="records")

RAG评测数据——消融实验

In [None]:
def translate_FinanceBench(from_dir, to_dir, llm_config):
    """Translate the FinanceBench queries and corpus into Chinese.

    Reads ``queries.jsonl`` and ``corpus.jsonl`` from ``from_dir``, translates
    the ``text`` column of each with the chat model, stores the result in a
    new ``text_zh`` column and writes ``queries.json`` and ``corpus.json``
    (indented JSON arrays) to ``to_dir``.

    Args:
        from_dir: Directory containing ``queries.jsonl`` and ``corpus.jsonl``.
        to_dir: Output directory; created if it does not exist.
        llm_config: Mapping with ``base_url``, ``api_key`` and ``model`` for an
            OpenAI-compatible endpoint.
    """
    import os
    from functools import partial
    import pandas as pd
    from openai import OpenAI

    client = OpenAI(base_url=llm_config['base_url'], api_key=llm_config["api_key"])
    # NOTE(review): max_tokens=1000 caps each reply; long documents may come
    # back truncated — confirm this is acceptable for the corpus.
    gen_resp = partial(
        client.chat.completions.create,
        model=llm_config['model'],
        temperature=0.1,
        top_p=1,
        max_tokens=1000,
    )

    def translate_text(text):
        """Translate ``text`` into Chinese and return the model's reply."""
        response = gen_resp(messages=[dict(role="system",content="将文字翻译成中文，直接输出翻译结果"),dict(role="user",content=text)])
        return response.choices[0].message.content

    query_df = pd.read_json(os.path.join(from_dir, "queries.jsonl"), lines=True)
    corpus_df = pd.read_json(os.path.join(from_dir, "corpus.jsonl"), lines=True)

    # Create the output directory before the (slow) LLM calls so a missing
    # directory cannot discard finished work.
    os.makedirs(to_dir, exist_ok=True)

    # Each frame is saved as soon as its translation is complete.
    query_df["text_zh"] = query_df["text"].apply(translate_text)
    query_df.to_json(os.path.join(to_dir, "queries.json"), orient="records", indent=4, force_ascii=False)
    corpus_df["text_zh"] = corpus_df["text"].apply(translate_text)
    corpus_df.to_json(os.path.join(to_dir, "corpus.json"), orient="records", indent=4, force_ascii=False)
    

def custom_FinanceBench(from_dir, to_dir, llm_config, spliter_config):
    """Chunk and summarise an already-translated FinanceBench corpus.

    Loads the corpus from ``from_dir`` (every row must carry a ``text_zh``
    column), splits each document into chunks with the configured splitter,
    summarises each chunk with the chat model (chunks shorter than 50
    characters are used verbatim) and writes ``to_dir/chunk_sum.json``.

    The corpus is read from ``corpus.jsonl`` when that file exists, otherwise
    from ``corpus.json`` — the indented JSON array that
    ``translate_FinanceBench`` writes.

    Args:
        from_dir: Directory holding the translated corpus.
        to_dir: Output directory; created if it does not exist.
        llm_config: Mapping with ``base_url``, ``api_key`` and ``model`` for an
            OpenAI-compatible endpoint.
        spliter_config: Keyword arguments forwarded to ``init_spliter``.

    Source: https://huggingface.co/datasets/Linq-AI-Research/FinanceRAG
    """
    import pandas as pd
    import os
    from openai import OpenAI
    from functools import partial
    import sys; sys.path.append("..")
    from utils.spliters import init_spliter
    from minirag.utils import compute_mdhash_id

    client = OpenAI(base_url=llm_config['base_url'], api_key=llm_config["api_key"])
    gen_resp = partial(
        client.chat.completions.create,
        model=llm_config['model'],
        temperature=0.1,
        top_p=1,
        max_tokens=1000,
    )

    def summarize_text(text):
        """Return the model's summary of ``text``."""
        response = gen_resp(messages=[dict(role="system",content="为以下内容生成摘要，直接输出结果"),dict(role="user",content=text)])
        return response.choices[0].message.content

    # Prefer the line-delimited file (previous behaviour); fall back to the
    # JSON array produced by the translation step. The queries are not used
    # here, so they are no longer read.
    jsonl_path = os.path.join(from_dir, "corpus.jsonl")
    if os.path.exists(jsonl_path):
        corpus_df = pd.read_json(jsonl_path, lines=True)
    else:
        corpus_df = pd.read_json(os.path.join(from_dir, "corpus.json"))

    os.makedirs(to_dir, exist_ok=True)

    spliter = init_spliter(**spliter_config)
    chunk_sum_list = []
    for _, row in corpus_df.iterrows():
        sentence_df = spliter.split_text_to_sentences(row['text_zh'])
        sentence_df = spliter.add_buffered_sentences(sentence_df)
        chunk_df = spliter.cluster(sentence_df)
        for _, crow in chunk_df.iterrows():
            chunk_text = crow['chunk']
            # Very short chunks are already summary-sized; skip the LLM call.
            summary = chunk_text if len(chunk_text) < 50 else summarize_text(chunk_text)
            chunk_sum_list.append({
                "doc_id": row["_id"],
                # The id is derived from the summary text, not the chunk text.
                "chunk_id": compute_mdhash_id(summary.strip(), prefix="chunk-"),
                "doc_text": row['text_zh'],
                "chunk_text": chunk_text,
                "chunk_sum_text": summary,
            })
    chunk_sum_df = pd.DataFrame(chunk_sum_list)
    # orient='records' never serialises the index. Passing index=False adds
    # nothing and raises ValueError on pandas releases older than 2.1.
    chunk_sum_df.to_json(os.path.join(to_dir, "chunk_sum.json"), orient='records', force_ascii=False, indent=4)



In [None]:
# Translate the raw FinanceBench files into the Chinese ablation directory.
llm_config = {
    "model": "judger",
    "base_url": "http://localhost:12235/v1",
    "api_key": "empty",
}
translate_FinanceBench(
    "resources/data/_raw/FinanceBench",
    "resources/data/ablation_rag/FinanceBench/zh",
    llm_config,
)

In [None]:

llm_config = dict(model="judger", base_url="http://localhost:12235/v1",api_key="empty")
spliter_config = dict(method="doc_seq_model_spliter", model_path="resources/open_models/nlp_bert_document-segmentation_chinese-base")
# The previous call passed only the two directories, which raises TypeError
# because llm_config and spliter_config are required. The input directory is
# the already-translated output, so the chunk-and-summarise step
# (custom_FinanceBench) is the one that applies; custom_FinanceRAG would
# translate the text a second time.
# NOTE(review): confirm custom_FinanceBench is the intended entry point.
custom_FinanceBench(
    "resources/data/ablation_rag/FinanceBench/zh",
    "resources/data/ablation_rag/FinanceBench/",
    llm_config,
    spliter_config,
)
