In [1]:
def custom_Dataset_of_financial_news_sentiment_classification(from_dir, to_dir):
    """Merge the train/test CSVs of the financial-news sentiment corpus into one JSONL file.

    Source: https://github.com/wwwxmu/Dataset-of-financial-news-sentiment-classification

    Args:
        from_dir: Directory containing ``train_data.csv`` and ``test_data.csv``.
        to_dir: Output directory (created if missing). The result is written to
            ``Dataset-of-financial-news-sentiment-classification.jsonl``.

    Each output record has three fields: ``split`` ("train" or "test"),
    ``text`` (copied from the '正文' column) and ``label`` (copied from the
    '正负面' column). Rows whose text is missing or an empty string are dropped.
    """
    import os
    import pandas as pd

    # Load each split and attach the normalised columns.
    parts = []
    for split_name, csv_name in (('train', 'train_data.csv'), ('test', 'test_data.csv')):
        raw = pd.read_csv(f"{from_dir}/{csv_name}")
        parts.append(raw.assign(split=split_name, text=raw['正文'], label=raw['正负面']))

    # Merge the splits and keep only the normalised columns.
    merged = pd.concat(parts, ignore_index=True)[['split', 'text', 'label']]

    # Cleaning: discard rows without usable text.
    text_col = merged['text']
    is_blank = text_col.isna() | (text_col == '')
    cleaned = merged[~is_blank]

    # Save the cleaned data, one JSON object per line.
    os.makedirs(to_dir, exist_ok=True)
    out_path = os.path.join(to_dir, 'Dataset-of-financial-news-sentiment-classification.jsonl')
    cleaned.to_json(out_path, orient='records', force_ascii=False, lines=True)


def custom_qlib_stock_dataset(from_dir, to_dir):
    """Export daily features of the 'csi300' market from a Qlib data store to CSV.

    Source data: https://github.com/chenditc/investment_data

    Args:
        from_dir: Passed to ``qlib.init`` as ``provider_uri``.
        to_dir: Output directory (created if missing). The table is written to
            ``csi300_stock_feats.csv``.

    The queried fields are open/high/low/close/volume/amount/vwap between
    '20160101' and '20201231' at 'day' frequency; the leading ``$`` of each
    Qlib field expression is removed from the column names before saving.
    """
    import qlib
    import os
    from qlib.data import D

    # Initialise Qlib's data storage.
    qlib.init(provider_uri=from_dir)

    feature_exprs = ['$open', '$high', '$low', '$close', '$volume', '$amount', '$vwap']
    universe = D.instruments(market='csi300')
    feats = D.features(
        universe,
        feature_exprs,
        start_time='20160101',
        end_time='20201231',
        freq='day',
    )

    # Drop the '$' from column names so the CSV header is plain.
    feats = feats.rename(columns=lambda name: name.replace('$', ''))

    os.makedirs(to_dir, exist_ok=True)
    csv_path = os.path.join(to_dir, "csi300_stock_feats.csv")
    feats.to_csv(csv_path)


def custom_FinCUGE(from_dir, to_dir):
    """Flatten every split of the FinCUGE dataset into a single JSONL file.

    Source: https://huggingface.co/datasets/Maciel/FinCUGE-Instruction

    Args:
        from_dir: Passed straight to ``datasets.load_dataset``.
        to_dir: Output directory (created if missing). The result is written
            to ``FinCUGE.jsonl``.

    Every example is tagged with a ``split`` field holding the name of the
    split it came from, then all splits are concatenated in iteration order.
    """
    from datasets import load_dataset, concatenate_datasets
    import os

    splits = load_dataset(from_dir)

    # `map` is applied immediately for each split, so the lambda sees the
    # split name of the current iteration.
    tagged_parts = [
        part.map(lambda example: {"split": split_name})
        for split_name, part in splits.items()
    ]
    merged = concatenate_datasets(tagged_parts)

    os.makedirs(to_dir, exist_ok=True)
    frame = merged.to_pandas()
    out_path = os.path.join(to_dir, "FinCUGE.jsonl")
    frame.to_json(out_path, orient='records', force_ascii=False, lines=True)


def custom_CPTS(from_dir, to_dir):
    """Convert the CPTS train/test JSON files into one cleaned JSONL file.

    Source: https://github.com/fjiangAI/CPTS

    Args:
        from_dir: Directory containing ``train.json`` and ``test.json``.
        to_dir: Output directory (created if missing). The result is written
            to ``CPTS.jsonl``.

    Each output record keeps ``id``, ``title``, ``topic_list`` and
    ``label_list`` from the input and adds:

    * ``sentences`` - the ``text`` of every entry in ``paragraph_list``;
    * ``masses`` - segment lengths derived from ``label_list``;
    * ``split`` - "train" or "test".
    """
    import pandas as pd
    import os

    def boundary_to_mass(boundary_list):
        """Turn per-item boundary flags into a list of segment lengths.

        A value of 1 at position ``i`` closes the segment that ends at ``i``.
        If the final item is not flagged, the remaining items still form a
        segment, so the lengths always sum to ``len(boundary_list)``.
        """
        ends = [i for i, val in enumerate(boundary_list) if val == 1]
        last_idx = len(boundary_list) - 1
        # Close the trailing segment when the input does not end on a boundary;
        # otherwise those items would silently disappear from the masses.
        if last_idx >= 0 and (not ends or ends[-1] != last_idx):
            ends.append(last_idx)
        starts = [-1] + ends
        return [end - start for start, end in zip(starts, ends)]

    train_data = pd.read_json(f"{from_dir}/train.json")
    test_data = pd.read_json(f"{from_dir}/test.json")
    # Merge the two splits, remembering where each row came from.
    df = pd.concat(
        [train_data.assign(split='train'), test_data.assign(split='test')],
        ignore_index=True,
    )
    # Column-wise map instead of row-wise apply: it always yields a Series
    # (row-wise apply returns a DataFrame on an empty frame) and avoids
    # materialising every row as a Series.
    df['sentences'] = df['paragraph_list'].map(
        lambda paragraphs: [paragraph['text'] for paragraph in paragraphs]
    )
    df['masses'] = df['label_list'].map(boundary_to_mass)

    # Save.
    os.makedirs(to_dir, exist_ok=True)
    columns = ['id', 'title', 'topic_list', 'sentences', 'masses', 'split', 'label_list']
    df[columns].to_json(
        os.path.join(to_dir, 'CPTS.jsonl'),
        orient='records', force_ascii=False, lines=True,
    )

def extract_finna_in_FinCUGE_for_sft(from_dir, to_dir):
    """Export the FINNA task of FinCUGE as chat-style SFT files.

    Args:
        from_dir: Directory containing ``FinCUGE.jsonl`` (as written by
            ``custom_FinCUGE``), with ``instruction``, ``input``, ``output``,
            ``task`` and ``split`` fields.
        to_dir: Output directory (created if missing). Two files are written:
            ``FinCUGE_FINNA_train.jsonl`` and ``FinCUGE_FINNA_eval.jsonl``.

    Each output line is ``{"messages": [...]}`` with three messages: the
    instruction as ``system``, the input as ``user`` and the output as
    ``assistant``.
    """
    import pandas as pd
    import os

    df = pd.read_json(f"{from_dir}/FinCUGE.jsonl", lines=True)

    # Filter first so messages are only built for the rows that get exported.
    finna = df[df['task'] == 'FINNA'].copy()
    # A plain comprehension (rather than row-wise apply) also behaves
    # correctly when there are no FINNA rows at all.
    finna['messages'] = [
        [
            dict(role='system', content=instruction),
            dict(role='user', content=user_input),
            dict(role='assistant', content=answer),
        ]
        for instruction, user_input, answer in zip(
            finna['instruction'], finna['input'], finna['output']
        )
    ]

    # Create the target directory like the other converters do; to_json does
    # not create missing parent directories.
    os.makedirs(to_dir, exist_ok=True)
    for split_name in ('train', 'eval'):
        subset = finna.loc[finna['split'] == split_name, ['messages']]
        subset.to_json(
            os.path.join(to_dir, f"FinCUGE_FINNA_{split_name}.jsonl"),
            orient='records', force_ascii=False, lines=True,
        )


In [2]:
# Convert the raw CPTS files into the cleaned JSONL layout.
# Both paths are relative to the notebook's working directory.
from_dir = "../resources/data/raw/CPTS"
to_dir = "../resources/data/cleaned"
custom_CPTS(from_dir=from_dir, to_dir=to_dir)