In [20]:
import pandas as pd
import re

# Yearly review workbooks used as training data: one file per year, 2019 through 2024.
file_paths = [rf"./data/复盘记录{year}.xlsx" for year in range(2019, 2025)]

# Workbook held out for evaluation.
eval_paths = [r"./data/复盘记录2025.xlsx"]

# Earlier, wider column selection kept for reference:
# selected_cols = ("日期", "大盘量比", "涨幅", "涨跌停比", '赚钱效应', '市场情绪', '炸板率', '连板数', '昨板今均', '今日概况')

# Columns exported when the prediction target is the market sentiment.
sentiment_selected_cols = (
    "大盘量比", "涨幅", "涨跌停比", "赚钱效应", "市场情绪", "炸板率", "连板数", "昨板今均",
)
# Columns exported when the prediction target is the earning rate:
# the same selection with the sentiment label left out.
earning_rate_selected_cols = tuple(
    col for col in sentiment_selected_cols if col != "市场情绪"
)

# Columns rendered as percentages when the cell does not already contain '%'.
percent_cols = ["赚钱效应", "炸板率"]


def parse_file(file_paths, selected_cols, percent_cols):
    """Read review workbooks and turn every dated row into a list of labelled strings.

    Each workbook is loaded with ``pd.read_excel``. Rows whose "日期" cell is
    missing are skipped. For every remaining row, one string is produced per
    entry of ``selected_cols``, normally ``f"{label}{value}"``.

    Args:
        file_paths: Iterable of workbook paths. The 4-digit year found in the
            file name is used as the key in the returned year index.
        selected_cols: Column names to export, in output order.
        percent_cols: Column names whose values are rendered as ``"xx.xx%"``
            (value * 100) when the cell does not already contain ``'%'``.

    Returns:
        A ``(result_rows, start_id_of_year)`` tuple. ``result_rows`` is a list
        with one list of strings per exported row, across all files in order.
        ``start_id_of_year`` maps each file's year string to the index in
        ``result_rows`` where that file's rows begin.
    """
    result_rows = []
    start_id_of_year = {}
    for path in file_paths:
        # Take the year from the file name instead of a fixed character offset,
        # so paths with a different directory or prefix length still work.
        path_text = str(path)
        file_name = re.split(r'[\\/]', path_text)[-1]
        year_match = re.search(r'\d{4}', file_name)
        # Fall back to the legacy fixed-offset slice when no year is present.
        year = year_match.group(0) if year_match else path_text[11:15]
        start_id_of_year[year] = len(result_rows)
        df = pd.read_excel(path)

        for _, row in df.iterrows():
            # Rows without a date are not exported.
            if pd.isna(row["日期"]):
                continue

            row_data = []
            for header in selected_cols:
                value = row[header]
                # The label written in front of the value; two columns are renamed.
                label = header

                if header == "日期":
                    value = value.strftime('%Y-%m-%d')
                if header in percent_cols and '%' not in str(value):
                    value = f"{float(value) * 100:.2f}%"
                if header == "涨幅":
                    label = "大盘涨幅"
                if header == "昨板今均":
                    label = "昨日涨停表现"
                if header == "市场情绪":
                    # Drop any parenthesised suffix (ASCII or full-width bracket),
                    # then map the source vocabulary onto the label scale
                    # 超级差 / 很差 / 差 / 一般 / 好 / 很好 / 超级好.
                    value = value.split('(')[0].split('（')[0]
                    value = value.replace("股灾", "超级差")
                    value = value.replace("极差", "很差")
                    value = value.replace("极好", "很好")
                    value = value.replace("爆炸好", "超级好")

                if header == "涨跌停比":
                    # The cell is "<limit-up>:<limit-down>" with an ASCII or
                    # full-width colon; spell out both counts.
                    tmp = re.split(':|：', value)
                    formatted_value = f"涨停数量{tmp[0]}， 跌停数量{tmp[1]}"
                else:
                    formatted_value = f"{label}{value}"
                row_data.append(formatted_value)

            result_rows.append(row_data)
    return result_rows, start_id_of_year

# Training rows and per-year start offsets for each prediction target.
(result_rows_sentiment,
 start_id_of_year_sentiment) = parse_file(file_paths, sentiment_selected_cols, percent_cols)
(result_rows_earning_rate,
 start_id_of_year_earning_rate) = parse_file(file_paths, earning_rate_selected_cols, percent_cols)

# Evaluation rows; the year index is not needed for the evaluation workbook.
eval_data_sentiment, _ = parse_file(eval_paths, sentiment_selected_cols, percent_cols)
eval_data_earning_rate, _ = parse_file(eval_paths, earning_rate_selected_cols, percent_cols)


# Summary: number of exported rows and where each year starts.
print(
    f"sentiment counter: {len(result_rows_sentiment)}",
    f"sentiment year start ids： {start_id_of_year_sentiment}",
    sep="\n",
)

print(
    f"earning_rate counter: {len(result_rows_earning_rate)}",
    f"earning_rate year start ids： {start_id_of_year_earning_rate}",
    sep="\n",
)


sentiment counter: 1423
sentiment year start ids： {'2019': 0, '2020': 244, '2021': 487, '2022': 730, '2023': 972, '2024': 1185}
earning_rate counter: 1423
earning_rate year start ids： {'2019': 0, '2020': 244, '2021': 487, '2022': 730, '2023': 972, '2024': 1185}


In [21]:
import json
def find_col_id(data, target):
    """Return the index of the first entry of ``data`` that contains ``target``.

    Matching is by substring (``entry.find(target)``). Returns -1 when no
    entry contains ``target``.
    """
    matches = (idx for idx, text in enumerate(data) if text.find(target) != -1)
    return next(matches, -1)

def produce_train_data(dataset, date_span, predict_col="市场情绪"):
    """Build instruction samples that predict the next day from ``date_span`` days.

    Rows are consumed in consecutive groups of ``date_span + 1``. The first
    ``date_span`` rows of a group are written into the prompt as numbered days;
    the last row supplies the label, taken from the cell that contains
    ``predict_col`` with the leading ``predict_col`` characters stripped.
    Trailing rows that do not complete a group are dropped.

    Args:
        dataset: List of rows, each a list of "label+value" strings.
        date_span: Number of history days per sample.
        predict_col: Name of the column to predict.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys. The
        number of samples is also printed.

    NOTE(review): for "赚钱效应" the prompt asks for an interval, while the
    emitted output is the raw cell remainder; confirm the consumer maps
    values to intervals.
    """
    # Hint describing the allowed answers; empty for any other target.
    if predict_col == "赚钱效应":
        note = "赚钱效应区间有3个，分别是：0-35%、35%-70%、70%-100%。"
    elif predict_col == "市场情绪":
        note = "市场情绪分类范围是：超级差、很差、差、一般、好、很好、超级好。程度由坏到好。"
    else:
        note = ""

    samples = []
    history = []  # prompt lines collected for the sample under construction
    for position, row in enumerate(dataset, start=1):
        day_no = position % (date_span + 1)
        if day_no:
            history.append(f'第{day_no}天大盘数据是:' + '， '.join(row) + '。\n')
            continue

        # Group complete: this row carries the label.
        col = find_col_id(row, predict_col)
        assert col != -1
        predict_name = predict_col + "区间" if predict_col == "赚钱效应" else predict_col
        samples.append({
            "instruction": f"请你根据近{date_span}天的数据，来预测下一天的{predict_name}。",
            "input": "".join(history) + note + "请你一步步的推理分析，最终<answer>里的答案必须是在这个范围内，而且只给出一个答案。\n",
            "output": row[col][len(predict_col):]
        })
        history = []

    print(f"\n共 {len(samples)} 条记录")
    return samples

def produce_train_data_single_day(dataset, predict_col="市场情绪"):
    """Build one instruction sample per row, predicting a column from the rest.

    For each row, the cell that contains ``predict_col`` is removed from the
    prompt and its remainder (after the leading ``predict_col`` characters)
    becomes the label. The input row itself is not modified.

    Args:
        dataset: List of rows, each a list of "label+value" strings.
        predict_col: Name of the column to predict.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys. The
        number of samples is also printed.
    """
    # Hint describing the allowed answers; empty for any other target.
    if predict_col == "赚钱效应":
        note = "赚钱效应区间有3个，分别是：0-35%、35%-70%、70%-100%。"
    elif predict_col == "市场情绪":
        note = "市场情绪分类范围是：超级差、很差、差、一般、好、很好、超级好。程度由坏到好。"
    else:
        note = ""

    samples = []
    for row in dataset:
        col = find_col_id(row, predict_col)
        assert col != -1
        label = row[col][len(predict_col):]

        # Work on a copy so the caller's row keeps the target cell.
        features = row.copy()
        features.pop(col)

        predict_name = predict_col + "区间" if predict_col == "赚钱效应" else predict_col
        prompt = '， '.join(features) + '。\n'
        samples.append({
            "instruction": f"请你根据一些当天的市场数据，来预测当天的{predict_name}。",
            "input": "当天市场数据是：" + prompt + note + "请你一步步的推理分析，最终<answer>里的答案必须是在这个范围内，而且只给出一个答案。\n",
            "output": label
        })

    print(f"\n共 {len(samples)} 条记录")
    return samples
# sentiment_single_day = produce_train_data_single_day(result_rows_sentiment)
# print(sentiment_single_day)

In [22]:
# Prediction target and destination files for the market-sentiment datasets.
predict = '市场情绪'
output_file1 = './dataset/sentiment_data_3.json'
output_file2 = './dataset/sentiment_data_enchanced_3.json'
output_file3 = './dataset/sentiment_data_5.json'
output_file4 = './dataset/sentiment_data_enchanced_5.json'
output_file5 = './dataset/sentiment_eval.json'
output_file6 = './dataset/sentiment_single_day.json'
output_file7 = './dataset/sentiment_eval_single_day.json'


# 3-day windows over the full training rows.
json_data1 = produce_train_data(result_rows_sentiment, 3, predict)
with open(output_file1, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data1, ensure_ascii=False, indent=4))

# Index of the first 2023 row; the extra passes below start two rows after it.
start = start_id_of_year_sentiment['2023']
print("start id is ", start)
print(f"\n数据已保存为 {output_file1}，共 {len(json_data1)} 条记录")

# 3-day windows again over rows from start + 2 onward, appended to the first pass.
json_data2 = produce_train_data(result_rows_sentiment[start + 2:], 3, predict)
combined1 = json_data1 + json_data2
with open(output_file2, 'w', encoding='utf-8') as f:
    f.write(json.dumps(combined1, ensure_ascii=False, indent=4))
print(f"\n数据已保存为 {output_file2}，共 {len(combined1)} 条记录")

# 5-day windows over the full training rows.
json_data3 = produce_train_data(result_rows_sentiment, 5, predict)
with open(output_file3, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data3, ensure_ascii=False, indent=4))

print(f"\n数据已保存为 {output_file3}，共 {len(json_data3)} 条记录")

# 5-day windows again over rows from start + 2 onward, appended to the first pass.
json_data4 = produce_train_data(result_rows_sentiment[start + 2:], 5, predict)
combined2 = json_data3 + json_data4
with open(output_file4, 'w', encoding='utf-8') as f:
    f.write(json.dumps(combined2, ensure_ascii=False, indent=4))
print(f"\n数据已保存为 {output_file4}，共 {len(combined2)} 条记录")


# Evaluation set: 3-day windows over the evaluation rows.
json_data_eval = produce_train_data(eval_data_sentiment, 3, predict)
with open(output_file5, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data_eval, ensure_ascii=False, indent=4))
print(f"\neval数据已保存为 {output_file5}，共 {len(json_data_eval)} 条记录")

# Single-day samples built from the training rows (default target column).
json_data_single_day = produce_train_data_single_day(result_rows_sentiment)
with open(output_file6, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data_single_day, ensure_ascii=False, indent=4))
print(f"\nsingle day数据已保存为 {output_file6}，共 {len(json_data_single_day)} 条记录")


# Single-day samples built from the evaluation rows.
json_data_eval_single_day = produce_train_data_single_day(eval_data_sentiment)
with open(output_file7, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data_eval_single_day, ensure_ascii=False, indent=4))
print(f"\neval数据已保存为 {output_file7}，共 {len(json_data_eval_single_day)} 条记录")



共 355 条记录
start id is  972

数据已保存为 ./dataset/sentiment_data_3.json，共 355 条记录

共 112 条记录

数据已保存为 ./dataset/sentiment_data_enchanced_3.json，共 467 条记录

共 237 条记录

数据已保存为 ./dataset/sentiment_data_5.json，共 237 条记录

共 74 条记录

数据已保存为 ./dataset/sentiment_data_enchanced_5.json，共 311 条记录

共 10 条记录

eval数据已保存为 ./dataset/sentiment_eval.json，共 10 条记录

共 1423 条记录

single day数据已保存为 ./dataset/sentiment_single_day.json，共 1423 条记录

共 42 条记录

eval数据已保存为 ./dataset/sentiment_eval_single_day.json，共 42 条记录


In [23]:
# Prediction target and destination files for the earning-rate datasets.
predict = '赚钱效应'
output_file1 = './dataset/earning_rate_data_3.json'
output_file2 = './dataset/earning_rate_data_enchanced_3.json'
output_file3 = './dataset/earning_rate_data_5.json'
output_file4 = './dataset/earning_rate_data_enchanced_5.json'
output_file5 = './dataset/earning_rate_eval.json'

# 3-day windows over the full training rows.
json_data1 = produce_train_data(result_rows_earning_rate, 3, predict)
with open(output_file1, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data1, ensure_ascii=False, indent=4))

# Index of the first 2023 row; the extra passes below start two rows after it.
start = start_id_of_year_earning_rate['2023']
print("start id is ", start)
print(f"\n数据已保存为 {output_file1}，共 {len(json_data1)} 条记录")

# 3-day windows again over rows from start + 2 onward, appended to the first pass.
json_data2 = produce_train_data(result_rows_earning_rate[start + 2:], 3, predict)
combined1 = json_data1 + json_data2
with open(output_file2, 'w', encoding='utf-8') as f:
    f.write(json.dumps(combined1, ensure_ascii=False, indent=4))
print(f"\n数据已保存为 {output_file2}，共 {len(combined1)} 条记录")

# 5-day windows over the full training rows.
json_data3 = produce_train_data(result_rows_earning_rate, 5, predict)
with open(output_file3, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data3, ensure_ascii=False, indent=4))

print(f"\n数据已保存为 {output_file3}，共 {len(json_data3)} 条记录")

# 5-day windows again over rows from start + 2 onward, appended to the first pass.
json_data4 = produce_train_data(result_rows_earning_rate[start + 2:], 5, predict)
combined2 = json_data3 + json_data4
with open(output_file4, 'w', encoding='utf-8') as f:
    f.write(json.dumps(combined2, ensure_ascii=False, indent=4))
print(f"\n数据已保存为 {output_file4}，共 {len(combined2)} 条记录")


# Evaluation set: 3-day windows over the evaluation rows.
json_data_eval = produce_train_data(eval_data_earning_rate, 3, predict)
with open(output_file5, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_data_eval, ensure_ascii=False, indent=4))
print(f"\neval数据已保存为 {output_file5}，共 {len(json_data_eval)} 条记录")


共 355 条记录
start id is  972

数据已保存为 ./dataset/earning_rate_data_3.json，共 355 条记录

共 112 条记录

数据已保存为 ./dataset/earning_rate_data_enchanced_3.json，共 467 条记录

共 237 条记录

数据已保存为 ./dataset/earning_rate_data_5.json，共 237 条记录

共 74 条记录

数据已保存为 ./dataset/earning_rate_data_enchanced_5.json，共 311 条记录

共 10 条记录

eval数据已保存为 ./dataset/earning_rate_eval.json，共 10 条记录
