In [1]:
import pandas as pd
import glob
import re

In [2]:
# 第一步：处理新闻数据
news_file_paths = glob.glob("news_data/CNH_2024*.csv")
news_dataframes = []

for file_path in news_file_paths:
    df = pd.read_csv(file_path, usecols=['updateTimestamp', 'content', 'title'])
    news_dataframes.append(df)

combined_news_df = pd.concat(news_dataframes)

# 转换updateTimestamp为datetime格式
combined_news_df['updateTimestamp'] = pd.to_datetime(combined_news_df['updateTimestamp'])

# 删除'content'列中以“格隆汇X月X日｜”开头的部分
combined_news_df['content'] = combined_news_df['content'].apply(lambda x: re.sub(r'^格隆汇\d+月\d+日｜', '', x))

# 使用str.replace方法删除'音频 | 格隆汇'开头的内容
combined_news_df['title'] = combined_news_df['title'].str.replace(r'^音频 \| 格隆汇', '', regex=True)

def fill_title(row):
    if pd.isna(row['title']):
        first_sentence = re.split(r'。', row['content'], 1)[0]
        return first_sentence
    return row['title']

combined_news_df['title'] = combined_news_df.apply(fill_title, axis=1)

combined_news_df = combined_news_df.sort_values('updateTimestamp').set_index('updateTimestamp')
combined_news_df

Unnamed: 0_level_0,title,content
updateTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-02 09:17:12,人民币兑美元中间价报7.0770 上调57点,人民币兑美元中间价报7.0770，上调57点；上一交易日中间价7.0827，上一交易日官方收...
2024-01-02 16:31:45,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点。
2024-01-03 06:28:24,美元指数创近十个月最大涨幅 离岸人民币盘中跌超400点失守7.15,十年期英债收益率升超10个基点，创近五个月最大升幅；盘中两年期美债收益率升10个基点、十年期...
2024-01-03 06:49:45,1.3盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、纳指创去年10月底以来最大单日跌幅 道指再创历史新高；\n2、美元指数创近...
2024-01-03 09:16:49,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大。
...,...,...
2024-07-30 07:21:35,7.30盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、美股三大指数收盘涨跌不一 特斯拉涨超5%；\n2、美国原油期货收盘跌穿20...
2024-07-30 09:16:27,人民币兑美元中间价报7.1364 下调48点,人民币兑美元中间价报7.1364，下调48点；上一交易日中间价7.1316，上一交易日官方收...
2024-07-31 06:10:44,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点。成交量4...
2024-07-31 06:35:18,离岸人民币兑美元较周一纽约尾盘涨278点,离岸人民币（CNH）兑美元北京时间04:59报7.2438元，较周一纽约尾盘涨278点，盘中...


In [3]:
# 第二步：处理汇率数据
spot_file_paths = glob.glob("spot_data/USDCNH/*.csv")
spot_dataframes = []

for file_path in spot_file_paths:
    df = pd.read_csv(file_path, usecols=['Unnamed: 0', 'open', 'high', 'low', 'close', 'num_trds'])
    df = df.rename(columns={'Unnamed: 0': 'datetime'})
    df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_convert('Asia/Hong_Kong').dt.tz_localize(None)
    spot_dataframes.append(df)

combined_spot_df = pd.concat(spot_dataframes)
combined_spot_df = combined_spot_df.sort_values('datetime')
combined_spot_df

Unnamed: 0,datetime,open,high,low,close,num_trds
0,2024-01-02 05:59:00,7.1236,7.1237,7.1236,7.1237,3
0,2024-01-02 06:00:00,7.1236,7.1238,7.1235,7.1238,29
1,2024-01-02 06:05:00,7.1232,7.1240,7.1232,7.1237,178
2,2024-01-02 06:06:00,7.1237,7.1239,7.1237,7.1238,80
3,2024-01-02 06:10:00,7.1246,7.1246,7.1236,7.1237,230
...,...,...,...,...,...,...
1435,2024-07-23 04:55:00,7.2961,7.2963,7.2961,7.2963,539
1436,2024-07-23 04:56:00,7.2963,7.2964,7.2963,7.2963,476
1437,2024-07-23 04:57:00,7.2963,7.2964,7.2962,7.2962,499
1438,2024-07-23 04:58:00,7.2962,7.2962,7.2958,7.2958,536


In [4]:
# 第三步：合并新闻数据和汇率数据
combined_news_df.reset_index(inplace=True)
combined_news_df['datetime'] = combined_news_df['updateTimestamp'].dt.floor('min')

# 检查数据框是否包含 'datetime' 列
if 'datetime' not in combined_news_df.columns:
    raise KeyError("combined_news_df does not contain 'datetime' column")

if 'datetime' not in combined_spot_df.columns:
    raise KeyError("combined_spot_df does not contain 'datetime' column")

# 初始合并新闻和汇率数据
final_df = pd.merge(combined_news_df, combined_spot_df, on='datetime', how='left')

# 重命名合并后的 'datetime' 列
if 'datetime_x' in final_df.columns:
    final_df.rename(columns={'datetime_x': 'datetime'}, inplace=True)
if 'datetime_y' in final_df.columns:
    final_df.drop(columns=['datetime_y'], inplace=True)

# 检查初始合并后的数据框是否包含 'datetime' 列
if 'datetime' not in final_df.columns:
    raise KeyError("final_df does not contain 'datetime' column after initial merge")
    
# 提取 'updateTimestamp' 和 'datetime' 列的精确到分钟的时间
final_df['updateTimestamp_min'] = final_df['updateTimestamp'].dt.floor('min')
final_df['datetime_min'] = final_df['datetime'].dt.floor('min')

# 找出不同的行
different_rows = final_df[final_df['updateTimestamp_min'] != final_df['datetime_min']]

# 打印出不同的行
if not different_rows.empty:
    print("以下行的 'updateTimestamp' 和 'datetime' 列的精确到分钟的时间不同:")
    print(different_rows)
else:
    print("所有行的 'updateTimestamp' 和 'datetime' 列的精确到分钟的时间都相同。")
    
final_df

所有行的 'updateTimestamp' 和 'datetime' 列的精确到分钟的时间都相同。


Unnamed: 0,updateTimestamp,title,content,datetime,open,high,low,close,num_trds,updateTimestamp_min,datetime_min
0,2024-01-02 09:17:12,人民币兑美元中间价报7.0770 上调57点,人民币兑美元中间价报7.0770，上调57点；上一交易日中间价7.0827，上一交易日官方收...,2024-01-02 09:17:00,7.1123,7.1125,7.1119,7.1121,950.0,2024-01-02 09:17:00,2024-01-02 09:17:00
1,2024-01-02 16:31:45,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点。,2024-01-02 16:31:00,7.1379,7.1382,7.1377,7.1381,833.0,2024-01-02 16:31:00,2024-01-02 16:31:00
2,2024-01-03 06:28:24,美元指数创近十个月最大涨幅 离岸人民币盘中跌超400点失守7.15,十年期英债收益率升超10个基点，创近五个月最大升幅；盘中两年期美债收益率升10个基点、十年期...,2024-01-03 06:28:00,7.1503,7.1504,7.1502,7.1503,394.0,2024-01-03 06:28:00,2024-01-03 06:28:00
3,2024-01-03 06:49:45,1.3盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、纳指创去年10月底以来最大单日跌幅 道指再创历史新高；\n2、美元指数创近...,2024-01-03 06:49:00,7.1504,7.1506,7.1502,7.1504,1209.0,2024-01-03 06:49:00,2024-01-03 06:49:00
4,2024-01-03 09:16:49,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大。,2024-01-03 09:16:00,7.1527,7.1546,7.1512,7.1545,2018.0,2024-01-03 09:16:00,2024-01-03 09:16:00
...,...,...,...,...,...,...,...,...,...,...,...
807,2024-07-30 07:21:35,7.30盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、美股三大指数收盘涨跌不一 特斯拉涨超5%；\n2、美国原油期货收盘跌穿20...,2024-07-30 07:21:00,,,,,,2024-07-30 07:21:00,2024-07-30 07:21:00
808,2024-07-30 09:16:27,人民币兑美元中间价报7.1364 下调48点,人民币兑美元中间价报7.1364，下调48点；上一交易日中间价7.1316，上一交易日官方收...,2024-07-30 09:16:00,,,,,,2024-07-30 09:16:00,2024-07-30 09:16:00
809,2024-07-31 06:10:44,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点。成交量4...,2024-07-31 06:10:00,,,,,,2024-07-31 06:10:00,2024-07-31 06:10:00
810,2024-07-31 06:35:18,离岸人民币兑美元较周一纽约尾盘涨278点,离岸人民币（CNH）兑美元北京时间04:59报7.2438元，较周一纽约尾盘涨278点，盘中...,2024-07-31 06:35:00,,,,,,2024-07-31 06:35:00,2024-07-31 06:35:00


In [5]:
# 创建延迟x分钟的数据
delays = [1, 3, 5, 10, 30, 60]

for x in delays:
    combined_spot_df[f'datetime_{x}'] = combined_spot_df['datetime'] - pd.Timedelta(minutes=x)
    df_to_merge = combined_spot_df[['datetime', f'datetime_{x}', 'open', 'high', 'low', 'close', 'num_trds']].rename(columns={
        'open': f'open_{x}',
        'high': f'high_{x}',
        'low': f'low_{x}',
        'close': f'close_{x}',
        'num_trds': f'num_trds_{x}'
    })

    # 检查 df_to_merge 是否包含 datetime 列
    if f'datetime_{x}' in df_to_merge.columns:
        final_df = pd.merge(final_df, df_to_merge, left_on='datetime', right_on=f'datetime_{x}', how='left')

        # 重命名合并后的 'datetime' 列
        if 'datetime_x' in final_df.columns:
            final_df.rename(columns={'datetime_x': 'datetime'}, inplace=True)
        if 'datetime_y' in final_df.columns:
            final_df.drop(columns=['datetime_y'], inplace=True)

        # 调试信息：检查合并后是否包含 datetime 列
        if 'datetime' not in final_df.columns:
            print(f"Error: final_df does not contain 'datetime' column after merging with delay {x}")
            print("Columns in final_df:", final_df.columns)
            raise KeyError(f"final_df does not contain 'datetime' column after merging with delay {x}")
        else:
            pass
            # print(f"Successfully merged with delay {x}. Columns in final_df:", final_df.columns)

# 移除不必要的列
for x in delays:
    final_df.drop(columns=[f'datetime_{x}'], inplace=True, errors='ignore')

final_df = final_df.sort_values(['datetime', 'updateTimestamp'])

# 将updateTimestamp重新设为索引，同时保留datetime列
final_df.set_index('updateTimestamp', inplace=True)
final_df

Unnamed: 0_level_0,title,content,datetime,open,high,low,close,num_trds,updateTimestamp_min,datetime_min,...,open_30,high_30,low_30,close_30,num_trds_30,open_60,high_60,low_60,close_60,num_trds_60
updateTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 09:17:12,人民币兑美元中间价报7.0770 上调57点,人民币兑美元中间价报7.0770，上调57点；上一交易日中间价7.0827，上一交易日官方收...,2024-01-02 09:17:00,7.1123,7.1125,7.1119,7.1121,950.0,2024-01-02 09:17:00,2024-01-02 09:17:00,...,7.1251,7.1276,7.1245,7.1268,1767.0,7.1272,7.1276,7.1270,7.1270,835.0
2024-01-02 16:31:45,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点。,2024-01-02 16:31:00,7.1379,7.1382,7.1377,7.1381,833.0,2024-01-02 16:31:00,2024-01-02 16:31:00,...,7.1389,7.1389,7.1386,7.1386,718.0,7.1386,7.1387,7.1385,7.1385,671.0
2024-01-03 06:28:24,美元指数创近十个月最大涨幅 离岸人民币盘中跌超400点失守7.15,十年期英债收益率升超10个基点，创近五个月最大升幅；盘中两年期美债收益率升10个基点、十年期...,2024-01-03 06:28:00,7.1503,7.1504,7.1502,7.1503,394.0,2024-01-03 06:28:00,2024-01-03 06:28:00,...,7.1501,7.1502,7.1500,7.1501,1060.0,7.1507,7.1507,7.1506,7.1507,703.0
2024-01-03 06:49:45,1.3盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、纳指创去年10月底以来最大单日跌幅 道指再创历史新高；\n2、美元指数创近...,2024-01-03 06:49:00,7.1504,7.1506,7.1502,7.1504,1209.0,2024-01-03 06:49:00,2024-01-03 06:49:00,...,7.1504,7.1505,7.1503,7.1504,458.0,7.1506,7.1508,7.1506,7.1507,471.0
2024-01-03 09:16:49,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大。,2024-01-03 09:16:00,7.1527,7.1546,7.1512,7.1545,2018.0,2024-01-03 09:16:00,2024-01-03 09:16:00,...,7.1521,7.1526,7.1517,7.1526,996.0,7.1563,7.1584,7.1563,7.1583,1031.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-30 07:21:35,7.30盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、美股三大指数收盘涨跌不一 特斯拉涨超5%；\n2、美国原油期货收盘跌穿20...,2024-07-30 07:21:00,,,,,,2024-07-30 07:21:00,2024-07-30 07:21:00,...,,,,,,,,,,
2024-07-30 09:16:27,人民币兑美元中间价报7.1364 下调48点,人民币兑美元中间价报7.1364，下调48点；上一交易日中间价7.1316，上一交易日官方收...,2024-07-30 09:16:00,,,,,,2024-07-30 09:16:00,2024-07-30 09:16:00,...,,,,,,,,,,
2024-07-31 06:10:44,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点,在岸人民币兑美元北京时间03:00收报7.2450，较上一交易日夜盘收盘涨155点。成交量4...,2024-07-31 06:10:00,,,,,,2024-07-31 06:10:00,2024-07-31 06:10:00,...,,,,,,,,,,
2024-07-31 06:35:18,离岸人民币兑美元较周一纽约尾盘涨278点,离岸人民币（CNH）兑美元北京时间04:59报7.2438元，较周一纽约尾盘涨278点，盘中...,2024-07-31 06:35:00,,,,,,2024-07-31 06:35:00,2024-07-31 06:35:00,...,,,,,,,,,,


In [6]:
final_df.dropna(inplace=True)

In [7]:
# 定义需要处理的后缀列表
suffixes = [1, 3, 5, 10, 30, 60]

# 遍历每一个后缀，生成对应的 return 列
for x in suffixes:
    final_df[f'Y_open_{x}'] = final_df[f'open_{x}'] / final_df['open'] - 1
    final_df[f'Y_high_{x}'] = final_df[f'high_{x}'] / final_df['high'] - 1
    final_df[f'Y_low_{x}'] = final_df[f'low_{x}'] / final_df['low'] - 1
    final_df[f'Y_close_{x}'] = final_df[f'close_{x}'] / final_df['close'] - 1

In [8]:
final_df = final_df[['title', 'content','open', 'high', 'low', 'close', 'num_trds', 
                     'Y_close_1', 'Y_close_3', 'Y_close_5', 'Y_close_10', 'Y_close_30','Y_close_60']]

In [9]:
# 定义需要处理的后缀列表
suffixes = [1, 3, 5, 10, 30, 60]

# 如果 final_df 是从另一个 DataFrame 切片而来，确保它是一个副本
final_df = final_df.copy()

# 遍历每一个后缀，生成对应的 return 列
for x in suffixes:
    # 使用 loc 进行赋值
    final_df.loc[:, f'Inverse_Y_close_{x}'] = -final_df.loc[:, f'Y_close_{x}'] / (final_df.loc[:, f'Y_close_{x}'] + 1)
final_df

Unnamed: 0_level_0,title,content,open,high,low,close,num_trds,Y_close_1,Y_close_3,Y_close_5,Y_close_10,Y_close_30,Y_close_60,Inverse_Y_close_1,Inverse_Y_close_3,Inverse_Y_close_5,Inverse_Y_close_10,Inverse_Y_close_30,Inverse_Y_close_60
updateTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2024-01-02 09:17:12,人民币兑美元中间价报7.0770 上调57点,人民币兑美元中间价报7.0770，上调57点；上一交易日中间价7.0827，上一交易日官方收...,7.1123,7.1125,7.1119,7.1121,950.0,-0.000211,0.000253,0.000253,-0.000056,0.002067,0.002095,0.000211,-0.000253,-0.000253,0.000056,-0.002063,-0.002091
2024-01-02 16:31:45,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点,在岸人民币兑美元1月2日16:30收盘报7.1320，较上一交易日下跌400点。,7.1379,7.1382,7.1377,7.1381,833.0,0.000014,0.000042,0.000140,0.000182,0.000070,0.000056,-0.000014,-0.000042,-0.000140,-0.000182,-0.000070,-0.000056
2024-01-03 06:28:24,美元指数创近十个月最大涨幅 离岸人民币盘中跌超400点失守7.15,十年期英债收益率升超10个基点，创近五个月最大升幅；盘中两年期美债收益率升10个基点、十年期...,7.1503,7.1504,7.1502,7.1503,394.0,-0.000028,0.000000,0.000042,0.000028,-0.000028,0.000056,0.000028,-0.000000,-0.000042,-0.000028,0.000028,-0.000056
2024-01-03 06:49:45,1.3盘前要点—港A美股你需要关注的大事都在这,国际要闻：\n1、纳指创去年10月底以来最大单日跌幅 道指再创历史新高；\n2、美元指数创近...,7.1504,7.1506,7.1502,7.1504,1209.0,-0.000028,0.000000,-0.000042,-0.000042,0.000000,0.000042,0.000028,-0.000000,0.000042,0.000042,-0.000000,-0.000042
2024-01-03 09:16:49,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大,人民币兑美元中间价较上日调降232点至7.1002，降幅创2023年6月26日以来最大。,7.1527,7.1546,7.1512,7.1545,2018.0,0.000321,0.000070,0.000014,-0.000098,-0.000266,0.000531,-0.000321,-0.000070,-0.000014,0.000098,0.000266,-0.000531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-22 09:05:01,“降息”！中国7月1年期、5年期LPR下调10BP,中国央行将一年期和五年期贷款市场报价利率(LPR)分别下调10个基点至3.35%和3.85%。,7.2910,7.2912,7.2909,7.2911,901.0,-0.000027,0.000069,0.000137,-0.000041,-0.000343,0.000110,0.000027,-0.000069,-0.000137,0.000041,0.000343,-0.000110
2024-07-22 09:05:19,央行：自今日起 将LPR发布时间由每月20日上午9:15调整为9：00,中国央行公告，为加强预期管理，促进LPR发布时间与金融市场运行时间更好衔接，自2024年7月...,7.2910,7.2912,7.2909,7.2911,901.0,-0.000027,0.000069,0.000137,-0.000041,-0.000343,0.000110,0.000027,-0.000069,-0.000137,0.000041,0.000343,-0.000110
2024-07-22 09:12:02,离岸人民币兑美元跌破7.29,离岸人民币兑美元跌破7.2900，最低触及7.2924元。此前中国央行将一年期、五年期LPR...,7.2919,7.2923,7.2917,7.2917,1011.0,0.000082,-0.000123,-0.000261,-0.000302,-0.000507,0.000000,-0.000082,0.000123,0.000261,0.000302,0.000508,-0.000000
2024-07-22 09:16:15,人民币兑美元中间价较上日调降20点至7.1335,人民币兑美元中间价较上日调降20点至7.1335。,7.2908,7.2909,7.2903,7.2906,938.0,-0.000110,-0.000247,-0.000206,-0.000069,-0.000219,0.000123,0.000110,0.000247,0.000206,0.000069,0.000220,-0.000123


In [10]:
# 保存到CSV文件，确保编码格式为utf-8-sig以正确显示中文
final_df.to_csv('data/CNH_news.csv', encoding='utf-8-sig')
# 保存到Excel文件
final_df.to_excel('data/CNH_news.xlsx', engine='openpyxl')