In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LR
# Notebook display settings: show up to 999 rows/columns and keep wide frames
# on a single line instead of wrapping them across several blocks.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('expand_frame_repr', False)

In [2]:
# Month-end calendar: one 'YYYY-MM-DD' string per calendar month for 1990..2023.
# February's last day follows the Gregorian leap-year rule.
mon_lst = [
    f'{y}-{m}'
    for y in range(1990, 2024)
    for m in (
        '01-31',
        '02-29' if (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0) else '02-28',
        '03-31', '04-30', '05-31', '06-30',
        '07-31', '08-31', '09-30', '10-31', '11-30', '12-31',
    )
]

# Drop the last 7 entries (2023-06 .. 2023-12) so the calendar ends at 2023-05-31.
mon_df = pd.DataFrame({'交易月份': mon_lst[:-7]})
# 'YYYY-MM' key used to join against daily data.
mon_df['月份'] = mon_df['交易月份'].str[:7]

stk_df = pd.read_csv('../../data/csmar/公司文件.csv')

# Constant dummy key: merging on it pairs every company row with every month,
# i.e. builds the full stock x month grid.
stk_df['on'] = 0
mon_df['on'] = 0
stock_month_grid = pd.merge(stk_df, mon_df, on=['on'], how='left')
header = stock_month_grid[['证券代码', '交易月份', '月份']]

In [5]:
# 88. er_trend
# Monthly frequency. The trend factor follows the definition in Liu et al. (2020) [8].
# Per the paper, the trend factor (trend) is the average of the value-weighted (VW) returns
# of the 6 portfolios in the TrendHigh group minus those of the TrendLow group.
# The three trend groups (TrendLow, TrendMid, TrendHigh) are split at the 30th and 70th
# percentiles of ERTrend.
# NOTE: this cell only produces the stock-level expected-return signal `er_trend`;
# the portfolio sort described above is not performed here.

ret_col = '不考虑现金红利再投资的月个股回报率'
lags = [3, 5, 10, 20, 50, 100, 200, 300, 400]
price_ma_col = [f'日收盘价_MA{l}' for l in lags]
amount_ma_col = [f'日个股交易金额_MA{l}' for l in lags]
x_col = price_ma_col + amount_ma_col
ebeta_col = [f'EBeta_日收盘价_MA{l}' for l in lags] + [f'EBeta_日个股交易金额_MA{l}' for l in lags]

df1 = pd.read_csv('../../data/csmar/月_月个股回报率文件.csv', usecols=['证券代码', '交易月份', ret_col])
df2 = pd.read_csv('../../data/csmar/日_日个股回报率文件.csv', usecols=['证券代码', '交易日期', '日收盘价', '日个股交易金额'])

# The rolling windows and the "last trading day of the month" pick below both need each
# stock's rows in chronological order, so sort explicitly instead of relying on file order.
# Dates are handled as zero-padded 'YYYY-MM-DD' strings (the slicing below assumes this),
# which sort chronologically.
df2 = df2.sort_values(by=['证券代码', '交易日期']).reset_index(drop=True)

# Signals: l-day moving average of price / traded amount, scaled by the current day's value.
# A zero denominator becomes NaN rather than producing inf.
for col in ['日收盘价', '日个股交易金额']:
    per_stock = df2.groupby('证券代码')[col]
    denom = df2[col].replace(0, np.nan)
    for l in lags:
        # Dropping only the group level keeps the original row labels, so the division
        # is aligned by label and cannot pair a moving average with another row's value.
        moving_avg = per_stock.rolling(l).mean().reset_index(level=0, drop=True)
        df2[f'{col}_MA{l}'] = moving_avg / denom

# Keep each stock's last trading day per month (rows are already date-ordered).
df2['月份'] = df2['交易日期'].str[:7]
df3 = df2.drop_duplicates(subset=['证券代码', '月份'], keep='last')
del df2  # the daily frame is large and no longer needed

# Discard stock-months whose last trading day falls before the 26th.
# NOTE(review): presumably filters out stocks not trading at month end - confirm intent.
df4 = df3[df3['交易日期'].str[-2:] >= '26']

# Attach the month-end signals to the stock x month grid, then to the monthly returns.
df5 = pd.merge(header, df4, on=['证券代码', '月份'], how='left')
df6 = pd.merge(df1, df5, on=['证券代码', '交易月份'], how='left')

# Where a longer-window signal is missing, fall back to the nearest shorter window
# (forward fill across columns, in `lags` order). `.ffill` replaces the deprecated
# `fillna(method='ffill')`.
df6[price_ma_col] = df6[price_ma_col].ffill(axis=1)
df6[amount_ma_col] = df6[amount_ma_col].ffill(axis=1)
df6 = df6.sort_values(by=['证券代码', '交易月份'])

# df7 holds the regression sample: month-t return against the signals of the previous row
# of the same stock (rows are sorted by stock and month).
df7 = df6.copy()
df7.loc[:, x_col] = df7.groupby('证券代码')[x_col].shift(1)

# Pooled regression over all usable rows; only its intercept is used, and it is removed
# from the returns before the monthly no-intercept regressions.
X = df7[x_col].values
Y = df7[ret_col].values
valid = ~np.isnan(X).any(axis=1) & ~np.isnan(Y)
beta0 = LR().fit(X[valid], Y[valid]).intercept_
df7[ret_col] = df7[ret_col] - beta0

# Create the coefficient columns up front so they exist (as NaN) for months skipped below,
# and so the final product cannot fail with a KeyError when no month qualifies.
for c in ebeta_col:
    df6[c] = np.nan

# Month-by-month cross-sectional regression (months iterate in ascending order).
# The coefficients are smoothed exponentially: 0.98 * previous + 0.02 * current.
ebeta = None
for month, month_rows in df7.groupby('交易月份'):
    X = month_rows[x_col].values
    Y = month_rows[ret_col].values
    valid = ~np.isnan(X).any(axis=1) & ~np.isnan(Y)
    # Months with fewer than 5 usable rows are skipped: the smoothed coefficients are not
    # updated and that month's EBeta columns (hence er_trend) stay NaN.
    if valid.sum() >= 5:
        beta = LR(fit_intercept=False).fit(X[valid], Y[valid]).coef_
        ebeta = beta if ebeta is None else 0.98*ebeta + 0.02*beta
        df6.loc[df6['交易月份'] == month, ebeta_col] = ebeta

# Expected return: current-month (unshifted) signals times the smoothed coefficients.
# Any NaN signal or coefficient makes the row's er_trend NaN.
df6['er_trend'] = (df6[x_col].values*df6[ebeta_col].values).sum(axis=1)
df8 = df6[['证券代码', '交易月份', 'er_trend']]
df8.to_csv('../../output/csmar/月_88.csv', index=False, encoding='utf-8-sig')
df8

Unnamed: 0,证券代码,交易月份,er_trend
0,1,1990-01-31,
1,1,1990-02-28,
2,1,1990-03-31,
3,1,1990-04-30,
4,1,1990-05-31,
...,...,...,...
2197475,900957,2023-01-31,0.002832
2197476,900957,2023-02-28,0.016274
2197477,900957,2023-03-31,0.003797
2197478,900957,2023-04-30,0.003388
