In [8]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy
from datetime import datetime
import statsmodels.formula.api as smf

from matplotlib import style
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.font_manager import FontProperties
from pylab import mpl
import platform
import pyreadr

system = platform.system()
if  system == 'Darwin':  # macOS
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
    plt.rcParams['axes.unicode_minus'] = False

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_columns', None)

print(f"当前操作系统: {system}")
print(f"字体设置: {plt.rcParams['font.sans-serif']}")

当前操作系统: Darwin
字体设置: ['Arial Unicode MS']


In [9]:
cross = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/TRD_Mnth202509.csv')
from pandas.tseries.offsets import MonthEnd
cross['month'] = pd.to_datetime(cross['Trdmnt'], format='%Y-%m') + MonthEnd(1)
cross['Stkcd'] = cross['Stkcd'].apply(lambda x: '{:0>6}'.format(x))
cross.rename(columns={'Mretwd': 'Return', 'Msmvosd': 'floatingvalue', 'Msmvttl': 'totalvalue'}, inplace=True)

rf_data = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/Marketret_mon_stock2024.csv')
rf_data['month'] = pd.to_datetime(rf_data['month'], format='%b %Y') + MonthEnd(1)
rf_data = rf_data[['month', 'rfmonth']]

cross = pd.merge(cross, rf_data, on='month', how='left')

cross = cross.sort_values(by=['Stkcd', 'month'])
cross['list_month'] = cross.groupby('Stkcd').cumcount() + 1

cross['ret'] = cross['Return'] - cross['rfmonth']
cross['floatingvalue'] = cross['floatingvalue'] * 1000
cross['totalvalue'] = cross['totalvalue'] * 1000

cross

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret
0,000001,1991-04,3,49.00,30,43.68,13400,6.150000e+05,1.157520e+09,2.118487e+09,20,,,4,1991-04-03,,,1991-04-30,0.006651,1,
1,000001,1991-05,2,43.46,31,38.34,187800,7.675000e+06,1.016010e+09,1.859497e+09,24,-0.122253,0.235714,4,1991-04-03,,,1991-05-31,0.006092,2,-0.128345
2,000001,1991-06,1,38.53,28,33.99,30600,1.094000e+06,9.007350e+08,1.648521e+09,23,-0.113459,-0.113459,4,1991-04-03,,,1991-06-30,0.006092,3,-0.119551
3,000001,1991-07,1,33.65,31,29.54,6100,1.940430e+05,7.828100e+08,1.432695e+09,16,-0.130921,-0.130921,4,1991-04-03,,,1991-07-31,0.006092,4,-0.137013
4,000001,1991-08,1,29.39,31,15.00,3243100,4.957624e+07,6.748338e+08,1.346275e+09,15,-0.411588,-0.411587,4,1991-08-01,,,1991-08-31,0.006092,5,-0.417680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886908,920992,2025-05,6,20.00,30,20.46,92609592,1.922463e+09,5.707967e+08,1.979115e+09,19,0.051312,0.051715,64,2023-04-25,0.0,0.0,2025-05-31,,32,
886909,920992,2025-06,3,20.40,30,21.26,73152031,1.580767e+09,5.931152e+08,2.056500e+09,20,0.039101,0.039101,64,2023-04-25,0.0,0.0,2025-06-30,,33,
886910,920992,2025-07,1,21.13,31,22.44,96064221,2.126994e+09,6.260351e+08,2.170642e+09,23,0.055503,0.055503,64,2023-04-25,0.0,0.0,2025-07-31,,34,
886911,920992,2025-08,1,22.12,29,21.65,63953451,1.432608e+09,6.039955e+08,2.094225e+09,21,-0.035205,-0.035205,64,2023-04-25,0.0,0.0,2025-08-31,,35,


In [11]:
# 生成下一个月的收益率

# 方法1：补全所有个股的日期序列，再使用shift
all_months = pd.DataFrame(cross['month'].unique(), columns=['month'])
all_stocks = pd.DataFrame(cross['Stkcd'].unique(), columns=['Stkcd'])

full_index = all_stocks.merge(all_months, how='cross')

cross_full = full_index.merge(cross, on=['Stkcd', 'month'], how='left')
cross_full = cross_full.sort_values(['Stkcd', 'month'])
cross_full['next_ret'] = cross_full.groupby('Stkcd')['ret'].shift(-1)
cross = cross.merge(cross_full[['Stkcd', 'month', 'next_ret']],
                    on=['Stkcd', 'month'], how='right')

cross['Cumsum_tradingday'] = cross.groupby('Stkcd')['Ndaytrd'].transform(lambda x: x.rolling(window=12, min_periods=1).sum())
print("方法1完成：补全日期序列后使用shift")

cross = cross[(cross['month'] >= '1995-01-31') & (cross['month'] <= '2024-12-31')]
cross

方法1完成：补全日期序列后使用shift


Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret,next_ret_x,Cumsum_tradingday,next_ret_y
0,000001,1995-01,3.0,10.50,27.0,10.54,17488100.0,1.867690e+08,3.134297e+09,4.543463e+09,19.0,0.005725,0.005725,4.0,1994-09-02,,,1995-01-31,0.008719,46.0,-0.002994,-0.000180,19.0,-0.000180
1,000001,1995-02,13.0,10.60,28.0,10.63,8544200.0,9.192790e+07,3.161060e+09,4.582260e+09,12.0,0.008539,0.008539,4.0,1994-09-02,,,1995-02-28,0.008719,47.0,-0.000180,0.003511,31.0,0.003511
2,000001,1995-03,1.0,10.63,31.0,10.76,18810600.0,2.059094e+08,3.199719e+09,4.638299e+09,23.0,0.012230,0.012230,4.0,1994-09-02,,,1995-03-31,0.008719,48.0,0.003511,-0.108161,54.0,-0.108161
3,000001,1995-04,3.0,10.78,28.0,9.69,13235500.0,1.380604e+08,2.881531e+09,4.177055e+09,20.0,-0.099442,-0.099442,4.0,1994-09-02,,,1995-04-30,0.008719,49.0,-0.108161,0.002633,74.0,0.002633
4,000001,1995-05,2.0,9.69,31.0,9.80,53197900.0,6.049730e+08,2.914242e+09,4.224473e+09,22.0,0.011352,0.011352,4.0,1994-09-02,,,1995-05-31,0.008719,50.0,0.002633,-0.073005,96.0,-0.073005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112835,920992,2024-08,1.0,8.50,30.0,8.19,14436632.0,1.185912e+08,2.284861e+08,7.922264e+08,22.0,-0.037603,-0.037603,64.0,2023-04-25,0.0,0.0,2024-08-31,0.001118,23.0,-0.038721,0.276049,242.0,0.276049
2112836,920992,2024-09,2.0,8.15,30.0,10.46,18109819.0,1.566140e+08,2.918149e+08,1.011806e+09,19.0,0.277167,0.277167,64.0,2023-04-25,0.0,0.0,2024-09-30,0.001118,24.0,0.276049,0.389895,241.0,0.389895
2112837,920992,2024-10,8.0,13.02,31.0,14.55,86774237.0,1.145494e+09,4.059185e+08,1.407435e+09,18.0,0.391013,0.391013,64.0,2023-04-25,0.0,0.0,2024-10-31,0.001118,25.0,0.389895,0.241494,242.0,0.241494
2112838,920992,2024-11,1.0,14.69,29.0,18.08,130783392.0,2.346534e+09,5.043990e+08,1.748895e+09,21.0,0.242612,0.242612,64.0,2023-04-25,0.0,0.0,2024-11-30,0.001118,26.0,0.241494,-0.243928,241.0,-0.243928


In [13]:
from pandas.tseries.offsets import MonthEnd
Market_ret = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/Marketret_mon_stock2024.csv')
Market_ret['month'] = pd.to_datetime(Market_ret['month'], format='%b %Y') + MonthEnd(0)
Market_ret.set_index('month', inplace=True)
Market_ret.sort_index(inplace=True)
Market_ret = Market_ret.drop(columns=['Unnamed: 0'])
Market_ret.rename(columns={'ret': 'MKT'}, inplace=True)
Market_ret

Unnamed: 0_level_0,MarketR,MarketR_e,rfmonth,MKT,ret_e,marketret3,marketret6,marketret12,Q
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1991-01-31,0.029998,0.036554,0.006930,0.023068,0.029624,-0.084127,-0.305662,0.254049,1991 Q1
1991-02-28,0.010203,0.021860,0.006930,0.003273,0.014930,-0.183573,-0.384745,0.241492,1991 Q1
1991-03-31,-0.099663,-0.060425,0.006930,-0.106593,-0.067355,-0.252928,-0.445049,0.288857,1991 Q1
1991-04-30,-0.079652,-0.031449,0.006651,-0.086303,-0.038100,-0.234776,-0.394937,0.691749,1991 Q2
1991-05-31,-0.074521,0.005375,0.006092,-0.080613,-0.000717,-0.236294,0.181673,1.542701,1991 Q2
...,...,...,...,...,...,...,...,...,...
2024-08-31,-0.031468,-0.033685,0.001118,-0.032586,-0.034803,0.151050,,,2024 Q3
2024-09-30,0.195992,0.227078,0.001118,0.194874,0.225960,0.200147,,,2024 Q3
2024-10-31,-0.003409,0.070220,0.001118,-0.004527,0.069102,0.000228,,,2024 Q4
2024-11-30,0.009724,0.045312,0.001118,0.008606,0.044194,,,,2024 Q4


In [14]:
cross = pd.merge(cross,Market_ret[['MKT']],left_on='month',right_on='month',how='left')
cross

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret,next_ret_x,Cumsum_tradingday,next_ret_y,MKT
0,000001,1995-01,3.0,10.50,27.0,10.54,17488100.0,1.867690e+08,3.134297e+09,4.543463e+09,19.0,0.005725,0.005725,4.0,1994-09-02,,,1995-01-31,0.008719,46.0,-0.002994,-0.000180,19.0,-0.000180,-0.125017
1,000001,1995-02,13.0,10.60,28.0,10.63,8544200.0,9.192790e+07,3.161060e+09,4.582260e+09,12.0,0.008539,0.008539,4.0,1994-09-02,,,1995-02-28,0.008719,47.0,-0.000180,0.003511,31.0,0.003511,-0.022778
2,000001,1995-03,1.0,10.63,31.0,10.76,18810600.0,2.059094e+08,3.199719e+09,4.638299e+09,23.0,0.012230,0.012230,4.0,1994-09-02,,,1995-03-31,0.008719,48.0,0.003511,-0.108161,54.0,-0.108161,0.122163
3,000001,1995-04,3.0,10.78,28.0,9.69,13235500.0,1.380604e+08,2.881531e+09,4.177055e+09,20.0,-0.099442,-0.099442,4.0,1994-09-02,,,1995-04-30,0.008719,49.0,-0.108161,0.002633,74.0,0.002633,-0.113023
4,000001,1995-05,2.0,9.69,31.0,9.80,53197900.0,6.049730e+08,2.914242e+09,4.224473e+09,22.0,0.011352,0.011352,4.0,1994-09-02,,,1995-05-31,0.008719,50.0,0.002633,-0.073005,96.0,-0.073005,0.166736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112835,920992,2024-08,1.0,8.50,30.0,8.19,14436632.0,1.185912e+08,2.284861e+08,7.922264e+08,22.0,-0.037603,-0.037603,64.0,2023-04-25,0.0,0.0,2024-08-31,0.001118,23.0,-0.038721,0.276049,242.0,0.276049,-0.032586
2112836,920992,2024-09,2.0,8.15,30.0,10.46,18109819.0,1.566140e+08,2.918149e+08,1.011806e+09,19.0,0.277167,0.277167,64.0,2023-04-25,0.0,0.0,2024-09-30,0.001118,24.0,0.276049,0.389895,241.0,0.389895,0.194874
2112837,920992,2024-10,8.0,13.02,31.0,14.55,86774237.0,1.145494e+09,4.059185e+08,1.407435e+09,18.0,0.391013,0.391013,64.0,2023-04-25,0.0,0.0,2024-10-31,0.001118,25.0,0.389895,0.241494,242.0,0.241494,-0.004527
2112838,920992,2024-11,1.0,14.69,29.0,18.08,130783392.0,2.346534e+09,5.043990e+08,1.748895e+09,21.0,0.242612,0.242612,64.0,2023-04-25,0.0,0.0,2024-11-30,0.001118,26.0,0.241494,-0.243928,241.0,-0.243928,0.008606


### 导入价值数据

In [16]:
EP = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/EP_individual_mon2024.csv')
EP['Stkcd'] = EP['Stkcd'].apply(lambda x: '{:0>6}'.format(x))

EP['year'] = EP['month'].astype(int)
EP['month_decimal'] = EP['month'] - EP['year']
EP['month_num'] = (EP['month_decimal'] * 12).round().astype(int) + 1

EP.loc[EP['month_num'] > 12, 'year'] += 1
EP.loc[EP['month_num'] > 12, 'month_num'] -= 12
EP['month'] = pd.to_datetime(EP['year'].astype(str) + '-' + EP['month_num'].astype(str) + '-01')
EP = EP.drop(['year', 'month_decimal', 'month_num'], axis=1)
EP['month'] = EP['month'] + MonthEnd(1)
EP = EP[['Stkcd', 'month', 'ep', 'ep_recent']]
EP

Unnamed: 0,Stkcd,month,ep,ep_recent
0,000001,1991-04-30,,
1,000001,1991-05-31,,
2,000001,1991-06-30,,
3,000001,1991-07-31,,
4,000001,1991-08-31,,
...,...,...,...,...
809599,605599,2024-08-31,0.079184,0.046498
809600,605599,2024-09-30,0.071525,0.042001
809601,605599,2024-10-31,0.077932,0.063816
809602,605599,2024-11-30,0.072171,0.059100


In [17]:
cross = pd.merge(cross,EP[['Stkcd','month','ep','ep_recent']],on=['Stkcd','month'],how='left')
cross

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret,next_ret_x,Cumsum_tradingday,next_ret_y,MKT,ep,ep_recent
0,000001,1995-01,3.0,10.50,27.0,10.54,17488100.0,1.867690e+08,3.134297e+09,4.543463e+09,19.0,0.005725,0.005725,4.0,1994-09-02,,,1995-01-31,0.008719,46.0,-0.002994,-0.000180,19.0,-0.000180,-0.125017,0.078427,0.078427
1,000001,1995-02,13.0,10.60,28.0,10.63,8544200.0,9.192790e+07,3.161060e+09,4.582260e+09,12.0,0.008539,0.008539,4.0,1994-09-02,,,1995-02-28,0.008719,47.0,-0.000180,0.003511,31.0,0.003511,-0.022778,0.077763,0.077763
2,000001,1995-03,1.0,10.63,31.0,10.76,18810600.0,2.059094e+08,3.199719e+09,4.638299e+09,23.0,0.012230,0.012230,4.0,1994-09-02,,,1995-03-31,0.008719,48.0,0.003511,-0.108161,54.0,-0.108161,0.122163,0.076823,0.076823
3,000001,1995-04,3.0,10.78,28.0,9.69,13235500.0,1.380604e+08,2.881531e+09,4.177055e+09,20.0,-0.099442,-0.099442,4.0,1994-09-02,,,1995-04-30,0.008719,49.0,-0.108161,0.002633,74.0,0.002633,-0.113023,0.085306,0.085306
4,000001,1995-05,2.0,9.69,31.0,9.80,53197900.0,6.049730e+08,2.914242e+09,4.224473e+09,22.0,0.011352,0.011352,4.0,1994-09-02,,,1995-05-31,0.008719,50.0,0.002633,-0.073005,96.0,-0.073005,0.166736,0.084349,0.084349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112835,920992,2024-08,1.0,8.50,30.0,8.19,14436632.0,1.185912e+08,2.284861e+08,7.922264e+08,22.0,-0.037603,-0.037603,64.0,2023-04-25,0.0,0.0,2024-08-31,0.001118,23.0,-0.038721,0.276049,242.0,0.276049,-0.032586,,
2112836,920992,2024-09,2.0,8.15,30.0,10.46,18109819.0,1.566140e+08,2.918149e+08,1.011806e+09,19.0,0.277167,0.277167,64.0,2023-04-25,0.0,0.0,2024-09-30,0.001118,24.0,0.276049,0.389895,241.0,0.389895,0.194874,,
2112837,920992,2024-10,8.0,13.02,31.0,14.55,86774237.0,1.145494e+09,4.059185e+08,1.407435e+09,18.0,0.391013,0.391013,64.0,2023-04-25,0.0,0.0,2024-10-31,0.001118,25.0,0.389895,0.241494,242.0,0.241494,-0.004527,,
2112838,920992,2024-11,1.0,14.69,29.0,18.08,130783392.0,2.346534e+09,5.043990e+08,1.748895e+09,21.0,0.242612,0.242612,64.0,2023-04-25,0.0,0.0,2024-11-30,0.001118,26.0,0.241494,-0.243928,241.0,-0.243928,0.008606,,


### 删除30%

In [18]:
fenweishu = pd.DataFrame(
    cross.groupby(['month'])['totalvalue'].quantile(0.3))
fenweishu.columns = ['fenweishu_guimo']
fenweishu

Unnamed: 0_level_0,fenweishu_guimo
month,Unnamed: 1_level_1
1995-01-31,3.958033e+08
1995-02-28,3.907805e+08
1995-03-31,4.341120e+08
1995-04-30,3.835094e+08
1995-05-31,4.590161e+08
...,...
2024-08-31,2.442662e+09
2024-09-30,3.012217e+09
2024-10-31,3.234660e+09
2024-11-30,3.392681e+09


In [19]:
cross_new = pd.merge(cross,fenweishu,on='month',how='left')
cross_new = cross_new[cross_new['totalvalue'] > cross_new['fenweishu_guimo']]
cross_new

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret,next_ret_x,Cumsum_tradingday,next_ret_y,MKT,ep,ep_recent,fenweishu_guimo
0,000001,1995-01,3.0,10.50,27.0,10.54,17488100.0,1.867690e+08,3.134297e+09,4.543463e+09,19.0,0.005725,0.005725,4.0,1994-09-02,,,1995-01-31,0.008719,46.0,-0.002994,-0.000180,19.0,-0.000180,-0.125017,0.078427,0.078427,3.958033e+08
1,000001,1995-02,13.0,10.60,28.0,10.63,8544200.0,9.192790e+07,3.161060e+09,4.582260e+09,12.0,0.008539,0.008539,4.0,1994-09-02,,,1995-02-28,0.008719,47.0,-0.000180,0.003511,31.0,0.003511,-0.022778,0.077763,0.077763,3.907805e+08
2,000001,1995-03,1.0,10.63,31.0,10.76,18810600.0,2.059094e+08,3.199719e+09,4.638299e+09,23.0,0.012230,0.012230,4.0,1994-09-02,,,1995-03-31,0.008719,48.0,0.003511,-0.108161,54.0,-0.108161,0.122163,0.076823,0.076823,4.341120e+08
3,000001,1995-04,3.0,10.78,28.0,9.69,13235500.0,1.380604e+08,2.881531e+09,4.177055e+09,20.0,-0.099442,-0.099442,4.0,1994-09-02,,,1995-04-30,0.008719,49.0,-0.108161,0.002633,74.0,0.002633,-0.113023,0.085306,0.085306,3.835094e+08
4,000001,1995-05,2.0,9.69,31.0,9.80,53197900.0,6.049730e+08,2.914242e+09,4.224473e+09,22.0,0.011352,0.011352,4.0,1994-09-02,,,1995-05-31,0.008719,50.0,0.002633,-0.073005,96.0,-0.073005,0.166736,0.084349,0.084349,4.590161e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112451,920985,2022-08,8.0,11.05,31.0,11.43,245414411.0,3.490897e+09,1.352501e+09,3.445035e+09,18.0,,,64.0,2022-08-08,0.0,0.0,2022-08-31,0.001241,1.0,,-0.124667,18.0,-0.124667,-0.020875,,,3.370444e+09
2112452,920985,2022-09,1.0,11.60,30.0,10.02,69513984.0,7.992666e+08,1.185657e+09,3.100952e+09,21.0,-0.123360,-0.123360,64.0,2022-09-06,0.0,0.0,2022-09-30,0.001307,2.0,-0.124667,0.025581,39.0,0.025581,-0.065953,,,3.024944e+09
2112453,920985,2022-10,10.0,10.06,31.0,10.29,33100688.0,3.486847e+08,1.217606e+09,3.184510e+09,16.0,0.026946,0.026946,64.0,2022-09-06,0.0,0.0,2022-10-31,0.001365,3.0,0.025581,-0.014970,55.0,-0.014970,-0.042599,,,3.165959e+09
2112477,920985,2024-10,8.0,8.60,31.0,15.31,486954437.0,5.439326e+09,2.058722e+09,4.738081e+09,18.0,1.261448,1.261448,64.0,2023-02-22,0.0,0.0,2024-10-31,0.001118,27.0,1.260330,-0.157878,242.0,-0.157878,-0.004527,,,3.234660e+09


In [20]:
cross_new = cross_new[cross_new['Ndaytrd'] >= 12]
# cross_new = cross_new[cross_new['Clsdt'] >= 5]
cross_new = cross_new[cross_new['list_month'] > 6]
cross_new = cross_new[cross_new['Cumsum_tradingday'] >= 120]
cross_new = cross_new[(cross_new['Markettype'] == 1) | (cross_new['Markettype'] == 4) | (cross_new['Markettype'] == 16)] # 上海A 深圳A 创业板
# ep是完整的
cross_new = cross_new.dropna(subset=['ep'])
cross_new

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,floatingvalue,totalvalue,Ndaytrd,Return,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M,month,rfmonth,list_month,ret,next_ret_x,Cumsum_tradingday,next_ret_y,MKT,ep,ep_recent,fenweishu_guimo
6,000001,1995-07,3.0,9.10,31.0,9.14,9586800.0,8.756887e+07,2.717977e+09,3.939967e+09,21.0,-0.003272,-0.003272,4.0,1994-09-02,,,1995-07-31,0.008719,52.0,-0.011991,0.048174,139.0,0.048174,0.059671,0.090439,0.090439,4.442522e+08
7,000001,1995-08,1.0,9.12,31.0,9.66,69663100.0,7.113924e+08,2.872610e+09,4.164123e+09,21.0,0.056893,0.056893,4.0,1994-09-02,,,1995-08-31,0.008719,53.0,0.048174,0.000984,160.0,0.000984,0.041040,0.085571,0.051992,4.491256e+08
8,000001,1995-09,4.0,9.80,29.0,7.88,26896000.0,2.554183e+08,2.811988e+09,4.076185e+09,20.0,0.009703,0.009938,4.0,1995-09-25,,,1995-09-30,0.008719,54.0,0.000984,-0.068364,180.0,-0.068364,-0.014261,0.087417,0.053114,4.408452e+08
9,000001,1995-10,4.0,7.90,31.0,7.41,22357400.0,1.709354e+08,2.644268e+09,3.833062e+09,20.0,-0.059645,-0.059645,4.0,1995-09-25,,,1995-10-31,0.008719,55.0,-0.068364,-0.064050,200.0,-0.064050,0.000762,0.092962,0.056483,4.386115e+08
10,000001,1995-11,1.0,7.50,30.0,7.00,13498868.0,9.827970e+07,2.497959e+09,3.620977e+09,22.0,-0.055331,-0.055331,4.0,1995-09-25,,,1995-11-30,0.008719,56.0,-0.064050,-0.090148,222.0,-0.090148,-0.091525,0.098407,0.059791,4.146652e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1779475,605599,2024-08,1.0,11.51,30.0,10.74,60492042.0,6.675452e+08,6.300920e+09,8.353334e+09,22.0,-0.065274,-0.065274,1.0,2022-09-09,0.0,0.0,2024-08-31,0.001118,36.0,-0.066392,0.105958,242.0,0.105958,-0.032586,0.079184,0.046498,2.442662e+09
1779476,605599,2024-09,2.0,10.56,30.0,11.89,68255221.0,7.093424e+08,9.247778e+09,9.247778e+09,19.0,0.107076,0.107076,1.0,2024-09-09,0.0,0.0,2024-09-30,0.001118,37.0,0.105958,-0.105407,241.0,-0.105407,0.194874,0.071525,0.042001,3.012217e+09
1779477,605599,2024-10,8.0,12.91,31.0,10.65,100720810.0,1.106652e+09,8.283334e+09,8.283334e+09,18.0,-0.104289,-0.104289,1.0,2024-09-09,0.0,0.0,2024-10-31,0.001118,38.0,-0.105407,0.078694,242.0,0.078694,-0.004527,0.077932,0.063816,3.234660e+09
1779478,605599,2024-11,1.0,10.63,29.0,11.50,108258572.0,1.199394e+09,8.944445e+09,8.944445e+09,21.0,0.079812,0.079812,1.0,2024-09-09,0.0,0.0,2024-11-30,0.001118,39.0,0.078694,-0.014161,241.0,-0.014161,0.008606,0.072171,0.059100,3.392681e+09


### 异常换手率

In [29]:
ret_day = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/ret_day2024.csv')
ret_day['Stkcd'] = ret_day['Stkcd'].apply(lambda x: '{:0>6}'.format(x))
ret_day['Day'] = pd.to_datetime(ret_day['Day'], format='%Y-%m-%d')
ret_day['month'] = ret_day['Day'] + MonthEnd(0)
ret_day

Unnamed: 0,Day,Stkcd,y,Q,month,Opnprc,Hiprc,Loprc,Clsprc,TradingVolume,TradingMoney,CirculationValue,TotalValue,Return_1,Return_2,Adjprcwd,Adjprcnd,Markettype,Capchgdt,Trdsta,Ahshrtrd_D,Ahvaltrd_D,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus,All_shares,A_float_shares,H_float_shares,Pre_shares,ratio,Freq,Freq_Q,Freq_y,yuemo,jimo,rfday,r
0,1990-12-19,600601,1990,1990.75,1990-12-31,185.30,185.30,185.30,185.30,200.0,37060.0,1.686230e+08,1.853000e+08,2.706000,2.706000,185.300000,185.300000,1,1990-12-19,1,,,,,,,,1000000.0,910000.0,0.0,0.0,0.910000,8,8,8,0,0,0.000227,2.705773
1,1990-12-19,600602,1990,1990.75,1990-12-31,365.70,384.00,365.70,384.00,1160.0,443610.0,1.885440e+08,7.680000e+08,2.840000,2.840000,384.000000,384.000000,1,1990-12-19,1,,,,,,,,2000000.0,491000.0,0.0,0.0,0.245500,8,8,8,0,0,0.000227,2.839773
2,1990-12-19,600651,1990,1990.75,1990-12-31,320.30,320.30,320.30,320.30,2.0,640.6,1.056990e+07,1.056990e+07,3.575714,3.575714,320.300000,320.300000,1,1990-12-19,1,,,,,,,,33000.0,33000.0,0.0,0.0,1.000000,8,8,8,0,0,0.000227,3.575487
3,1990-12-19,600656,1990,1990.75,1990-12-31,260.00,260.00,260.00,260.00,50.0,13000.0,1.170286e+08,6.412952e+08,1.600000,1.600000,260.000000,260.000000,1,1990-12-19,1,,,,,,,,2466520.0,450110.0,0.0,0.0,0.182488,5,5,5,0,0,0.000227,1.599773
4,1990-12-20,600601,1990,1990.75,1990-12-31,185.30,194.60,185.30,194.60,84.0,16160.4,1.770860e+08,1.946000e+08,0.050189,0.050189,194.600000,194.600000,1,1990-12-19,1,,,185.30,0.050189,,,0.0,1000000.0,910000.0,0.0,0.0,0.910000,8,8,8,0,0,0.000227,0.049962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14662441,2024-12-31,605580,2024,2024.75,2024-12-31,10.07,11.08,10.06,11.08,7669759.0,84447640.0,3.102400e+09,3.102400e+09,0.100298,0.100298,17.474255,15.511980,1,2024-08-26,1,0.0,0.0,10.07,0.100298,9.06,11.08,1.0,280000000.0,280000000.0,0.0,0.0,1.000000,22,61,242,1,1,0.000037,0.100261
14662442,2024-12-31,605588,2024,2024.75,2024-12-31,42.01,42.45,40.63,40.76,1342400.0,55579476.0,2.979538e+09,2.995151e+09,-0.034809,-0.034809,41.098544,40.759952,1,2024-12-17,1,0.0,0.0,42.23,-0.034809,38.01,46.45,0.0,73482601.0,73099561.0,0.0,0.0,0.994787,22,61,242,1,1,0.000037,-0.034846
14662443,2024-12-31,605589,2024,2024.75,2024-12-31,24.28,24.36,23.51,23.58,8651436.0,206018788.0,1.835846e+10,1.995943e+10,-0.022388,-0.022388,24.775919,23.579966,1,2024-11-07,1,0.0,0.0,24.12,-0.022388,21.71,26.53,0.0,846455998.0,778560800.0,0.0,0.0,0.919789,22,61,242,1,1,0.000037,-0.022425
14662444,2024-12-31,605598,2024,2024.75,2024-12-31,20.90,22.99,20.37,22.41,11167299.0,245155786.0,5.446007e+09,5.507500e+09,0.072249,0.072249,31.807557,31.373944,1,2024-09-26,1,0.0,0.0,20.90,0.072249,18.81,22.99,0.0,245760841.0,243016841.0,0.0,0.0,0.988835,22,61,242,1,1,0.000037,0.072212


### Fama-French 3 Factors 日数据

In [23]:
fama3 = pyreadr.read_r('/Users/liuwanting/Desktop/pythonhomework/FF3_daily2024.RDS')[None]
fama3['Day'] = pd.to_datetime(fama3['Day'], format='%Y-%m-%d')
fama3

Unnamed: 0,Day,mkt.ff,smb.ff,hml.ff
0,1995-01-03,-0.009724,0.001605,-0.001592
1,1995-01-04,0.023004,-0.004303,0.006382
2,1995-01-05,-0.009166,-0.004228,0.004423
3,1995-01-06,-0.007825,0.003721,0.006375
4,1995-01-09,-0.027512,0.001165,0.005851
...,...,...,...,...
7279,2024-12-25,-0.003003,-0.023692,0.009211
7280,2024-12-26,0.002310,0.011929,-0.011406
7281,2024-12-27,0.001099,0.008671,0.007060
7282,2024-12-30,0.001373,-0.017230,0.004754


In [30]:
ret_day = ret_day.merge(fama3[['Day','mkt.ff','smb.ff','hml.ff']],on='Day',how='left')
ret_day

Unnamed: 0,Day,Stkcd,y,Q,month,Opnprc,Hiprc,Loprc,Clsprc,TradingVolume,TradingMoney,CirculationValue,TotalValue,Return_1,Return_2,Adjprcwd,Adjprcnd,Markettype,Capchgdt,Trdsta,Ahshrtrd_D,Ahvaltrd_D,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus,All_shares,A_float_shares,H_float_shares,Pre_shares,ratio,Freq,Freq_Q,Freq_y,yuemo,jimo,rfday,r,mkt.ff,smb.ff,hml.ff
0,1990-12-19,600601,1990,1990.75,1990-12-31,185.30,185.30,185.30,185.30,200.0,37060.0,1.686230e+08,1.853000e+08,2.706000,2.706000,185.300000,185.300000,1,1990-12-19,1,,,,,,,,1000000.0,910000.0,0.0,0.0,0.910000,8,8,8,0,0,0.000227,2.705773,,,
1,1990-12-19,600602,1990,1990.75,1990-12-31,365.70,384.00,365.70,384.00,1160.0,443610.0,1.885440e+08,7.680000e+08,2.840000,2.840000,384.000000,384.000000,1,1990-12-19,1,,,,,,,,2000000.0,491000.0,0.0,0.0,0.245500,8,8,8,0,0,0.000227,2.839773,,,
2,1990-12-19,600651,1990,1990.75,1990-12-31,320.30,320.30,320.30,320.30,2.0,640.6,1.056990e+07,1.056990e+07,3.575714,3.575714,320.300000,320.300000,1,1990-12-19,1,,,,,,,,33000.0,33000.0,0.0,0.0,1.000000,8,8,8,0,0,0.000227,3.575487,,,
3,1990-12-19,600656,1990,1990.75,1990-12-31,260.00,260.00,260.00,260.00,50.0,13000.0,1.170286e+08,6.412952e+08,1.600000,1.600000,260.000000,260.000000,1,1990-12-19,1,,,,,,,,2466520.0,450110.0,0.0,0.0,0.182488,5,5,5,0,0,0.000227,1.599773,,,
4,1990-12-20,600601,1990,1990.75,1990-12-31,185.30,194.60,185.30,194.60,84.0,16160.4,1.770860e+08,1.946000e+08,0.050189,0.050189,194.600000,194.600000,1,1990-12-19,1,,,185.30,0.050189,,,0.0,1000000.0,910000.0,0.0,0.0,0.910000,8,8,8,0,0,0.000227,0.049962,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14662441,2024-12-31,605580,2024,2024.75,2024-12-31,10.07,11.08,10.06,11.08,7669759.0,84447640.0,3.102400e+09,3.102400e+09,0.100298,0.100298,17.474255,15.511980,1,2024-08-26,1,0.0,0.0,10.07,0.100298,9.06,11.08,1.0,280000000.0,280000000.0,0.0,0.0,1.000000,22,61,242,1,1,0.000037,0.100261,-0.017476,-0.008591,0.009624
14662442,2024-12-31,605588,2024,2024.75,2024-12-31,42.01,42.45,40.63,40.76,1342400.0,55579476.0,2.979538e+09,2.995151e+09,-0.034809,-0.034809,41.098544,40.759952,1,2024-12-17,1,0.0,0.0,42.23,-0.034809,38.01,46.45,0.0,73482601.0,73099561.0,0.0,0.0,0.994787,22,61,242,1,1,0.000037,-0.034846,-0.017476,-0.008591,0.009624
14662443,2024-12-31,605589,2024,2024.75,2024-12-31,24.28,24.36,23.51,23.58,8651436.0,206018788.0,1.835846e+10,1.995943e+10,-0.022388,-0.022388,24.775919,23.579966,1,2024-11-07,1,0.0,0.0,24.12,-0.022388,21.71,26.53,0.0,846455998.0,778560800.0,0.0,0.0,0.919789,22,61,242,1,1,0.000037,-0.022425,-0.017476,-0.008591,0.009624
14662444,2024-12-31,605598,2024,2024.75,2024-12-31,20.90,22.99,20.37,22.41,11167299.0,245155786.0,5.446007e+09,5.507500e+09,0.072249,0.072249,31.807557,31.373944,1,2024-09-26,1,0.0,0.0,20.90,0.072249,18.81,22.99,0.0,245760841.0,243016841.0,0.0,0.0,0.988835,22,61,242,1,1,0.000037,0.072212,-0.017476,-0.008591,0.009624


### Turnover

#### Twelve-month turnover

In [31]:
import time
from tqdm import tqdm

ret_day['daily_turnover'] = ret_day['TradingVolume'] / ret_day['All_shares']

month_end_dates = ret_day.groupby('month')['Day'].max().reset_index()
month_end_dates.columns = ['month', 'month_end_day']

month_end_dates['start_date'] = month_end_dates['month_end_day'] - pd.Timedelta(days=250)

def calc_12m_turnover(group):
    results = []
    stkcd = group['Stkcd'].iloc[0]
    group = group.sort_values('Day')

    for _, row in month_end_dates.iterrows():
        month = row['month']
        end_date = row['month_end_day']
        start_date = row['start_date']

        mask = (group['Day'] >= start_date) & (group['Day'] <= end_date)
        period_data = group.loc[mask, 'daily_turnover']

        if len(period_data) > 0:
            avg_turnover = period_data.mean()
            results.append({
                'Stkcd': stkcd,
                'month': month,
                'turnover_12m': avg_turnover,
                'obs_count': len(period_data)
            })

    return pd.DataFrame(results)

print("开始计算12-month turnover...")
print(f"  往前计算天数: 250 天（自然日）")
print(f"  月份数量: {len(month_end_dates)}")
print("="*60)

grouped = ret_day.groupby('Stkcd')
results_list = []
for stkcd, group in tqdm(grouped, desc="计算Turnover", total=len(grouped)):
    result = calc_12m_turnover(group)
    results_list.append(result)

turnover_12m = pd.concat(results_list, ignore_index=True)

print("="*60)
print(f"计算完成:")
print(f"  总观测数: {len(turnover_12m)}")
print(f"  股票数量: {turnover_12m['Stkcd'].nunique()}")
print(f"  平均观测天数: {turnover_12m['obs_count'].mean():.1f}")
turnover_12m

开始计算12-month turnover...
  往前计算天数: 250 天（自然日）
  月份数量: 409


计算Turnover: 100%|██████████| 4827/4827 [04:09<00:00, 19.34it/s]


计算完成:
  总观测数: 759795
  股票数量: 4827
  平均观测天数: 156.0


Unnamed: 0,Stkcd,month,turnover_12m,obs_count
0,000001,1991-04-30,0.000014,20
1,000001,1991-05-31,0.000094,44
2,000001,1991-06-30,0.000071,67
3,000001,1991-07-31,0.000059,83
4,000001,1991-08-31,0.000419,98
...,...,...,...,...
759790,605599,2024-08-31,0.004666,167
759791,605599,2024-09-30,0.004813,165
759792,605599,2024-10-31,0.005027,166
759793,605599,2024-11-30,0.005074,167


#### One-month abnormal turnover

In [32]:
month_end_dates['start_date_20d'] = month_end_dates['month_end_day'] - pd.Timedelta(days=20)

def calc_turnover_all(group):
    results = []
    stkcd = group['Stkcd'].iloc[0]
    group = group.sort_values('Day')

    for _, row in month_end_dates.iterrows():
        month = row['month']
        end_date = row['month_end_day']
        start_date_250d = row['start_date']
        start_date_20d = row['start_date_20d']

        mask_250d = (group['Day'] >= start_date_250d) & (group['Day'] <= end_date)
        period_data_250d = group.loc[mask_250d, 'daily_turnover']

        mask_20d = (group['Day'] >= start_date_20d) & (group['Day'] <= end_date)
        period_data_20d = group.loc[mask_20d, 'daily_turnover']

        if len(period_data_250d) > 0 and len(period_data_20d) > 0:
            avg_turnover_250d = period_data_250d.mean()
            avg_turnover_20d = period_data_20d.mean()
            abnormal_turnover = avg_turnover_20d - avg_turnover_250d

            results.append({
                'Stkcd': stkcd,
                'month': month,
                'turnover_12m': avg_turnover_250d,
                'turnover_1m': avg_turnover_20d,
                'abnormal_turnover': abnormal_turnover,
                'obs_count_250d': len(period_data_250d),
                'obs_count_20d': len(period_data_20d)
            })

    return pd.DataFrame(results)

# 按股票分组计算
print("开始计算 Turnover 和 Abnormal Turnover...")
print(f"  12-month turnover: 过去250天（自然日）的平均换手率")
print(f"  1-month turnover: 过去20天（自然日）的平均换手率")
print(f"  Abnormal turnover: 1-month turnover - 12-month turnover")
print(f"  月份数量: {len(month_end_dates)}")
print("="*60)

grouped = ret_day.groupby('Stkcd')
results_list = []
for stkcd, group in tqdm(grouped, desc="计算Turnover", total=len(grouped)):
    result = calc_turnover_all(group)
    results_list.append(result)

turnover_all = pd.concat(results_list, ignore_index=True)

print("="*60)
print(f"计算完成:")
print(f"  总观测数: {len(turnover_all)}")
print(f"  股票数量: {turnover_all['Stkcd'].nunique()}")
print(f"  250天平均观测数: {turnover_all['obs_count_250d'].mean():.1f}")
print(f"  20天平均观测数: {turnover_all['obs_count_20d'].mean():.1f}")
turnover_all

开始计算 Turnover 和 Abnormal Turnover...
  12-month turnover: 过去250天（自然日）的平均换手率
  1-month turnover: 过去20天（自然日）的平均换手率
  Abnormal turnover: 1-month turnover - 12-month turnover
  月份数量: 409


计算Turnover: 100%|██████████| 4827/4827 [07:21<00:00, 10.94it/s]


计算完成:
  总观测数: 737087
  股票数量: 4827
  250天平均观测数: 158.0
  20天平均观测数: 14.4


Unnamed: 0,Stkcd,month,turnover_12m,turnover_1m,abnormal_turnover,obs_count_250d,obs_count_20d
0,000001,1991-04-30,0.000014,0.000017,0.000003,20,14
1,000001,1991-05-31,0.000094,0.000209,0.000115,44,17
2,000001,1991-06-30,0.000071,0.000032,-0.000039,67,18
3,000001,1991-07-31,0.000059,0.000006,-0.000053,83,9
4,000001,1991-08-31,0.000419,0.002779,0.002360,98,13
...,...,...,...,...,...,...,...
737082,605599,2024-08-31,0.004666,0.003181,-0.001484,167,15
737083,605599,2024-09-30,0.004813,0.004357,-0.000456,165,13
737084,605599,2024-10-31,0.005027,0.006296,0.001269,166,15
737085,605599,2024-11-30,0.005074,0.006502,0.001428,167,15


### 相关性

In [62]:
# 导入rolling_betas数据
rolling_betas = pd.read_csv('/Users/liuwanting/Desktop/pythonhomework/rolling_betas.csv')
rolling_betas['Stkcd'] = rolling_betas['Stkcd'].apply(lambda x: '{:0>6}'.format(x))
rolling_betas['month'] = pd.to_datetime(rolling_betas['month']) + MonthEnd(0)
rolling_betas

Unnamed: 0,Stkcd,month,beta,data_count
0,000001,1998-04-30,1.155400,40
1,000001,1998-05-31,1.148875,41
2,000001,1998-06-30,1.167744,42
3,000001,1998-07-31,1.167735,43
4,000001,1998-08-31,1.152495,44
...,...,...,...,...
630271,920964,2024-08-31,0.725409,49
630272,920964,2024-09-30,0.983053,50
630273,920964,2024-10-31,0.877199,51
630274,920964,2024-11-30,0.860914,52


In [5]:
cross_turnover_corr = pd.merge(cross_turnover, rolling_betas[['Stkcd', 'month', 'beta']],
                               on=['Stkcd', 'month'], how='left')
cross_turnover_corr = cross_turnover_corr[cross_turnover_corr['month'] >= '2000-01-31']

cross_turnover_corr['size'] = np.log(cross_turnover_corr['totalvalue'])

cross_turnover_corr['value'] = cross_turnover_corr['ep']

print(f"合并beta后样本量: {len(cross_turnover_corr)}")
print(f"beta非缺失样本量: {cross_turnover_corr['beta'].notna().sum()}")
cross_turnover_corr[['Stkcd', 'month', 'abnormal_turnover', 'beta', 'size', 'value']].head(10)

NameError: name 'pd' is not defined

In [4]:
def calc_monthly_correlation_matrix(df):
    variables = ['abnormal_turnover', 'turnover_12m', 'beta', 'size', 'value', 'next_ret']
    var_names = ['异常换手率', '12月换手率', 'Beta', 'Size (log)', 'Value (EP)', '未来收益率']

    monthly_corr_matrices = []
    valid_months = []

    available_vars = [v for v in variables if v in df.columns]
    if len(available_vars) < len(variables):
        missing = set(variables) - set(available_vars)
        print(f"警告: 缺少变量 {missing}")
        return None, None, None

    for month in sorted(df['month'].unique()):
        month_data = df[df['month'] == month][variables].dropna()

        if len(month_data) > 10:  # 至少需要10个观测值
            # 计算该月的相关系数矩阵
            corr_matrix = month_data.corr()
            monthly_corr_matrices.append(corr_matrix)
            valid_months.append(month)

    return monthly_corr_matrices, valid_months, var_names

# 计算每月的相关系数矩阵
monthly_corr_matrices, valid_months, var_names = calc_monthly_correlation_matrix(cross_turnover_corr)

if monthly_corr_matrices and len(monthly_corr_matrices) > 0:
    avg_corr_matrix = pd.concat(monthly_corr_matrices).groupby(level=0).mean()

    variables = ['abnormal_turnover', 'turnover_12m', 'beta', 'size', 'value', 'next_ret']
    avg_corr_matrix = avg_corr_matrix.reindex(index=variables, columns=variables)

    avg_corr_matrix.index = var_names
    avg_corr_matrix.columns = var_names

    print("="*80)
    print("指标间相关系数矩阵（截面相关系数的时序平均）")
    print("="*80)
    print(f"\n样本期间: {pd.to_datetime(valid_months[0]).strftime('%Y-%m')} 至 {pd.to_datetime(valid_months[-1]).strftime('%Y-%m')}")
    print(f"有效月份数: {len(valid_months)}")
    print(f"每月平均观测数: {cross_turnover_corr.groupby('month').size().mean():.0f}")
    print("\n时序平均相关系数矩阵:")
    avg_corr_matrix
else:
    print("无法计算相关系数矩阵，请检查数据。")

NameError: name 'cross_turnover_corr' is not defined

In [6]:
fig, ax = plt.subplots(figsize=(10, 8))

im = ax.imshow(
    avg_corr_matrix,
    cmap='RdBu',
    aspect='auto',
    vmin=-1,
    vmax=1
)

ax.set_xticks(np.arange(len(var_names)))
ax.set_yticks(np.arange(len(var_names)))
ax.set_xticklabels(var_names, rotation=45, ha='right')
ax.set_yticklabels(var_names)

for i in range(len(var_names)):
    for j in range(len(var_names)):
        text = ax.text(
            j, i,
            f'{avg_corr_matrix.iloc[i, j]:.3f}',
            ha="center",
            va="center",
            color="black" if abs(avg_corr_matrix.iloc[i, j]) < 0.5 else "white",
            fontsize=10,
            fontweight='bold'
        )

cbar = plt.colorbar(im, ax=ax)
cbar.set_label('相关系数', rotation=270, labelpad=20)

ax.set_title('指标间相关系数矩阵热力图\n(截面相关系数的时序平均)', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.show();

NameError: name 'plt' is not defined

In [7]:
print("="*80)
print("相关系数的统计显著性检验")
print("="*80)
print("\nH0: 时序平均相关系数 = 0\n")

abn_turn_correlations = pd.DataFrame()
for i, var_name in enumerate(var_names):
    if var_name != '异常换手率':
        time_series = [matrix.iloc[0, i] for matrix in monthly_corr_matrices]
        abn_turn_correlations[var_name] = time_series

significance_results = []
for col in abn_turn_correlations.columns:
    series = abn_turn_correlations[col]
    mean_corr = series.mean()
    std_corr = series.std()
    n = len(series)
    t_stat = mean_corr / (std_corr / np.sqrt(n))
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n - 1))

    significance_results.append({
        '变量': col,
        '平均相关系数': mean_corr,
        '标准差': std_corr,
        't统计量': t_stat,
        'p值': p_value,
        '显著性': '***' if p_value < 0.01 else '**' if p_value < 0.05 else '*' if p_value < 0.1 else ''
    })

significance_df = pd.DataFrame(significance_results)
significance_df

print("\n注: *** p<0.01, ** p<0.05, * p<0.1")

相关系数的统计显著性检验

H0: 时序平均相关系数 = 0



NameError: name 'pd' is not defined

### 异象组合收益率

In [8]:
cross_turnover = pd.merge(cross_new, turnover_all[['Stkcd', 'month', 'turnover_12m', 'turnover_1m', 'abnormal_turnover']],
                          on=['Stkcd', 'month'], how='left')

cross_turnover = cross_turnover.dropna(subset=['turnover_12m', 'abnormal_turnover', 'next_ret'])
print(f"合并后样本量: {len(cross_turnover)}")
cross_turnover

NameError: name 'pd' is not defined

### 12-month Turnover 10分组

In [10]:
cross_turnover['turnover_12m_group'] = cross_turnover.groupby('month')['turnover_12m'].transform(
    lambda x: pd.qcut(x, 10, labels=False, duplicates='drop') + 1
)
def weighted_avg(group):
    return np.average(group['next_ret'], weights=group['totalvalue'])
portfolio_12m_ew = cross_turnover.groupby(['month', 'turnover_12m_group'])['next_ret'].mean().unstack()
portfolio_12m_ew.columns = [f'G{int(i)}' for i in portfolio_12m_ew.columns]
portfolio_12m_ew['H-L'] = portfolio_12m_ew['G1'] - portfolio_12m_ew['G10']

portfolio_12m_vw = cross_turnover.groupby(['month', 'turnover_12m_group'], group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack()
portfolio_12m_vw.columns = [f'G{int(i)}' for i in portfolio_12m_vw.columns]
portfolio_12m_vw['H-L'] = portfolio_12m_vw['G1'] - portfolio_12m_vw['G10']

def calc_newey_west_tvalue(series, maxlags=6):
    temp_df = pd.DataFrame({'ret': series})
    temp_df = temp_df.dropna()
    if len(temp_df) > 0:
        model = smf.ols('ret ~ 1', data=temp_df).fit(cov_type='HAC', cov_kwds={'maxlags': maxlags})
        return model.tvalues['Intercept']
    return np.nan

stats_12m_ew = pd.DataFrame({
    '平均收益率': portfolio_12m_ew.mean() * 100,
    '标准差': portfolio_12m_ew.std() * 100,
    't值(NW)': portfolio_12m_ew.apply(calc_newey_west_tvalue),
    '夏普比率': portfolio_12m_ew.mean() / portfolio_12m_ew.std() * np.sqrt(12)
})

stats_12m_vw = pd.DataFrame({
    '平均收益率': portfolio_12m_vw.mean() * 100,
    '标准差': portfolio_12m_vw.std() * 100,
    't值(NW)': portfolio_12m_vw.apply(calc_newey_west_tvalue),
    '夏普比率': portfolio_12m_vw.mean() / portfolio_12m_vw.std() * np.sqrt(12)
})

print("="*60)
print("12-month Turnover 10分组投资组合")
print("="*60)
print("\n【等权加权】各组合月度平均收益率 (%):\n")
print(stats_12m_ew)
print("\n【市值加权】各组合月度平均收益率 (%):\n")
stats_12m_vw

NameError: name 'cross_turnover' is not defined

### Abnormal Turnover 10分组

In [11]:
cross_turnover['abnormal_turnover_group'] = cross_turnover.groupby('month')['abnormal_turnover'].transform(
    lambda x: pd.qcut(x, 10, labels=False, duplicates='drop') + 1
)

portfolio_abn_ew = cross_turnover.groupby(['month', 'abnormal_turnover_group'])['next_ret'].mean().unstack()
portfolio_abn_ew.columns = [f'G{int(i)}' for i in portfolio_abn_ew.columns]
portfolio_abn_ew['H-L'] = portfolio_abn_ew['G1'] - portfolio_abn_ew['G10']

portfolio_abn_vw = cross_turnover.groupby(['month', 'abnormal_turnover_group'], group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack()
portfolio_abn_vw.columns = [f'G{int(i)}' for i in portfolio_abn_vw.columns]
portfolio_abn_vw['H-L'] = portfolio_abn_vw['G1'] - portfolio_abn_vw['G10']

stats_abn_ew = pd.DataFrame({
    '平均收益率': portfolio_abn_ew.mean() * 100,
    '标准差': portfolio_abn_ew.std() * 100,
    't值(NW)': portfolio_abn_ew.apply(calc_newey_west_tvalue),
    '夏普比率': portfolio_abn_ew.mean() / portfolio_abn_ew.std() * np.sqrt(12)
})

stats_abn_vw = pd.DataFrame({
    '平均收益率': portfolio_abn_vw.mean() * 100,
    '标准差': portfolio_abn_vw.std() * 100,
    't值(NW)': portfolio_abn_vw.apply(calc_newey_west_tvalue),
    '夏普比率': portfolio_abn_vw.mean() / portfolio_abn_vw.std() * np.sqrt(12)
})

print("="*60)
print("Abnormal Turnover 10分组投资组合")
print("="*60)
print("\n【等权加权】各组合月度平均收益率 (%):\n")
print(stats_abn_ew)
print("\n【市值加权】各组合月度平均收益率 (%):\n")
stats_abn_vw

NameError: name 'cross_turnover' is not defined

### 投资组合累计收益率图

In [12]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 12-month Turnover 等权累计收益率
cum_ret_12m_ew = (1 + portfolio_12m_ew[['G1', 'G10', 'H-L']]).cumprod()
cum_ret_12m_ew.plot(ax=axes[0, 0], linewidth=1.5)
axes[0, 0].set_title('12-month Turnover 投资组合累计收益 (等权)', fontsize=12)
axes[0, 0].set_xlabel('月份')
axes[0, 0].set_ylabel('累计收益率')
axes[0, 0].legend(['Low (G1)', 'High (G10)', 'H-L'], loc='upper left')
axes[0, 0].grid(True, alpha=0.3)

# 12-month Turnover 市值加权累计收益率
cum_ret_12m_vw = (1 + portfolio_12m_vw[['G1', 'G10', 'H-L']]).cumprod()
cum_ret_12m_vw.plot(ax=axes[0, 1], linewidth=1.5)
axes[0, 1].set_title('12-month Turnover 投资组合累计收益 (市值加权)', fontsize=12)
axes[0, 1].set_xlabel('月份')
axes[0, 1].set_ylabel('累计收益率')
axes[0, 1].legend(['Low (G1)', 'High (G10)', 'H-L'], loc='upper left')
axes[0, 1].grid(True, alpha=0.3)

# Abnormal Turnover 等权累计收益率
cum_ret_abn_ew = (1 + portfolio_abn_ew[['G1', 'G10', 'H-L']]).cumprod()
cum_ret_abn_ew.plot(ax=axes[1, 0], linewidth=1.5)
axes[1, 0].set_title('Abnormal Turnover 投资组合累计收益 (等权)', fontsize=12)
axes[1, 0].set_xlabel('月份')
axes[1, 0].set_ylabel('累计收益率')
axes[1, 0].legend(['Low (G1)', 'High (G10)', 'H-L'], loc='upper left')
axes[1, 0].grid(True, alpha=0.3)

# Abnormal Turnover 市值加权累计收益率
cum_ret_abn_vw = (1 + portfolio_abn_vw[['G1', 'G10', 'H-L']]).cumprod()
cum_ret_abn_vw.plot(ax=axes[1, 1], linewidth=1.5)
axes[1, 1].set_title('Abnormal Turnover 投资组合累计收益 (市值加权)', fontsize=12)
axes[1, 1].set_xlabel('月份')
axes[1, 1].set_ylabel('累计收益率')
axes[1, 1].legend(['Low (G1)', 'High (G10)', 'H-L'], loc='upper left')
axes[1, 1].grid(True, alpha=0.3)



print("\n12-month Turnover 各组合累计收益率 (等权):")
print(cum_ret_12m_ew.iloc[-1])
print("\n12-month Turnover 各组合累计收益率 (市值加权):")
print(cum_ret_12m_vw.iloc[-1])
print("\nAbnormal Turnover 各组合累计收益率 (等权):")
print(cum_ret_abn_ew.iloc[-1])
print("\nAbnormal Turnover 各组合累计收益率 (市值加权):")
print(cum_ret_abn_vw.iloc[-1])

plt.tight_layout();
plt.show();

NameError: name 'plt' is not defined

### Fama-French 三因子回归分析

In [13]:
# 读取Fama-French三因子数据
ff3_monthly = pd.read_csv('datasets/factors_3f.csv')
ff3_monthly['month'] = pd.to_datetime(ff3_monthly['month'])
ff3_monthly.set_index('month', inplace=True)
print("Fama-French三因子数据:")
ff3_monthly.head()

NameError: name 'pd' is not defined

In [14]:
# 将组合收益率与三因子数据合并
# Abnormal Turnover 等权组合
portfolio_abn_ew_ff3 = portfolio_abn_ew.merge(ff3_monthly, left_index=True, right_index=True, how='inner')

# Abnormal Turnover 市值加权组合
portfolio_abn_vw_ff3 = portfolio_abn_vw.merge(ff3_monthly, left_index=True, right_index=True, how='inner')

# 定义三因子回归函数
def ff3_regression(portfolio_returns, ff3_data):
    """
    对投资组合进行Fama-French三因子回归
    返回alpha、beta及t值
    """
    results = []

    for col in ['G1', 'G10', 'H-L']:
        if col in portfolio_returns.columns:
            # 准备回归数据
            reg_data = pd.DataFrame({
                'ret': portfolio_returns[col],
                'MKT': ff3_data['MKT'],
                'SMB': ff3_data['SMB'],
                'HML': ff3_data['HML']
            }).dropna()

            # 运行回归（使用Newey-West标准误）
            model = smf.ols('ret ~ MKT + SMB + HML', data=reg_data).fit(cov_type='HAC', cov_kwds={'maxlags': 6})

            results.append({
                '组合': col,
                'Alpha': model.params['Intercept'] * 100,
                't(Alpha)': model.tvalues['Intercept'],
                'Beta(MKT)': model.params['MKT'],
                't(MKT)': model.tvalues['MKT'],
                'Beta(SMB)': model.params['SMB'],
                't(SMB)': model.tvalues['SMB'],
                'Beta(HML)': model.params['HML'],
                't(HML)': model.tvalues['HML'],
                'R²': model.rsquared
            })

    return pd.DataFrame(results).set_index('组合')

# 对等权组合进行回归
print("="*80)
print("Abnormal Turnover 投资组合 - Fama-French 三因子回归 (等权)")
print("="*80)
ff3_reg_abn_ew = ff3_regression(portfolio_abn_ew_ff3, portfolio_abn_ew_ff3)
ff3_reg_abn_ew

# 对市值加权组合进行回归
print("="*80)
print("Abnormal Turnover 投资组合 - Fama-French 三因子回归 (市值加权)")
print("="*80)
ff3_reg_abn_vw = ff3_regression(portfolio_abn_vw_ff3, portfolio_abn_vw_ff3)
ff3_reg_abn_vw

NameError: name 'portfolio_abn_ew' is not defined

## 深入分析


### 1. 时间序列稳定性分析

In [16]:
mid_date = portfolio_abn_ew.index[len(portfolio_abn_ew) // 2]

portfolio_abn_ew_first = portfolio_abn_ew[portfolio_abn_ew.index < mid_date]
portfolio_abn_vw_first = portfolio_abn_vw[portfolio_abn_vw.index < mid_date]

portfolio_abn_ew_second = portfolio_abn_ew[portfolio_abn_ew.index >= mid_date]
portfolio_abn_vw_second = portfolio_abn_vw[portfolio_abn_vw.index >= mid_date]

def calc_period_stats(portfolio_ew, portfolio_vw, period_name):
    print(f"\n{'='*80}")
    print(f"{period_name} ({portfolio_ew.index[0].strftime('%Y-%m')} 至 {portfolio_ew.index[-1].strftime('%Y-%m')})")
    print(f"{'='*80}")

    stats_ew = pd.DataFrame({
        '平均收益率': portfolio_ew.mean() * 100,
        '标准差': portfolio_ew.std() * 100,
        't值(NW)': portfolio_ew.apply(calc_newey_west_tvalue),
        '夏普比率': portfolio_ew.mean() / portfolio_ew.std() * np.sqrt(12)
    })

    stats_vw = pd.DataFrame({
        '平均收益率': portfolio_vw.mean() * 100,
        '标准差': portfolio_vw.std() * 100,
        't值(NW)': portfolio_vw.apply(calc_newey_west_tvalue),
        '夏普比率': portfolio_vw.mean() / portfolio_vw.std() * np.sqrt(12)
    })

    print("\n【等权加权】:")
    print(stats_ew[['平均收益率', 't值(NW)', '夏普比率']])
    print("\n【市值加权】:")
    print(stats_vw[['平均收益率', 't值(NW)', '夏普比率']])

    return stats_ew, stats_vw

stats_first_ew, stats_first_vw = calc_period_stats(
    portfolio_abn_ew_first, portfolio_abn_vw_first, "前半期"
)

stats_second_ew, stats_second_vw = calc_period_stats(
    portfolio_abn_ew_second, portfolio_abn_vw_second, "后半期"
)

NameError: name 'portfolio_abn_ew' is not defined


### 2. 双重排序分析

In [17]:
double_sort_ew = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_in_size'], observed=True)['next_ret'].mean().to_frame()
double_sort_ew

NameError: name 'cross_turnover' is not defined

In [18]:
double_sort_ew = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_in_size'], observed=True)['next_ret'].mean().unstack(level=[1, 2])
double_sort_ew

NameError: name 'cross_turnover' is not defined

### 2.1 异常换手率 × 市值（条件排序）

In [19]:
cross_turnover['size_group'] = cross_turnover.groupby('month')['totalvalue'].transform(
    lambda x: pd.qcut(x, 3, labels=['Small', 'Medium', 'Large'], duplicates='drop')
)

cross_turnover['abn_turn_group_in_size'] = cross_turnover.groupby(['month', 'size_group'], observed=True)['abnormal_turnover'].transform(
    lambda x: pd.qcut(x, 5, labels=False, duplicates='drop') + 1
)

double_sort_ew = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_in_size'], observed=True)['next_ret'].mean().unstack(level=[1, 2])

double_sort_vw = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_in_size'], observed=True, group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack(level=[1, 2])

results_double_sort_ew = []
for size in ['Small', 'Medium', 'Large']:
    if (size, 1.0) in double_sort_ew.columns and (size, 5.0) in double_sort_ew.columns:
        hl_ret = double_sort_ew[(size, 1.0)] - double_sort_ew[(size, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_double_sort_ew.append({
            '市值组': size,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

double_sort_summary_ew = pd.DataFrame(results_double_sort_ew)

results_double_sort_vw = []
for size in ['Small', 'Medium', 'Large']:
    if (size, 1.0) in double_sort_vw.columns and (size, 5.0) in double_sort_vw.columns:
        hl_ret = double_sort_vw[(size, 1.0)] - double_sort_vw[(size, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_double_sort_vw.append({
            '市值组': size,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

double_sort_summary_vw = pd.DataFrame(results_double_sort_vw)

print("="*80)
print("双重排序（条件排序）：异常换手率 × 市值")
print("="*80)
print("\n【等权】各市值组内异常换手率H-L组合表现：")
double_sort_summary_ew
print("\n【市值加权】各市值组内异常换手率H-L组合表现：")
double_sort_summary_vw

NameError: name 'cross_turnover' is not defined

### 2.2 异常换手率 × 市值（独立排序）

In [20]:
cross_turnover['abn_turn_group_independent'] = cross_turnover.groupby('month')['abnormal_turnover'].transform(
    lambda x: pd.qcut(x, 5, labels=False, duplicates='drop') + 1
)

independent_sort_ew = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_independent'], observed=True)['next_ret'].mean().unstack(level=[1, 2])

independent_sort_vw = cross_turnover.groupby(['month', 'size_group', 'abn_turn_group_independent'], observed=True, group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack(level=[1, 2])

results_independent_ew = []
for size in ['Small', 'Medium', 'Large']:
    if (size, 1.0) in independent_sort_ew.columns and (size, 5.0) in independent_sort_ew.columns:
        hl_ret = independent_sort_ew[(size, 1.0)] - independent_sort_ew[(size, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_independent_ew.append({
            '市值组': size,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

independent_summary_ew = pd.DataFrame(results_independent_ew)

results_independent_vw = []
for size in ['Small', 'Medium', 'Large']:
    if (size, 1.0) in independent_sort_vw.columns and (size, 5.0) in independent_sort_vw.columns:
        hl_ret = independent_sort_vw[(size, 1.0)] - independent_sort_vw[(size, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_independent_vw.append({
            '市值组': size,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

independent_summary_vw = pd.DataFrame(results_independent_vw)

print("="*80)
print("双重排序（独立排序）：异常换手率 × 市值")
print("="*80)
print("\n【等权】各市值组内异常换手率H-L组合表现：")
independent_summary_ew
print("\n【市值加权】各市值组内异常换手率H-L组合表现：")
independent_summary_vw

NameError: name 'cross_turnover' is not defined

### 2.3 异常换手率 × 账面市值比(EP)（条件排序）

In [21]:
cross_turnover['ep_group'] = cross_turnover.groupby('month')['ep'].transform(
    lambda x: pd.qcut(x, 3, labels=['Low', 'Medium', 'High'], duplicates='drop')
)

cross_turnover['abn_turn_group_in_ep'] = cross_turnover.groupby(['month', 'ep_group'], observed=True)['abnormal_turnover'].transform(
    lambda x: pd.qcut(x, 5, labels=False, duplicates='drop') + 1
)

double_sort_ep_ew = cross_turnover.groupby(['month', 'ep_group', 'abn_turn_group_in_ep'], observed=True)['next_ret'].mean().unstack(level=[1, 2])

double_sort_ep_vw = cross_turnover.groupby(['month', 'ep_group', 'abn_turn_group_in_ep'], observed=True, group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack(level=[1, 2])

results_double_sort_ep_ew = []
for ep_cat in ['Low', 'Medium', 'High']:
    if (ep_cat, 1.0) in double_sort_ep_ew.columns and (ep_cat, 5.0) in double_sort_ep_ew.columns:
        hl_ret = double_sort_ep_ew[(ep_cat, 1.0)] - double_sort_ep_ew[(ep_cat, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_double_sort_ep_ew.append({
            'EP组': ep_cat,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

double_sort_ep_summary_ew = pd.DataFrame(results_double_sort_ep_ew)

results_double_sort_ep_vw = []
for ep_cat in ['Low', 'Medium', 'High']:
    if (ep_cat, 1.0) in double_sort_ep_vw.columns and (ep_cat, 5.0) in double_sort_ep_vw.columns:
        hl_ret = double_sort_ep_vw[(ep_cat, 1.0)] - double_sort_ep_vw[(ep_cat, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_double_sort_ep_vw.append({
            'EP组': ep_cat,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

double_sort_ep_summary_vw = pd.DataFrame(results_double_sort_ep_vw)

print("="*80)
print("双重排序（条件排序）：异常换手率 × EP")
print("="*80)
print("\n【等权】各EP组内异常换手率H-L组合表现：")
print(double_sort_ep_summary_ew)
print("\n【市值加权】各EP组内异常换手率H-L组合表现：")
double_sort_ep_summary_vw

NameError: name 'cross_turnover' is not defined

### 2.4 异常换手率 × 账面市值比(EP)（独立排序）

In [22]:
independent_sort_ep_ew = cross_turnover.groupby(['month', 'ep_group', 'abn_turn_group_independent'], observed=True)['next_ret'].mean().unstack(level=[1, 2])

independent_sort_ep_vw = cross_turnover.groupby(['month', 'ep_group', 'abn_turn_group_independent'], observed=True, group_keys=False)[['next_ret', 'totalvalue']].apply(weighted_avg).unstack(level=[1, 2])

results_independent_ep_ew = []
for ep_cat in ['Low', 'Medium', 'High']:
    if (ep_cat, 1.0) in independent_sort_ep_ew.columns and (ep_cat, 5.0) in independent_sort_ep_ew.columns:
        hl_ret = independent_sort_ep_ew[(ep_cat, 1.0)] - independent_sort_ep_ew[(ep_cat, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_independent_ep_ew.append({
            'EP组': ep_cat,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

independent_ep_summary_ew = pd.DataFrame(results_independent_ep_ew)

results_independent_ep_vw = []
for ep_cat in ['Low', 'Medium', 'High']:
    if (ep_cat, 1.0) in independent_sort_ep_vw.columns and (ep_cat, 5.0) in independent_sort_ep_vw.columns:
        hl_ret = independent_sort_ep_vw[(ep_cat, 1.0)] - independent_sort_ep_vw[(ep_cat, 5.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_independent_ep_vw.append({
            'EP组': ep_cat,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

independent_ep_summary_vw = pd.DataFrame(results_independent_ep_vw)

print("="*80)
print("双重排序（独立排序）：异常换手率 × EP")
print("="*80)
print("\n【等权】各EP组内异常换手率H-L组合表现：")
print(independent_ep_summary_ew)
print("\n【市值加权】各EP组内异常换手率H-L组合表现：")
independent_ep_summary_vw

NameError: name 'cross_turnover' is not defined


### 3. 换手率水平与异常换手率的交互作用

In [23]:
cross_turnover['turnover_12m_tertile'] = cross_turnover.groupby('month')['turnover_12m'].transform(
    lambda x: pd.qcut(x, 3, labels=['Low', 'Medium', 'High'], duplicates='drop')
)

cross_turnover['abn_turn_tertile_in_12m'] = cross_turnover.groupby(['month', 'turnover_12m_tertile'], observed=True)['abnormal_turnover'].transform(
    lambda x: pd.qcut(x, 3, labels=False, duplicates='drop') + 1
)

interaction_ew = cross_turnover.groupby(['month', 'turnover_12m_tertile', 'abn_turn_tertile_in_12m'], observed=True)['next_ret'].mean().unstack(level=[1, 2])

results_interaction = []
for turn_level in ['Low', 'Medium', 'High']:
    if (turn_level, 1.0) in interaction_ew.columns and (turn_level, 3.0) in interaction_ew.columns:
        hl_ret = interaction_ew[(turn_level, 1.0)] - interaction_ew[(turn_level, 3.0)]
        avg_ret = hl_ret.mean() * 100
        t_val = calc_newey_west_tvalue(hl_ret)
        sharpe = hl_ret.mean() / hl_ret.std() * np.sqrt(12)

        results_interaction.append({
            '12月换手率水平': turn_level,
            'H-L平均收益率(%)': avg_ret,
            't值(NW)': t_val,
            '夏普比率': sharpe
        })

interaction_summary = pd.DataFrame(results_interaction)

print("="*80)
print("交互作用分析：12-month Turnover × Abnormal Turnover")
print("="*80)
print("\n不同12月换手率水平下，异常换手率H-L组合表现：")
interaction_summary

NameError: name 'cross_turnover' is not defined

### 4. 回报率分解分析

In [24]:
long_ret_ew = portfolio_abn_ew['G1'].mean() * 100
short_ret_ew = portfolio_abn_ew['G10'].mean() * 100
hl_ret_ew = portfolio_abn_ew['H-L'].mean() * 100

long_t_ew = calc_newey_west_tvalue(portfolio_abn_ew['G1'])
short_t_ew = calc_newey_west_tvalue(portfolio_abn_ew['G10'])

long_ret_vw = portfolio_abn_vw['G1'].mean() * 100
short_ret_vw = portfolio_abn_vw['G10'].mean() * 100
hl_ret_vw = portfolio_abn_vw['H-L'].mean() * 100

long_t_vw = calc_newey_west_tvalue(portfolio_abn_vw['G1'])
short_t_vw = calc_newey_west_tvalue(portfolio_abn_vw['G10'])

long_contrib_ew = (long_ret_ew / hl_ret_ew) * 100 if hl_ret_ew != 0 else 0
short_contrib_ew = (-short_ret_ew / hl_ret_ew) * 100 if hl_ret_ew != 0 else 0

long_contrib_vw = (long_ret_vw / hl_ret_vw) * 100 if hl_ret_vw != 0 else 0
short_contrib_vw = (-short_ret_vw / hl_ret_vw) * 100 if hl_ret_vw != 0 else 0

decomposition_results = pd.DataFrame({
    '组合类型': ['等权', '市值加权'],
    '多头收益率(%)': [long_ret_ew, long_ret_vw],
    '多头t值': [long_t_ew, long_t_vw],
    '空头收益率(%)': [short_ret_ew, short_ret_vw],
    '空头t值': [short_t_ew, short_t_vw],
    'H-L收益率(%)': [hl_ret_ew, hl_ret_vw],
    '多头贡献(%)': [long_contrib_ew, long_contrib_vw],
    '空头贡献(%)': [short_contrib_ew, short_contrib_vw]
})

print("="*80)
print("H-L组合收益分解")
print("="*80)
print("\n多头(G1)和空头(G10)的收益贡献：")
decomposition_results

NameError: name 'portfolio_abn_ew' is not defined

### 5. 市场状态分析

In [25]:
portfolio_abn_with_mkt = portfolio_abn_ew.merge(Market_ret[['MKT']], left_index=True, right_index=True, how='inner')

portfolio_abn_with_mkt['market_state'] = portfolio_abn_with_mkt['MKT'].apply(
    lambda x: 'Up Market' if x > 0 else 'Down Market'
)

def calc_market_state_stats(df, state_label):
    subset = df[df['market_state'] == state_label]
    if len(subset) > 0:
        hl_ret = subset['H-L']
        return {
            '市场状态': state_label,
            '月数': len(subset),
            'H-L平均收益率(%)': hl_ret.mean() * 100,
            't值(NW)': calc_newey_west_tvalue(hl_ret),
            '夏普比率': hl_ret.mean() / hl_ret.std() * np.sqrt(12) if hl_ret.std() > 0 else np.nan
        }
    return None

up_market_stats = calc_market_state_stats(portfolio_abn_with_mkt, 'Up Market')
down_market_stats = calc_market_state_stats(portfolio_abn_with_mkt, 'Down Market')

market_state_results = pd.DataFrame([up_market_stats, down_market_stats])

print("="*80)
print("市场状态分析：异常换手率H-L组合在牛市和熊市的表现")
print("="*80)
market_state_results

NameError: name 'portfolio_abn_ew' is not defined


### 6. 异常换手率的持续性

In [26]:
from pandas.tseries.offsets import MonthEnd

cross_turnover_sorted = cross_turnover.sort_values(['Stkcd', 'month']).copy()
temp_df = cross_turnover_sorted[['Stkcd', 'month', 'abnormal_turnover']].copy()

for lag in range(1, 13):
    lag_data = temp_df.copy()
    lag_data['month'] = lag_data['month'] + MonthEnd(lag)
    lag_data = lag_data.rename(columns={'abnormal_turnover': f'abnormal_turnover_lag{lag}'})
    cross_turnover_sorted = pd.merge(cross_turnover_sorted, lag_data, on=['Stkcd', 'month'], how='left')

correlations = []
for lag in range(1, 13):
    valid_data = cross_turnover_sorted[['abnormal_turnover', f'abnormal_turnover_lag{lag}']].dropna()
    if len(valid_data) > 0:
        corr = valid_data.corr().iloc[0, 1]
    else:
        corr = np.nan

    correlations.append({
        '滞后期': f'{lag}个月',
        '相关系数': corr
    })

persistence_df = pd.DataFrame(correlations)

print("="*80)
print("异常换手率的持续性分析")
print("="*80)
print("\n异常换手率的自相关系数：")
persistence_df

NameError: name 'cross_turnover' is not defined


### 8. 相关表现的指标

In [27]:
def calculate_risk_metrics(returns):
    cum_returns = (1 + returns).cumprod()
    running_max = cum_returns.expanding().max()
    drawdown = (cum_returns - running_max) / running_max
    max_drawdown = drawdown.min()

    info_ratio = returns.mean() / returns.std() * np.sqrt(12)

    skewness = returns.skew()
    kurtosis = returns.kurtosis()

    downside_returns = returns[returns < 0]
    downside_std = downside_returns.std() if len(downside_returns) > 0 else 0

    sortino_ratio = returns.mean() / downside_std * np.sqrt(12) if downside_std > 0 else np.nan

    return {
        '平均收益率(%)': returns.mean() * 100,
        '标准差(%)': returns.std() * 100,
        '最大回撤(%)': max_drawdown * 100,
        '夏普比率': info_ratio,
        'Sortino比率': sortino_ratio,
        '偏度': skewness,
        '峰度': kurtosis
    }

# 计算等权组合的风险指标
risk_metrics_ew = []
for col in ['G1', 'G10', 'H-L']:
    metrics = calculate_risk_metrics(portfolio_abn_ew[col])
    metrics['组合'] = col
    risk_metrics_ew.append(metrics)

risk_metrics_ew_df = pd.DataFrame(risk_metrics_ew).set_index('组合')

risk_metrics_vw = []
for col in ['G1', 'G10', 'H-L']:
    metrics = calculate_risk_metrics(portfolio_abn_vw[col])
    metrics['组合'] = col
    risk_metrics_vw.append(metrics)

risk_metrics_vw_df = pd.DataFrame(risk_metrics_vw).set_index('组合')

print("="*80)
print("风险调整后的表现分析")
print("="*80)
print("\n【等权组合】风险指标：")
print(risk_metrics_ew_df)
print("\n【市值加权组合】风险指标：")
risk_metrics_vw_df

NameError: name 'portfolio_abn_ew' is not defined


### 9. 综合总结表

In [29]:
print("="*80)
print("异常换手率异象 - 深入分析总结")
print("="*80)

summary_text = f"""
### 1. 时间序列稳定性
- 前半期H-L收益: {stats_first_ew.loc['H-L', '平均收益率']:.3f}% (t={stats_first_ew.loc['H-L', 't值(NW)']:.2f})
- 后半期H-L收益: {stats_second_ew.loc['H-L', '平均收益率']:.3f}% (t={stats_second_ew.loc['H-L', 't值(NW)']:.2f})

### 2. 双重排序分析
异常换手率效应在不同市值和价值组中的表现：
{double_sort_summary.to_string()}

### 3. 换手率交互作用
12-month turnover对abnormal turnover预测力的影响：
{interaction_summary.to_string()}

### 4. 收益分解
- 多头组合(G1)贡献: {long_contrib_ew:.1f}%
- 空头组合(G10)贡献: {short_contrib_ew:.1f}%

### 5. 市场状态
{market_state_results.to_string()}

### 6. 风险指标
等权H-L组合:
- 夏普比率: {risk_metrics_ew_df.loc['H-L', '夏普比率']:.3f}
- 最大回撤: {risk_metrics_ew_df.loc['H-L', '最大回撤(%)']:.2f}%
- Sortino比率: {risk_metrics_ew_df.loc['H-L', 'Sortino比率']:.3f}

### 7. 异常换手率持续性
{persistence_df.to_string()}
"""

print(summary_text)

异常换手率异象 - 深入分析总结


NameError: name 'stats_first_ew' is not defined