In [4]:
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import time

### AMQI数据准备

#### 数据准备

##### 金融净资产

In [2]:
# dfin: 金融资产净值
fin = pd.read_csv('.\\AMQI\\raw\\dfin.csv')
fin.head()

Unnamed: 0,Stkcd,Accper,Typrep,A001107000,A001202000,A001203000,A001204000,A001205000,A001206000,A001211000,A002101000,A002125000,A002206000,A003112101
0,1,1990-12-31,A,,,,,,,,0.0,,0.0,
1,1,1991-12-31,A,,,,,,,,0.0,,0.0,
2,1,1992-12-31,A,,,,,,,,0.0,,0.0,
3,1,1993-12-31,A,194893456.9,,,,,,,0.0,,0.0,
4,1,1994-06-30,A,274008592.2,,,,253698353.9,,,0.0,,0.0,


In [3]:
# 预处理数据
fin = fin.rename(columns={fin.columns[0]:'code',
                          fin.columns[1]:'date',
                          fin.columns[2]:'type'}) # 重命名数据列名
fin['code'] = fin['code'].astype(str).str.zfill(6) # 修改股票代码格式
fin['date'] = pd.to_datetime(fin['date']) # 修改日期格式
fin = fin[fin.type=='A'] # 只选择A股

In [4]:
# 求解“净”资产
fin1 = fin.copy()
fin1 = fin1[~fin1[fin1.columns[-4:]].isnull().all(axis=1)] # 债务不能全部为missing value
fin1['fin'] = fin1.iloc[:,3:10].sum(axis=1) - fin1.iloc[:,-4:].sum(axis=1)
fin1['year'] = fin1.date.dt.year
fin1['month'] = fin1.date.dt.month
fin1 = fin1[fin1.month == 12] # 每年按照年报数据进行更新
fin1 = fin1[['code','year','month','fin']]

# 求解同比变动额
fin1['year_lag1'] = fin1.groupby('code')['year'].shift(1) # 用于设置condition
fin1['fin_lag1'] = np.where((fin1['year'] - fin1['year_lag1']) == 1, fin1['fin'].shift(1), np.nan)
fin1['dfin'] = fin1['fin'] - fin1['fin_lag1'] # 计算差值

fin1 = fin1[['code','year','month','dfin']].dropna()

# 根据财报年调整：每年7月第一个交易日根据上一年度财报数据进行更新
fin1['year'] = fin1['year']+1
fin1['month'] = 6
fin1

Unnamed: 0,code,year,month,dfin
1,000001,1992,6,0.000000e+00
2,000001,1993,6,0.000000e+00
3,000001,1994,6,1.948935e+08
5,000001,1995,6,5.158071e+08
7,000001,1996,6,3.312972e+08
...,...,...,...,...
591171,873706,2024,6,2.849915e+07
591191,873726,2024,6,2.174352e+08
591207,873806,2023,6,7.672605e+07
591217,873806,2024,6,4.178950e+06


In [46]:
fin1.to_csv('.\\AMQI\\processed\\dfin_final.csv') # 数据尚未标准化

##### ROA变化

In [29]:
# droa: 金融资产净值
r = pd.read_csv('.\\AMQI\\raw\\earning.csv') # 净利润
r.head()

# 数据预处理
r = r.rename(columns={r.columns[0]:'code',
                      r.columns[1]:'date',
                      r.columns[2]:'type',
                      r.columns[3]:'earning'})
r['code'] = r['code'].astype(str).str.zfill(6)
r['date'] = pd.to_datetime(r['date'])
r = r[r['type']=='A'].drop('type',axis=1)
r.head()

Unnamed: 0,code,date,earning
0,1,1991-12-31,
1,1,1992-12-31,
2,1,1993-06-30,
3,1,1993-12-31,0.029269
4,1,1994-06-30,


In [3]:
# 导入总资产数据
a = pd.read_csv('.\\AMQI\\raw\\asset.csv') # 净利润
a.head()

# 数据预处理
a = a.rename(columns={a.columns[0]:'code',
                      a.columns[1]:'date',
                      a.columns[2]:'type',
                      a.columns[3]:'asset'})
a['code'] = a['code'].astype(str).str.zfill(6)
a['date'] = pd.to_datetime(a['date'])
a = a[a['type']=='A'].drop('type',axis=1)
a.head()

Unnamed: 0,code,date,asset
0,1,1990-12-31,2919190000.0
1,1,1991-12-31,4354460000.0
2,1,1992-12-31,7522847000.0
3,1,1993-12-31,9337871000.0
4,1,1994-06-30,12465950000.0


In [4]:
# 计算roa，需要用滞后一期的asset进行计算
a = a.sort_values(by='date')
a['date_lag1'] = a.groupby('code')['date'].shift(-1) # 滞后一期
a = a.drop('date',axis=1)

r['year'] = r.date.dt.year
r['month'] = r.date.dt.month
a['year'] = a.date_lag1.dt.year
a['month'] = a.date_lag1.dt.month

roa = pd.merge(r,a,how='left',on=['code','year','month'])
roa['roa'] = roa['earning']/roa['asset']
roa = roa[['code','date','roa']]
roa.head()

Unnamed: 0,code,date,roa
0,1,1991-12-31,
1,1,1992-12-31,
2,1,1993-06-30,
3,1,1993-12-31,3.890681e-12
4,1,1994-06-30,


In [6]:
# 根据公告公布时间调整日期
announce = pd.read_csv('.\\AMQI\\raw\\announce.csv')

# 数据预处理
announce = announce.rename(columns={announce.columns[0]:'code',
                                    announce.columns[1]:'quarter',
                                    announce.columns[2]:'date',
                                    announce.columns[3]:'announce'})
announce['code'] = announce['code'].astype(str).str.zfill(6)
announce['date'] = pd.to_datetime(announce['date'])
announce['announce'] = pd.to_datetime(announce['announce'])
announce.head()

Unnamed: 0,code,quarter,date,announce
0,1,4,1990-12-31,1991-04-30
1,1,4,1991-12-31,1992-04-30
2,1,2,1992-06-30,1992-07-28
3,1,4,1992-12-31,1993-02-26
4,1,2,1993-06-30,1993-08-05


In [7]:
# 合并数据
roa1 = pd.merge(roa,announce,
                how='left',
                on=['code','date'])

roa1 = roa1.dropna()

roa1.head()

Unnamed: 0,code,date,roa,quarter,announce
3,1,1993-12-31,3.890681e-12,4.0,1994-03-29
5,1,1994-12-31,1.845507e-12,4.0,1995-01-27
6,1,1995-06-30,8.015024e-13,2.0,1995-08-11
7,1,1995-12-31,1.228101e-12,4.0,1996-03-14
8,1,1996-06-30,7.377732e-13,2.0,1996-08-29


In [25]:
# 计算差值
roa2 = roa1.copy()
roa2['year'] = roa2.date.dt.year
roa2 = roa2.sort_values(by='date')
roa2['droa'] = np.where((((roa2.groupby('code')['quarter'].diff() == 1)&(roa2.groupby('code')['year'].diff() == 0)) |
                        ((roa2.groupby('code')['quarter'].diff() == -3)&(roa2.groupby('code')['year'].diff() == 1)) |
                        ((roa2.groupby('code')['quarter'].diff() == 2)&(roa2.groupby('code')['year'].diff() == 0)) |
                        ((roa2.groupby('code')['quarter'].diff() == -2)&(roa2.groupby('code')['year'].diff() == 1))),
                       roa2.groupby('code')['roa'].diff(), np.nan) # 要求必须相邻的两次财报公布才能计算插值

# 整理最终输出数据
roa2 = roa2[['code','droa','announce']].rename(columns={'announce':'date'}).dropna()
roa2['year'] = roa2.date.dt.year
roa2['month'] = roa2.date.dt.month
roa2 = roa2.drop('date',axis=1)
roa2.head()

Unnamed: 0,code,droa,year,month
221213,600816,3.775985e-12,1995,4
223402,600837,9.619808e-12,1995,4
16295,563,-4.408167e-12,1995,3
201999,600617,-2.404052e-10,1995,8
202215,600619,-9.915708e-12,1995,8


In [27]:
roa2.to_csv('.\\AMQI\\processed\\droa_final.csv') # 储存数据

##### Amihud

In [34]:
%%time
# 导入数据
path = '.\\AMQI\\raw\\amihud\\'
info = os.listdir(path)
amihud = pd.DataFrame()

for i in tqdm(info):
    sub_amihud = pd.read_csv(path+i,encoding='gbk',low_memory=False).iloc[2:,:]
    amihud = pd.concat([amihud,sub_amihud])

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:33<00:00,  2.26s/it]

CPU times: total: 12.5 s
Wall time: 34.4 s





In [35]:
# 数据预处理
amihud1 = amihud.copy()
amihud1 = amihud1.rename(columns = {amihud1.columns[0]:'code',
                                    amihud1.columns[1]:'date',
                                    amihud1.columns[2]:'closed',
                                    amihud1.columns[3]:'volumn'})
amihud1['date'] = pd.to_datetime(amihud1['date'])
amihud1 = amihud1.sort_values('date')
amihud1['gap'] = amihud1.groupby('code')['date'].transform(lambda x:x-x.shift(1))

In [37]:
%%time
# 计算日度amihud比率
amihud2 = amihud1.copy()
#amihud2 = amihud2[(amihud2['gap']==pd.Timedelta('1 days')) | (amihud2['gap'].isna())] # 要求连续日期
amihud2[['closed','volumn']] = amihud2[['closed','volumn']].astype(float)
amihud2['return'] = amihud2.groupby('code')['closed'].transform(lambda x:(x-x.shift(1))/x.shift(1))

amihud2['volumn'] = amihud2['volumn']/1000000
amihud2['amihud_d'] = np.abs(amihud2['return'])/amihud2['volumn']
amihud2.head()

CPU times: total: 11.9 s
Wall time: 30.1 s


Unnamed: 0,code,date,closed,volumn,gap,return,amihud_d
2,1,2005-01-04,6.52,11.465602,NaT,,
352951,600668,2005-01-04,3.7,0.971748,NaT,,
208878,605,2005-01-04,6.65,0.310137,NaT,,
351794,600667,2005-01-04,3.2,2.066521,NaT,,
209839,606,2005-01-04,3.4,2.681713,NaT,,


In [38]:
# 数据预处理
amihud3 = amihud2.copy()
amihud3['code'] = amihud3['code'].astype(str).str.zfill(6)
amihud3['year'] = amihud3.date.dt.year
amihud3['month'] = amihud3.date.dt.month

amihud3['count'] = amihud3.groupby(['code','year','month'])['amihud_d'].transform(lambda x:x.count()) # 计算每月个数，为了之后计算六个月平均值做准备
amihud3 = amihud3[~amihud3['count']<10] # 根据appendix，要求每月交易日期不得少于10天

In [40]:
amihud4 = amihud3.groupby(['code','year','month','count'])['amihud_d'].apply(lambda x:x.mean()).reset_index() # 计算月平均amihud
amihud4.head()

Unnamed: 0,code,year,month,count,amihud_d
0,1,2005,1,18,0.000805
1,1,2005,2,13,0.000411
2,1,2005,3,23,0.000674
3,1,2005,4,20,0.000509
4,1,2005,5,17,0.000356


In [41]:
# 定义函数，我们需要一个滑动窗口来估计t期前6个月的平均amihud
def date_shift(df,i):
    df1 = df.copy()
    df1['year'] = np.where(df1['month'] == 12, df1['year'] + 1, df1['year'])
    df1['month'] = np.where(df1['month'] == 12, 1, df1['month']+1)
    df1 = df1.rename(columns={df1.columns[3]:'count'+str(i),df1.columns[4]:'amihud_d'+str(i)})
    return df1

In [42]:
# 计算滑动窗口
amihud41 = date_shift(amihud4,1)
amihud42 = date_shift(amihud41,2)
amihud43 = date_shift(amihud42,3)
amihud44 = date_shift(amihud43,4)
amihud45 = date_shift(amihud44,5)
amihud5 = pd.merge(amihud4,amihud41,on=['code','year','month'],how='left')
amihud5 = pd.merge(amihud5,amihud42,on=['code','year','month'],how='left')
amihud5 = pd.merge(amihud5,amihud43,on=['code','year','month'],how='left')
amihud5 = pd.merge(amihud5,amihud44,on=['code','year','month'],how='left')
amihud5 = pd.merge(amihud5,amihud45,on=['code','year','month'],how='left')

# 计算前6个月平均值
amihud5 = amihud5.dropna()
amihud5['amihud'] = (amihud5['count']*amihud5['amihud_d']+amihud5['count']*amihud5['amihud_d']+
                     amihud5['count']*amihud5['amihud_d']+amihud5['count']*amihud5['amihud_d']+
                     amihud5['count']*amihud5['amihud_d']+amihud5['count']*amihud5['amihud_d'])/(amihud5['count']+amihud5['count1']+
                                                                                                 amihud5['count2']+amihud5['count3']+
                                                                                                 amihud5['count4']+amihud5['count5'])
amihud5.head()

Unnamed: 0,code,year,month,count,amihud_d,count1,amihud_d1,count2,amihud_d2,count3,amihud_d3,count4,amihud_d4,count5,amihud_d5,amihud
5,1,2005,6,21,0.00034,17.0,0.000356,20.0,0.000509,23.0,0.000674,13.0,0.000411,18.0,0.000805,0.000383
6,1,2005,7,21,0.000433,21.0,0.00034,17.0,0.000356,20.0,0.000509,23.0,0.000674,13.0,0.000411,0.000475
7,1,2005,8,23,0.000223,21.0,0.000433,21.0,0.00034,17.0,0.000356,20.0,0.000509,23.0,0.000674,0.000246
8,1,2005,9,19,0.000275,23.0,0.000223,21.0,0.000433,21.0,0.00034,17.0,0.000356,20.0,0.000509,0.000259
9,1,2005,10,7,0.000427,19.0,0.000275,23.0,0.000223,21.0,0.000433,21.0,0.00034,17.0,0.000356,0.000166


In [43]:
# 保存需要的数据
amihud6 = amihud5[['code','year','month','amihud']]
amihud6.to_pickle('.\\AMQI\\processed\\amihud.pkl')

#### 价格数据

In [5]:
# 去除B股数据
def dropB(df,datename):
    '''
    drop stocks in B market
    '''
    bstock = pd.read_pickle('.\\AMQI\\raw\\bstock.pkl')
    if datename != 'date':
        bstock = bstock.rename(columns={'date':datename})
    df = pd.merge(df,bstock,
                  on=['code',datename],how='left')
    return df[(df['type']!=2) & (df['type']!=8)].drop('type',axis=1)

In [6]:
# 导入价格数据
price_month = pd.read_pickle('.\\AMQI\\raw\\price_month_filled.pkl')
price_month = dropB(price_month,'date')
price_month = price_month.drop('risk_free',axis=1) # 去掉错误的无风险利率，需要替换为月度数据

price_month.head()

Unnamed: 0,code,date,close_price,market_value,return,month,year,market,total_value
0,1,1991-05-31,38.34,1016010.0,-0.122253,5,1991,-0.072406,1859496.56
1,1,1991-06-28,33.990002,900735.0,-0.113459,6,1991,-0.083346,1648520.81
2,1,1991-07-31,29.540001,782810.0,-0.130921,7,1991,-0.056298,1432695.05
3,1,1991-08-31,15.0,674833.8,-0.492214,8,1991,-0.217385,1346274.65
4,1,1991-09-30,0.0,0.0,0.0,9,1991,0.173043,


In [13]:
# 换上正确的risk free数据
risk_free = pd.read_csv('.\\AMQI\\raw\\risk_free.csv')

# 数据预处理
risk_free['date'] = pd.to_datetime(risk_free['date'])
risk_free['year'] = risk_free.date.dt.year
risk_free['month'] = risk_free.date.dt.month
risk_free = risk_free.drop(['risk_free','date'],axis=1) # 仅保留月度数据
risk_free = risk_free.drop_duplicates(keep='first')
risk_free = risk_free.groupby(['year','month'])['risk_free_month'].mean().reset_index() # 对于一个月有两个无风险利率的情况，进行平均处理

# 合并数据
price_month1 = pd.merge(price_month,risk_free,
                        how='left',
                        on=['year','month'])
print('Before processed:\n',price_month1.isna().sum())

# 处理total_value缺失值，缺失值是由于停牌导致的
price_month1['total_value'] = price_month1.groupby('code')['total_value'].transform(lambda x:x.ffill())
print('After processed:\n',price_month1.isna().sum())

Before processed:
 code                   0
date                   0
close_price            0
market_value           0
return                 0
month                  0
year                   0
market                 0
total_value        29123
risk_free_month        0
dtype: int64
After processed:
 code               0
date               0
close_price        0
market_value       0
return             0
month              0
year               0
market             0
total_value        0
risk_free_month    0
dtype: int64


In [14]:
price_month1.to_pickle('.\\AMQI\\processed\\price_month.pkl')

In [18]:
a = pd.read_pickle('.\\AMQI\\raw\\price.pkl')

In [22]:
a[(a.code=='000005') & (a.date >= pd.to_datetime('20110315'))].head(20)

Unnamed: 0,code,date,close_price,market_value,return,market,risk_free,total_value
4369038,5,2011-03-15,4.07,3718934.04,-0.02864,-0.014122,8.1e-05,3721337.78
4371052,5,2011-03-16,4.11,3755483.76,0.009828,0.012616,8.1e-05,3757911.12
4373063,5,2011-03-17,4.05,3700659.18,-0.014599,-0.012481,8.1e-05,3703051.11
4375082,5,2011-03-18,4.11,3755483.76,0.014815,0.004014,8.1e-05,3757911.12
4377073,5,2011-03-21,4.09,3737208.9,-0.004866,-0.000478,8.1e-05,3739624.45
4379072,5,2011-03-22,4.06,3709796.61,-0.007335,0.003054,8.1e-05,3712194.44
4381082,5,2011-03-23,4.16,3801170.91,0.024631,0.010917,8.1e-05,3803627.81
4383103,5,2011-03-24,4.14,3782896.05,-0.004808,-7e-06,8.1e-05,3785341.13
4385135,5,2011-03-25,4.17,3810308.34,0.007246,0.009464,8.1e-05,3812771.14
4387145,5,2011-03-28,4.21,3846858.06,0.009592,0.000109,8.1e-05,3849344.49
