In [1]:
#调用包和API接口
import numpy as np
import pandas as pd
import tushare as ts
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import datetime
# Apply seaborn's default theme first; the font override below must come after it.
sns.set()
# Use the WenQuanYi Micro Hei font for sans-serif text
# (presumably so Chinese labels render in plots - confirm the font is installed).
mpl.rcParams.update({'font.sans-serif': 'WenQuanYi Micro Hei'})
# Tushare Pro client used by the data requests in the cells below.
pro = ts.pro_api()

In [2]:
# Create the empty containers used to store results.
# list_fama collects one [date, smb, hml] row per trading day.
list_fama = list()
# Empty frame reserved for daily trade data; not referenced by the visible cells.
df_daily_trade = pd.DataFrame()

In [3]:
# Data-fetching helper: one trading day per call, so no single request pulls too much data.
def get_daily(trade_date='', retries=3, pause=2):
    """Download one trading day's returns and valuation data, retrying on failure.

    Calls ``pro.daily`` (fields ts_code, trade_date, pct_chg) and
    ``pro.daily_basic`` (fields ts_code, trade_date, pb, circ_mv), adds
    ``bm = 1 / pb`` to the second frame and inner-joins both on ``ts_code``.
    Both inputs carry ``trade_date`` and the join key is ``ts_code`` only, so
    the merged frame holds it twice, as ``trade_date_x`` and ``trade_date_y``.

    Args:
        trade_date: Trading day, passed unchanged to both API calls.
        retries: Maximum number of attempts (default 3, as before).
        pause: Seconds to wait between failed attempts (default 2, as before).

    Returns:
        The merged pandas DataFrame.

    Raises:
        RuntimeError: If every attempt fails; chained to the last error.
    """
    # Local import: the notebook's import cell does not import ``time``, so the
    # original retry path raised NameError instead of sleeping.
    import time

    last_error = None
    for attempt in range(retries):
        try:
            df1 = pro.daily(trade_date=trade_date, fields=["ts_code", "trade_date", "pct_chg"])
            df2 = pro.daily_basic(trade_date=trade_date, fields=["ts_code", "trade_date", "pb", "circ_mv"])
            df2['bm'] = 1 / df2['pb']
            df3 = pd.merge(df1, df2, on='ts_code', how='inner')
        except Exception as exc:
            # ``Exception`` rather than a bare except, so Ctrl-C still interrupts the loop.
            last_error = exc
            if attempt < retries - 1:
                time.sleep(pause)
        else:
            return df3

    # Fail loudly instead of silently returning None to the caller.
    raise RuntimeError(
        "get_daily failed for trade_date=%r after %d attempt(s)" % (trade_date, retries)
    ) from last_error

In [4]:
# Factor computation
def _value_weighted_return(pct_chg, circ_mv):
    """Return the circ_mv-weighted mean of ``pct_chg / 100``.

    Rows whose return is missing are left out of both the numerator and the
    weight total. NaN is returned when there is no weight to normalise by.
    """
    valid = pct_chg.notna()
    weights = circ_mv[valid]
    total_weight = weights.sum()
    if total_weight == 0:
        return float('nan')
    return (pct_chg[valid] * weights / 100).sum() / total_weight


def fama3(df_daily):
    """Compute the SMB and HML factor returns from one cross-section of stocks.

    Stocks are split into two size groups at the median of ``circ_mv`` and into
    three book-to-market groups at the 30th and 70th percentiles of ``bm``.
    Each of the six size/value portfolios gets a ``circ_mv``-weighted return
    computed from ``pct_chg`` (a percentage, hence the division by 100).

    Args:
        df_daily: DataFrame with columns ``circ_mv``, ``bm`` and ``pct_chg``.
            It is not modified; the group labels are kept in local Series.

    Returns:
        Tuple ``(smb, hml)``:
        smb is the mean of the three small portfolios minus the mean of the
        three large ones; hml is the mean of the two high-bm portfolios minus
        the mean of the two low-bm ones.
    """
    size_label = pd.qcut(df_daily['circ_mv'], 2, labels=['small', 'large'])
    bm_label = pd.qcut(df_daily['bm'], [0, 0.3, 0.7, 1.0], labels=['low', 'mid', 'high'])

    def portfolio_return(size, bm):
        # Rows with a missing label compare as False and therefore join no portfolio.
        members = (size_label == size) & (bm_label == bm)
        return _value_weighted_return(
            df_daily.loc[members, 'pct_chg'],
            df_daily.loc[members, 'circ_mv'],
        )

    r_sl = portfolio_return('small', 'low')
    r_sm = portfolio_return('small', 'mid')
    r_sh = portfolio_return('small', 'high')

    r_ll = portfolio_return('large', 'low')
    r_lm = portfolio_return('large', 'mid')
    r_lh = portfolio_return('large', 'high')

    smb = (r_sl + r_sm + r_sh - r_ll - r_lm - r_lh) / 3
    hml = (r_lh + r_sh - r_ll - r_sl) / 2

    return smb, hml

In [5]:
# Build the stock list: main-board stocks listed on or before the cutoff date
# (kept for later use).
list_date_cutoff = pd.Timestamp('2018-01-01')  # explicit Timestamp instead of the bare int 20180101
df_stock = pro.stock_basic(exchange='', list_status='L', fields='ts_code,symbol,market,list_date')
# NOTE: despite its name, list_year holds the full listing date as a datetime.
df_stock['list_year'] = pd.to_datetime(df_stock['list_date'])
df_stock = (
    df_stock
    .query('market=="主板"')
    .loc[lambda frame: frame['list_year'] <= list_date_cutoff]
    .sort_values('list_year')
    .dropna()
    # The original called .duplicated('ts_code') and discarded the result, which
    # did nothing; actually drop repeated codes (keeps the earliest listing).
    .drop_duplicates('ts_code')
    .reset_index(drop=True)
)

In [6]:
# Start of data processing.
# Select the trading dates: request the SSE calendar for 2019-12-01 .. 2019-12-31
# with is_open='1', returning only the cal_date column.
df_date = pro.trade_cal(
    exchange='SSE',
    is_open='1',
    start_date='20191201',
    end_date='20191231',
    fields='cal_date',
)

In [7]:
# Compute the factors for every trading day.
# The list is rebuilt here so that re-running this cell does not append duplicate rows.
list_fama = []
for date in df_date['cal_date'].values:
    df_daily = get_daily(trade_date=date)
    if df_daily is None:
        # Report the failing date instead of an opaque TypeError inside fama3.
        raise RuntimeError("no data returned for trade_date=%r" % (date,))
    smb, hml = fama3(df_daily)
    list_fama.append([date, smb, hml])

In [12]:
# Build the frame straight from the list of rows. Going through np.array() would
# coerce the mixed str/float rows to a string array, leaving smb and hml as text.
df_fama = pd.DataFrame(list_fama, columns=['trade_date', 'smb', 'hml'])
df_fama

Unnamed: 0,trade_date,smb,hml
0,20191202,-0.0006150089759712,-0.0013942373783179
1,20191203,0.0036024098504546,-0.0001936556939426
2,20191204,0.0031947581793979,-0.0055685454901305
3,20191205,0.0014665016993588,-0.0084422205563671
4,20191206,0.0002402544147747,-0.0086633346165552
5,20191209,0.00247310690963,0.0008606452205563
6,20191210,0.0035911941250096,-0.0091492687493856
7,20191211,-0.0023999096784445,0.0053531909079649
8,20191212,-0.0013765972757954,-0.0051712483697209
9,20191213,-0.0073221331293638,-0.0017678946435398
