In [212]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-

In [213]:
# Libraries To Use
from __future__ import division 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import os
from sklearn import linear_model
from datetime import datetime,time,date
import matplotlib.pyplot as plt
import theano.tensor as T
from theano import function
import seaborn as sns
from theano.tensor.shared_randomstreams import RandomStreams

In [214]:
# Import My own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *

In [215]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [216]:
path = ff.data_path # path

In [217]:
filenameFCAP = 'LZ_GPA_VAL_A_FCAP.csv'

In [218]:
# --------------------------------------- Global Functions to def---------------------------------- #

In [219]:
# top functions to def
def simpleNormalize(narrowedData):
    '''
    Z-score every row of a DataFrame across its columns.
    Output: Dataframe, same index/columns as the input, each row rescaled to
            (value - row mean) / row std.  NaN entries stay NaN.
    Input:
    narrowedData: Dataframe, rows are normalized independently; NaN values are
                  skipped when the row mean / std are computed.
    Note: despite the historical local names, no winsorizing is performed here.
          The old MAD term was computed but never used, so it has been removed.
    '''
    rowMean = narrowedData.mean(axis=1, skipna=True)
    rowStd = narrowedData.std(axis=1, skipna=True)  # pandas default ddof=1
    # sub/div with axis=0 broadcast the per-row statistics along the columns,
    # which is what the former transpose -> normalize -> transpose did.
    return narrowedData.sub(rowMean, axis=0).div(rowStd, axis=0)

In [220]:
# top function
def neutralizeFactor(normalizedFactorDF, normalizedLFCAPDF, datelist):
    '''
    Regress a factor on the (normalized) cap series date by date and keep the residual.
    Output: Dataframe, same index/columns as normalizedFactorDF.  For each date the
            OLS residual is stored for the stocks that have both a factor value and
            a cap value; everything else is NaN.
    Input:
    normalizedFactorDF: Dataframe, factor values (dates x stocks).
    normalizedLFCAPDF: Dataframe, the regressor (dates x stocks).
    datelist: iterable of index labels present in both frames.
    Note: the regression has a single regressor and no intercept (no constant is
          added), and no industry dummies are used.
    '''
    factorNeutralized = pd.DataFrame(index=normalizedFactorDF.index, columns=normalizedFactorDF.columns, data=None, dtype=float)
    for date in datelist:
        LFCAPIndice = normalizedLFCAPDF.loc[date].dropna()
        factorIndice = normalizedFactorDF.loc[date].dropna()
        intersectionStocks = list(set(LFCAPIndice.index) & set(factorIndice.index))
        try:
            result = sm.OLS(factorIndice.loc[intersectionStocks], LFCAPIndice.loc[intersectionStocks]).fit()
            # Single .loc call with (row, columns): the former chained
            # .loc[date][cols] = ... could write into a temporary copy.
            factorNeutralized.loc[date, intersectionStocks] = result.resid
        except Exception:
            # Best effort, as before: a date that cannot be regressed (e.g. no
            # overlapping stocks) is left entirely NaN.
            factorNeutralized.loc[date] = np.nan
    return factorNeutralized

In [221]:
# --------------------------------------- Function Section End ---------------------------------- #

In [222]:
# Data prepared for neutralization:
# FCAP1 is log10 of the values in the FCAP csv; NormalizedFCAP is its row-wise
# z-score and is the regressor passed to neutralizeFactor in later cells.
FCAP1 = np.log10(pd.read_csv(path+filenameFCAP,infer_datetime_format=True,parse_dates=[0],index_col=0))
NormalizedFCAP = simpleNormalize(FCAP1 )

In [223]:
# --------------------------------------- Calculate Forward Adjusted Price ---------------------------------- #

In [224]:
# the necessary files
filenameAdjustFactor='LZ_GPA_CMFTR_CUM_FACTOR.csv'
filenamePirce='LZ_GPA_QUOTE_TCLOSE.csv'

In [225]:
# First, calculate the forward-adjusted price
def calAdjustedPrice():
    '''
    Build the adjusted price table and write it to
    path + 'Own_Factor_AdjustedPriceForward-1d.csv'.
    Output: None (the result is only written to disk; its index name is printed).
    Input: none; reads the module-level path, filenameAdjustFactor and filenamePirce.
    Note: every factor column is divided by its maximum over the whole file before
          being multiplied with the price, and the product is rounded to 5 decimals.
          NOTE(review): this presumably assumes the maximum is the latest cumulative
          factor - confirm against the data vendor's definition.
    '''
    # Cumulative adjustment factor
    AdjFacBackward = pd.read_csv(path+filenameAdjustFactor,infer_datetime_format=True,parse_dates=[0],index_col=0)

    # Price data to adjust
    PriceToAdj = pd.read_csv(path+filenamePirce,infer_datetime_format=True,parse_dates=[0],index_col=0)

    # Rescale each column of the factor by its own maximum, then apply it
    AdjFacforward = AdjFacBackward/AdjFacBackward.max()
    adjustedPrice = (AdjFacforward*PriceToAdj).round(5)
    adjustedPrice.index.name = 'Own_Factor_AdjustedPriceForward-1d'
    # Parenthesised single-argument print behaves the same on Python 2 and 3
    print(adjustedPrice.index.name)
    adjustedPrice.to_csv(path+'Own_Factor_AdjustedPriceForward-1d.csv',na_rep='NaN',date_format='%Y%m%d')

In [226]:
calAdjustedPrice()

Own_Factor_AdjustedPriceForward-1d


In [227]:
# --------------------------------------- Calculate ILLQ Factor(5-days average) ---------------------------------- #

In [228]:
# 
filenameClose = 'LZ_GPA_QUOTE_TCLOSE.csv'
filenameOpen = 'LZ_GPA_QUOTE_TOPEN.csv'
filenameVolume = 'LZ_GPA_QUOTE_TVOLUME.csv'

In [229]:
def calcILLQ(): # rolling_window is set as 5 days
    '''
    Compute the 5-row rolling mean of |close - open| / open / volume and write it
    to path + 'Own_Factor_ILLQ-1d.csv'.
    Output: Dataframe with the rolling illiquidity values, or None when the open
            and close tables do not have the same shape (a message is printed).
    Input: none; reads the module-level path, filenameOpen, filenameClose and
           filenameVolume.
    '''
    openPrice = pd.read_csv(path+filenameOpen,infer_datetime_format=True,parse_dates=[0],index_col=0)
    closePrice = pd.read_csv(path+filenameClose,infer_datetime_format=True,parse_dates=[0],index_col=0)
    volume = pd.read_csv(path+filenameVolume,infer_datetime_format=True,parse_dates=[0],index_col=0)
    if openPrice.shape != closePrice.shape:
        # Same text as the old print statements, written so it also runs on Python 3
        print('%s %s' % (openPrice.shape, closePrice.shape))
        print('data shape is not equal!')
        return None
    # A zero volume would turn the ratio into +inf, which then contaminates the
    # rolling mean and every later cross-sectional mean/std.  Treat it as missing.
    tradedVolume = volume.where(volume != 0)
    newdf = np.abs((closePrice - openPrice)/openPrice)/tradedVolume
    newdf = newdf.rolling(min_periods=5,window=5,center=False).mean()
    newdf.index.name = 'Own_Factor_ILLQ-1d'
    newdf.to_csv(path+'Own_Factor_ILLQ-1d.csv',na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [230]:
#filenameILLQ = 'Own_Factor_ILLQ-1d.csv'
#openPrice = pd.read_csv(path+filenameILLQ,infer_datetime_format=True,parse_dates=[0],index_col=0)

In [231]:
ILLQdf = calcILLQ()

In [232]:
# Neutralize the ILLQ factor: row-wise z-score ILLQdf, regress it on
# NormalizedFCAP for every date in its index, and save the residuals.
# The index name doubles as the output file name.
NormalizedILLQ = simpleNormalize(ILLQdf)
neutralizedILLQ = neutralizeFactor(NormalizedILLQ, NormalizedFCAP, NormalizedILLQ.index)
neutralizedILLQ.index.name = 'Own_Factor_ADJ_ILLQ_1D'
neutralizedILLQ.to_csv(path+neutralizedILLQ.index.name+'.csv',na_rep='NaN',date_format='%Y%m%d')

In [233]:
# --------------------------------------- Calculate FCAP Adjusted Turnover Volume---------------------------------- #

In [234]:
# --------------------------------------- ILLIQ End--------------------------------- #

In [235]:
# REMINDER: Code in this section can be used to neutralize any new factor! Use this to check some size-affected factor!!!

In [236]:
filenameTURNOVER = 'LZ_GPA_QUOTE_TVOLUME.csv'
filenameFCAP = 'LZ_GPA_VAL_A_FCAP.csv'

In [237]:
# Load the table named by filenameTURNOVER and the FCAP table (log10 applied).
TURNOVER = pd.read_csv(path+filenameTURNOVER,infer_datetime_format=True,parse_dates=[0],index_col=0)
FCAP1 = np.log10(pd.read_csv(path+filenameFCAP,infer_datetime_format=True,parse_dates=[0],index_col=0))

# Both names are rebound to their row-wise z-scores from here on.
TURNOVER= simpleNormalize(TURNOVER)

FCAP1 = simpleNormalize(FCAP1 )

# Dates to neutralize over: every index label of the FCAP table.
datelist = FCAP1.index.tolist()

In [238]:
adjustedTurnOver = neutralizeFactor(TURNOVER , FCAP1, datelist)

In [239]:
adjustedTurnOver.index.name = 'Own_Factor_AdjustedTurnOver-1d'
adjustedTurnOver.to_csv(path+'Own_Factor_AdjustedTurnOver-1d.csv',na_rep='NaN',date_format='%Y%m%d')

In [240]:
# --------------------------------------- Calculate FCAP Adjusted PB ---------------------------------- #

In [241]:
filenamePB = 'LZ_GPA_VAL_PB.csv'
filenameFCAP = 'LZ_GPA_VAL_A_FCAP.csv'
PB = pd.read_csv(path+filenamePB ,infer_datetime_format=True,parse_dates=[0],index_col=0)
PB = simpleNormalize(PB)

In [242]:
adjustedPB = neutralizeFactor(PB , FCAP1, datelist)
adjustedPB.index.name = 'Own_Factor_AdjustedPB-1d'
adjustedPB.to_csv(path+'Own_Factor_AdjustedPB-1d.csv',na_rep='NaN',date_format='%Y%m%d')

In [243]:
pb1 = pd.read_csv(path+'Own_Factor_AdjustedPB-1d.csv',infer_datetime_format=True,parse_dates=[0],index_col=0)

In [244]:
pb1.columns

Index([u'000005.SZ', u'600601.SH', u'600602.SH', u'600651.SH', u'600652.SH',
       u'600653.SH', u'600654.SH', u'600656.SH', u'000004.SZ', u'000002.SZ',
       ...
       u'603985.SH', u'300651.SZ', u'603229.SH', u'603728.SH', u'603896.SH',
       u'603926.SH', u'002871.SZ', u'603086.SH', u'603113.SH', u'603180.SH'],
      dtype='object', length=3317)

In [245]:
# --------------------------------------- Calculate  x-days return volatility  ---------------------------------- #

In [246]:
filenameAdjPrice =  'Own_Factor_AdjustedPriceForward-1d.csv'

In [247]:
def calReturnVol(period): # 90days maybe good
    '''
    Rolling standard deviation of the row-to-row percentage change of the adjusted
    price, written to path + 'Own_Factor_Volatility_<period>d.csv'.
    Output: Dataframe of rolling standard deviations.
    Input:
    period: int, rolling window length in rows.  At least 20 observations are
            required per window (or `period` observations when period < 20, which
            previously made pandas raise because min_periods exceeded the window).
    '''
    AdjPrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0)
    returnDF = AdjPrice.pct_change()
    newdf = returnDF.rolling(min_periods=min(20, period),window=period,center=False).std()
    newdf.index.name = 'Own_Factor_Volatility_%dd' % period
    newdf.to_csv(path+'Own_Factor_Volatility_%dd.csv' % period,na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [248]:
calReturnVol(90).tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_Volatility_90d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-08,0.013169,0.012675,0.010156,0.019321,0.01953,0.012285,0.0,,0.015433,0.014209,...,,,,,,,,,,
2017-05-09,0.01303,0.012343,0.014588,0.019388,0.022357,0.012244,0.0,,0.015113,0.014215,...,,,,,,,,,,
2017-05-10,0.013548,0.012432,0.014828,0.019564,0.022571,0.011962,0.0,,0.015081,0.014199,...,,,,,,,,,,
2017-05-11,0.013547,0.012503,0.016606,0.01933,0.022509,0.011863,0.0,,0.014758,0.014502,...,,,,,,,,,,
2017-05-12,0.013615,0.012569,0.016698,0.019432,0.022623,0.011927,0.0,,0.01475,0.014459,...,,,,,,,,,,


In [249]:
# --------------------------------------- Calculate  x-days return above 20 days MA  ---------------------------------- #

In [250]:
def calAbove20MA(period):
    '''
    Rolling mean of (adjusted price minus its own rolling mean), written to
    path + 'Own_Factor_Above20MA_<period>d.csv'.
    Output: Dataframe with the smoothed deviation from the moving average.
    Input:
    period: int, window length in rows used for BOTH rolling means.  At least 20
            observations are required per window (or `period` when period < 20,
            which previously made pandas raise).
    NOTE(review): the name suggests a fixed 20-row moving average, but the code
    uses `period` for the inner average as well - confirm which is intended.
    '''
    AdjPrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0)
    minObs = min(20, period)
    newdf = AdjPrice - AdjPrice.rolling(min_periods=minObs,window=period,center=False).mean()
    newdf = newdf.rolling(min_periods=minObs,window=period,center=False).mean()
    newdf.index.name = 'Own_Factor_Above20MA_%dd' % period
    newdf.to_csv(path+'Own_Factor_Above20MA_%dd.csv' % period,na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [251]:
calAbove20MA(20).tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_Above20MA_20d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-08,-0.254825,-0.1918,7.727152e-15,-0.6766,-0.7526,-0.183525,1.953993e-15,,-0.755025,-0.62425,...,,,,,,,,,,
2017-05-09,-0.2746,-0.202575,-0.04465,-0.740025,-0.6667,-0.191675,1.953993e-15,,-0.651425,-0.671175,...,,,,,,,,,,
2017-05-10,-0.31575,-0.2186,-0.099775,-0.7745,-0.6123,-0.1985,1.953993e-15,,-0.555775,-0.715275,...,,,,,,,,,,
2017-05-11,-0.36575,-0.234375,-0.182275,-0.8041,-0.55355,-0.204,1.953993e-15,,-0.4685,-0.74165,...,,,,,,,,,,
2017-05-12,,,,,,,,,,,...,,,,,,,,,,


In [252]:
# --------------------------------------- Calculate Aroon  ---------------------------------- #

In [253]:
AdjPrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0).iloc[-1000:]
adcopy = AdjPrice.copy()

In [254]:
adcopy.iloc[:,2].head()

Own_Factor_AdjustedPriceForward-1d
2013-04-02    3.91469
2013-04-03    3.85493
2013-04-08    3.87485
2013-04-09    3.89477
2013-04-10    3.98442
Name: 600602.SH, dtype: float64

In [255]:
def calAroon(data, l=20):
    '''
    Aroon-style indicator for one price Series with a datetime index.
    For row i the previous l rows (i-l .. i-1, row i itself excluded) are
    inspected and the value is
        (calendar days between the date of the maximum and the date of the minimum) / l
    which is positive when the maximum is more recent than the minimum.
    Output: Series of floats on the same index.  The first l rows are NaN, as are
            rows whose window has no usable maximum/minimum.
    Input:
    data: Series, prices indexed by timestamps (e.g. one column via DataFrame.apply).
          It is no longer modified in place.
    l: int, look-back length in rows.
    '''
    # Write into a fresh Series: the old code overwrote the caller's data and left
    # raw prices in the first l rows of the returned indicator.
    aroon = pd.Series(np.nan, index=data.index, name=getattr(data, 'name', None), dtype=float)
    for i in range(l, len(data)):
        window = data.iloc[i-l:i]
        try:
            span = window.idxmax().date() - window.idxmin().date()
            aroon.iloc[i] = pd.Timedelta(span).days / float(l)
        except (AttributeError, TypeError, ValueError):
            # All-NaN / empty window, or index labels that are not timestamps
            aroon.iloc[i] = np.nan
    return aroon

In [256]:
#aroonData = adcopy.iloc[:,:10].apply(calAroon,l=20)

In [257]:
# --------------------------------------- Aroon End  ---------------------------------- #

In [258]:
# --------------------------------------- Calculate  daily deal Amount(yuan)   ---------------------------------- #

In [259]:
filenameTVolume = 'LZ_GPA_QUOTE_TVOLUME.csv' # 成交量
filenameAdjPrice =  'Own_Factor_AdjustedPriceForward-1d.csv'

In [260]:
def calcDDA():
    '''
    Multiply the volume table by the adjusted-price table element-wise and write
    the product to path + 'Own_Factor_DDA-1d.csv'.
    Output: Dataframe with the product, or None when the two tables do not have
            the same shape (a message is printed).
    Input: none; reads the module-level path, filenameTVolume and filenameAdjPrice.
    NOTE(review): the price used is the adjusted price, not the raw close - confirm
    that this is the intended definition of the daily deal amount.
    '''
    turnOver = pd.read_csv(path+filenameTVolume,infer_datetime_format=True,parse_dates=[0],index_col=0)
    closePrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0)
    if turnOver.shape != closePrice.shape:
        # Same text as the old print statements, written so it also runs on Python 3
        print('%s %s' % (turnOver.shape, closePrice.shape))
        print('data shape is not equal!')
        return None
    newdf = turnOver * closePrice
    newdf.index.name = 'Own_Factor_DDA-1d'
    newdf.to_csv(path+'Own_Factor_DDA-1d.csv',na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [261]:
calcDDA()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_DDA-1d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-04,1.120256e+04,9.511504e+04,3.648641e+04,4.231575e+04,4.286417e+04,46478.301754,34848.379104,7.744337e+04,13739.751792,1.176296e+05,...,,,,,,,,,,
2005-01-05,1.104777e+04,1.539230e+05,7.705574e+04,1.189617e+05,3.607527e+04,36450.030077,66542.020171,2.682239e+04,24501.571164,2.070431e+05,...,,,,,,,,,,
2005-01-06,1.028649e+04,7.519487e+04,9.378221e+04,1.406231e+05,5.684882e+04,30255.485960,33524.838585,8.755773e+03,23666.695054,2.156412e+05,...,,,,,,,,,,
2005-01-07,4.150195e+04,1.162695e+05,8.110980e+04,5.668555e+04,5.576362e+04,34962.611051,62727.421697,3.152954e+04,17692.454404,2.093654e+05,...,,,,,,,,,,
2005-01-10,1.024396e+05,1.451684e+05,8.156218e+04,6.196563e+04,2.280886e+04,35335.227232,113085.368636,2.329403e+05,15792.835194,8.330268e+04,...,,,,,,,,,,
2005-01-11,2.546677e+04,7.329183e+04,6.653886e+04,7.007041e+04,2.777977e+04,30690.642067,53326.139704,4.126954e+05,51265.468366,7.055445e+04,...,,,,,,,,,,
2005-01-12,1.819887e+04,9.166338e+04,5.598795e+04,3.804263e+04,2.549850e+04,29026.524715,54666.832954,2.388097e+05,22423.051945,8.959669e+04,...,,,,,,,,,,
2005-01-13,2.032021e+04,1.246899e+05,4.048227e+04,1.689614e+05,2.643201e+04,36763.938919,77921.212387,3.383591e+05,49882.869110,1.099860e+05,...,,,,,,,,,,
2005-01-14,1.890111e+04,1.520970e+05,5.239335e+04,1.093032e+05,6.019900e+04,39863.661422,56831.726570,3.200100e+05,42039.785261,9.886799e+04,...,,,,,,,,,,
2005-01-17,2.449155e+04,1.062675e+05,7.289316e+04,5.262947e+04,8.441836e+04,57976.111495,47497.162300,3.989485e+05,37938.614077,1.166628e+05,...,,,,,,,,,,


In [262]:
filenameDDA = 'Own_Factor_DDA-1d.csv'
DDAdf = pd.read_csv(path+filenameDDA,infer_datetime_format=True,parse_dates=[0],index_col=0)

In [263]:
def DDAaverage(period):
    '''
    Rolling mean of the module-level DDAdf, written to
    path + 'Own_Factor_DDA-<period>d.csv'.
    Output: Dataframe of rolling means.
    Input:
    period: int, window length in rows.  At least 20 observations are required per
            window (or `period` when period < 20, which previously made pandas
            raise because min_periods exceeded the window).
    '''
    DDAmean = DDAdf.rolling(min_periods=min(20, period),window=period,center=False).mean()
    DDAmean.index.name = 'Own_Factor_DDA-%dd' % period
    DDAmean.to_csv(path+'Own_Factor_DDA-%dd.csv' % period, na_rep='NaN',date_format='%Y%m%d')
    return DDAmean

In [264]:
DDA20df= DDAaverage(20)

In [265]:
# Neutralize DDA20: row-wise z-score DDA20df, regress it on NormalizedFCAP for
# every date in its index, and save the residuals.  The index name doubles as
# the output file name.
NormalizedDDA20 = simpleNormalize(DDA20df)
neutralizedDDA20 = neutralizeFactor(NormalizedDDA20, NormalizedFCAP, NormalizedDDA20.index)
neutralizedDDA20.index.name = 'Own_Factor_ADJ_DDA_20D'
neutralizedDDA20.to_csv(path+neutralizedDDA20.index.name+'.csv',na_rep='NaN',date_format='%Y%m%d')

In [266]:
# --------------------------------------- Calculate  annual idiosyncratic volatility(daily updated)   ---------------------------------- #

In [267]:
def cal_factor_return(factor, stkreturn, factorname):
    '''
    Daily long-short return of a factor built from its bottom and top groups.
    For every date except the first and the last one in factor.index, stocks that
    have both a factor value and a return are split at the 0.33 and 0.66 quantiles
    of the factor.  Each leg's return is weighted by the factor value itself, and
    the result is (low-factor leg) - (high-factor leg).
    Output: Dataframe with one float column named `factorname`.
    Input:
    factor: Dataframe, factor values (dates x stocks); the caller is expected to
            have shifted it already so that no same-day information is used.
    stkreturn: Dataframe, stock returns (dates x stocks), not shifted.
    factorname: String, name of the output column.
    '''
    def _leg_return(weights, returns):
        # factor-value-weighted average return of the stocks in `weights`
        return (weights * returns.loc[weights.index]).sum() / weights.sum()

    evalDates = factor.index[1:-1]
    factorReturn = pd.DataFrame(index=evalDates, columns=[factorname], data=None, dtype=float)
    for day in evalDates:
        exposures = factor.loc[day].dropna()
        dayReturns = stkreturn.loc[day].dropna()
        # keep only stocks present on both sides
        common = list(set(exposures.index) & set(dayReturns.index))
        exposures = exposures.loc[common]
        dayReturns = dayReturns.loc[common]

        lowCut = exposures.quantile(0.33)
        highCut = exposures.quantile(0.66)
        lowLeg = _leg_return(exposures[exposures <= lowCut], dayReturns)
        highLeg = _leg_return(exposures[exposures >= highCut], dayReturns)
        factorReturn.loc[day] = lowLeg - highLeg
    return factorReturn

In [268]:
startTime =  datetime.strptime('20100101', '%Y%m%d')
endTime = datetime.strptime('20170328', '%Y%m%d')
filenameAdjPrice =  'Own_Factor_AdjustedPriceForward-1d.csv'
filenameFCAP = 'LZ_GPA_VAL_A_FCAP.csv'
filenamePB='LZ_GPA_VAL_PB.csv'  # 市净率
filenameBENCH = 'LZ_GPA_INDXQUOTE_CLOSE.csv'
ZZ500Index = '000905.SH' #   ZZ500 index code

In [269]:
# Load price, FCAP, PB and the benchmark column, all restricted to startTime..endTime.
pricedf = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
FCAPdf  = pd.read_csv(path+filenameFCAP,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
PBdf = pd.read_csv(path+filenamePB,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
# Benchmark: row-to-row percentage change of the ZZ500Index column.
benchmarkdf = pd.read_csv(path+filenameBENCH,infer_datetime_format=True,parse_dates=[0],index_col=0)[ZZ500Index].loc[startTime:endTime].pct_change()
# Stock returns: row-to-row percentage change of the adjusted price.
returndf = pricedf.pct_change()
# Lag the factor tables by one row so each date only sees the previous row's values.
FCAPdf = FCAPdf .shift(1)
PBdf = PBdf.shift(1)
FCAPdf.tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
LZ_GPA_VAL_A_FCAP-d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-22,631611.7134,985506.1506,825918.6874,815507.8476,756457.5094,670610.0417,1316040.0,,329771.5014,20561770.0,...,,,,,,,,,,
2017-03-23,623050.7113,972336.8034,825918.6874,797829.5436,732637.1453,668863.6614,1316040.0,,330518.1539,20454980.0,...,,,,,,,,,,
2017-03-24,619245.8215,965752.1298,825918.6874,793986.4341,721048.86,661878.1401,1316040.0,,324296.0501,20823890.0,...,,,,,,,,,,
2017-03-27,621148.2664,974531.6946,825918.6874,800135.4094,708816.7811,665370.9008,1316040.0,,319982.0581,20862720.0,...,,,,,,,,,,
2017-03-28,616392.1541,976726.5858,825918.6874,803209.897,703666.4321,661878.1401,1316040.0,,315502.1434,20583220.0,...,,,,,,,,,,


In [270]:
SizeReturn = cal_factor_return(FCAPdf, returndf , 'Size_Return')

In [271]:
SizeReturn.isnull().sum()

Size_Return    0
dtype: int64

In [272]:
PBReturn = cal_factor_return(PBdf, returndf , 'PB_Return')

In [273]:
PBReturn.isnull().sum()

PB_Return    0
dtype: int64

In [274]:
benchmarkdf = benchmarkdf.loc[PBReturn.index[0]:PBReturn.index[-1]]

In [275]:
benchmarkdf .isnull().sum()

0

In [276]:
returndf = returndf.loc[PBReturn.index[0]:PBReturn.index[-1]]

In [277]:
startOfMonthList, endOfMonthList = ff.getLastDayOfMonth(PBReturn.index)

In [278]:
len(startOfMonthList)

87

In [279]:
len(endOfMonthList)

87

In [280]:
datetuple = list(zip(startOfMonthList,endOfMonthList))

In [281]:
def calIdoVol(returnDF , factorReturnList , datetuple ,torelance = 0.05):
    '''
    Calculate the idiosyncratic volatility window by window.
    For every (start, end) pair each eligible stock's returns are regressed (OLS,
    no constant added) on the factor returns of the same window; the standard
    deviation of the residuals times sqrt(242) is stored on every row of that window.
    Output: Dataframe, the windows stacked in the given order, with the columns of
            returnDF.  Stocks that are filtered out stay NaN.  An empty Dataframe
            is returned when datetuple is empty.
    Input:
    returnDF: Dataframe, returns of all stocks (dates x stocks).
    factorReturnList: List, each element is a factor-return Dataframe/Series indexed
                      by date (one value column each).  The caller is expected to
                      have built them from shifted factors so no future data is used.
    datetuple: List, each element is a (startTime, endTime) pair, usually
               zip(startOfMonthList, endOfMonthList).
    torelance: float, maximum share of NaN rows a stock may have inside a window;
               remaining gaps are forward- then backward-filled before the regression.
    '''
    windowFrames = []
    for window in datetuple:
        start, end = window[0], window[1]
        returnDFSlice = returnDF.loc[start:end]
        tempdf = pd.DataFrame(index = returnDFSlice.index, columns = returnDFSlice.columns ,data = None ,dtype =float)
        # NaN count per stock must be compared with the number of ROWS in the
        # window (shape[0]).  The former shape[1] (number of stocks) made the
        # threshold far larger than any window, so nothing was ever filtered.
        nanPerStock = returnDFSlice.isnull().sum()
        newReturnSlice = returnDFSlice.loc[:, nanPerStock < returnDFSlice.shape[0] * torelance]
        newReturnSlice = newReturnSlice.ffill().bfill()
        # A real list (not a lazy map object) so it works the same on Python 2 and 3
        totaldf = pd.concat([f.loc[start:end] for f in factorReturnList], axis=1)
        for stk in newReturnSlice.columns:
            result = sm.OLS(newReturnSlice[stk],totaldf).fit()
            # tempdf only holds this window's rows, so assigning the whole column is
            # the same as the old chained tempdf[stk].loc[start:end] = ..., without
            # writing through a temporary.
            tempdf[stk] = np.std(result.resid) * np.sqrt(242)
        windowFrames.append(tempdf)
    if not windowFrames:
        return pd.DataFrame()
    # One concat at the end instead of DataFrame.append inside the loop
    return pd.concat(windowFrames)

In [282]:
dfList = [PBReturn,SizeReturn,benchmarkdf]

In [283]:
dd = calIdoVol(returndf , dfList,datetuple)

In [284]:
dd.index.name = 'Own_FACTOR_Idiosyncratic_Volatility'
dd.head()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_FACTOR_Idiosyncratic_Volatility,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-06,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-07,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-08,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-11,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,


In [285]:
dd.to_csv(path+'Own_Factor_Idiosyncratic_Volatility.csv', na_rep='NaN',date_format='%Y%m%d')

In [286]:
# --------------------------------------- Calculate  annual idiosyncratic volatility(daily updated)   ---------------------------------- #

In [287]:
ddc =dd.tail(100)

In [288]:
filenameIDIVOL = 'Own_Factor_Idiosyncratic_Volatility.csv'
idio = pd.read_csv(path+filenameIDIVOL,infer_datetime_format=True,parse_dates=[0],index_col=0)

In [289]:
idio.head()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_FACTOR_Idiosyncratic_Volatility,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-06,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-07,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-08,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-11,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,


In [290]:
dd.head()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_FACTOR_Idiosyncratic_Volatility,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-06,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-07,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-08,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,
2010-01-11,0.267502,0.275782,0.183632,0.256143,0.131893,0.245651,0.192114,0.318826,0.0,0.161892,...,,,,,,,,,,


In [291]:
def adj_boxplot(factorData):
    '''
    Winsorize every row with the medcouple-adjusted boxplot fences, then z-score
    every row.
    Output: Dataframe, the winsorized and normalized data (same index/columns).
    Input:
    factorData: Dataframe, raw data, can contain nan value.  It is not modified.
    For each row, with q1/q3 the values at positions int(0.25*n) / int(0.75*n) of
    the sorted non-NaN data, iqr = q3 - q1 and mc the medcouple:
        mc >= 0: fences are q1 - 1.5*exp(-3.5*mc)*iqr and q3 + 1.5*exp(4*mc)*iqr
        mc <  0: fences are q1 - 1.5*exp(-4*mc)*iqr   and q3 + 1.5*exp(3.5*mc)*iqr
    Values outside the fences are set to the fence.  Rows with no data are skipped.
    '''
    copyData = factorData.copy()
    for i in copyData.index:
        temp = copyData.loc[i]
        x = temp.dropna().values
        if len(x) > 0:
            mc = sm.stats.stattools.medcouple(x)
            # np.sort returns a new array; the former in-place x.sort() fails when
            # .values hands back a read-only buffer.
            x = np.sort(x)
            q1 = x[int(0.25*len(x))]
            q3 = x[int(0.75*len(x))]
            iqr = q3-q1
            if mc >= 0:
                l = q1-1.5*np.exp(-3.5*mc)*iqr
                u = q3+1.5*np.exp(4*mc)*iqr
            else:
                l = q1-1.5*np.exp(-4*mc)*iqr
                u = q3+1.5*np.exp(3.5*mc)*iqr
            # Write the clipped row back explicitly.  The old code mutated `temp`
            # and relied on it being a view of copyData, which is not guaranteed.
            copyData.loc[i] = temp.clip(lower=float(l), upper=float(u))
    Trans = copyData.T
    return ((Trans  - Trans .mean(axis=0, skipna=True))/Trans .std(axis=0, skipna=True)).T

In [292]:
%mprun  addd = adj_boxplot(dd)

ERROR:root:Line magic function `%mprun` not found.


In [293]:
%lprun -f adj_boxplot adj_boxplot(idio)

In [294]:
# --------------------------------------- Calculate  some random factor  ---------------------------------- #

In [295]:
filenameAdjPrice =  'Own_Factor_AdjustedPriceForward-1d.csv'
filenameVolume = 'LZ_GPA_QUOTE_TVOLUME.csv'
closePrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0)
tradVol = pd.read_csv(path+filenameVolume,infer_datetime_format=True,parse_dates=[0],index_col=0)

In [296]:
def calrandfac(pricedf, factordf):
    '''
    Sign the factor by the direction of the price move and smooth it exponentially.
    Output: Dataframe, exponentially weighted mean (halflife 5, at least 5
            observations, NaN ignored in the weighting) of factor * sign(price change).
    Input:
    pricedf: Dataframe, prices; the sign of its row-to-row percentage change is used.
    factordf: Dataframe, the values to be signed (aligned with pricedf by label).
    '''
    direction = np.sign(pricedf.pct_change())
    signedFactor = factordf * direction
    smoother = signedFactor.ewm(halflife=5, min_periods=5, ignore_na=True)
    return smoother.mean()

In [297]:
overheatVol = calrandfac(closePrice, tradVol)

In [298]:
overheatVol

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
LZ_GPA_QUOTE_TVOLUME-d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-04,,,,,,,,,,,...,,,,,,,,,,
2005-01-05,,,,,,,,,,,...,,,,,,,,,,
2005-01-06,,,,,,,,,,,...,,,,,,,,,,
2005-01-07,,,,,,,,,,,...,,,,,,,,,,
2005-01-10,,,,,,,,,,,...,,,,,,,,,,
2005-01-11,17362.211023,15599.700509,8196.017254,7338.990601,1224.180126,6698.553599,10590.838322,-14326.736494,3880.029943,28860.425910,...,,,,,,,,,,
2005-01-12,10402.596662,4943.392692,3961.613465,3901.998182,-714.132059,2198.381750,3396.387402,1668.029096,2044.179410,39472.459598,...,,,,,,,,,,
2005-01-13,11233.659633,12637.857932,1585.294374,9984.066901,970.715897,1740.174325,8768.956296,-15194.558737,3484.987477,50336.580403,...,,,,,,,,,,
2005-01-14,6374.148763,201.505827,-598.890251,3853.326015,-2465.104131,-2078.950153,2830.837631,-27492.719639,1318.541297,24416.840946,...,,,,,,,,,,
2005-01-17,1834.721746,-6514.614880,-3010.419273,1223.984240,-6407.404637,-6573.772657,-1101.438848,-41250.269022,-228.435338,1917.324023,...,,,,,,,,,,


In [299]:
overheatVol.index.name = 'OVER_HEAT_VOL'
overheatVol.to_csv(path+'Over_Heat_Volume.csv', na_rep='NaN',date_format='%Y%m%d')

In [300]:
# --------------------------------------- Calculate  skewness  ---------------------------------- #
filenameAdjPrice =  'Own_Factor_AdjustedPriceForward-1d.csv'
closePrice = pd.read_csv(path+filenameAdjPrice,infer_datetime_format=True,parse_dates=[0],index_col=0)

In [301]:
# calculate N-days skewness of the price
def calSkewness(pricedf,period):
    '''
    Rolling skewness of the values in pricedf, written to
    path + 'Own_Factor_Skewness_<period>d.csv'.
    Output: Dataframe of rolling skewness.
    Input:
    pricedf: Dataframe, the series to measure (the skewness is taken of the values
             as passed in, not of their returns).
    period: int, window length in rows.  At least 250 observations are required per
            window (or `period` when period < 250, which previously made pandas
            raise because min_periods exceeded the window).
    '''
    df = pricedf.rolling(min_periods=min(250, period),window=period,center=False).skew()
    df.index.name =  'Own_Factor_Skewness_%dd' % period
    df.to_csv(path+'Own_Factor_Skewness_%dd.csv' % period,na_rep='NaN',date_format='%Y%m%d')
    return df

In [302]:
priceSkewness = calSkewness(closePrice,250)

In [303]:
# --------------------------------------- Calculate  TurnOver Rate Volatility ---------------------------------- #
filenameTOR= 'LZ_GPA_VAL_TURN.csv'
turnoverdf =  pd.read_csv(path+filenameTOR,infer_datetime_format=True,parse_dates=[0],index_col=0)
FCAP1 = np.log10(pd.read_csv(path+filenameFCAP,infer_datetime_format=True,parse_dates=[0],index_col=0))

In [304]:
def calToRvol(period):
    '''
    Rolling standard deviation of the module-level turnoverdf, written to
    path + 'Own_Factor_Turnover_Volatility_<period>D.csv'.
    Output: Dataframe of rolling standard deviations.
    Input:
    period: int, window length in rows.  At least 20 observations are required per
            window (or `period` when period < 20, which previously made pandas
            raise because min_periods exceeded the window).
    '''
    newdf = turnoverdf.rolling(min_periods=min(20, period),window=period,center=False).std()
    newdf.index.name = 'Own_Factor_Turnover_Volatility_%dD' % period
    newdf.to_csv(path+'Own_Factor_Turnover_Volatility_%dD.csv' % period,na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [305]:
newsdf = calToRvol(20)

In [306]:
newsdf.tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_Turnover_Volatility_20D,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-08,1.250134,0.212963,,,0.765779,0.15406,,,,0.142627,...,,,,,,,,,,
2017-05-09,1.257208,0.179756,,,1.5473,0.181474,,,,0.145766,...,,,,,,,,,,
2017-05-10,1.220779,0.173083,,,1.732003,0.191652,,,,0.137896,...,,,,,,,,,,
2017-05-11,0.532665,0.180367,,,1.760893,0.207445,,,,0.100042,...,,,,,,,,,,
2017-05-12,,,,,,,,,,,...,,,,,,,,,,


In [307]:
NormalizedTOV = simpleNormalize(newsdf )
NormalizedFCAP = simpleNormalize(FCAP1 )

In [308]:
neutralizedTOV = neutralizeFactor(NormalizedTOV, NormalizedFCAP, NormalizedTOV.index)

In [309]:
neutralizedTOV.index.name = 'Own_Factor_ADJ_Turnover_Volatility_20D'
neutralizedTOV.to_csv(path+neutralizedTOV.index.name+'.csv',na_rep='NaN',date_format='%Y%m%d')

In [310]:
def calToRVolD(period, baseWindow=500):
    '''
    Relative deviation of the short-window mean of the module-level turnoverdf from
    its long-window mean: short_mean / long_mean - 1, written to
    path + 'Own_Factor_Turnover_Volatility_deviation_<period>D.csv'.
    Output: Dataframe of relative deviations.
    Input:
    period: int, short window length in rows.  At least 20 observations are
            required per window (or `period` when period < 20, which previously
            made pandas raise).
    baseWindow: int, long window length in rows; a full window is required.
                Defaults to 500, the value that used to be hard-coded.
    NOTE(review): despite "Volatility" in the output name, both terms are rolling
    means, not standard deviations - confirm the intended definition.
    '''
    newdf = turnoverdf.rolling(min_periods=min(20, period),window=period,center=False).mean()
    newdf1 = turnoverdf.rolling(min_periods=baseWindow,window=baseWindow,center=False).mean()
    newdf = newdf / newdf1 -1
    newdf.index.name = 'Own_Factor_Turnover_Volatility_deviation_%dD' % period
    newdf.to_csv(path+'Own_Factor_Turnover_Volatility_deviation_%dD.csv' % period,na_rep='NaN',date_format='%Y%m%d')
    return newdf

In [311]:
df2 = calToRVolD(20)

In [312]:
df2 .tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_Turnover_Volatility_deviation_20D,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-08,-0.634183,-0.814758,,,-0.459947,-0.817694,,,,-0.774554,...,,,,,,,,,,
2017-05-09,-0.639129,-0.822977,,,-0.373184,-0.826478,,,,-0.779787,...,,,,,,,,,,
2017-05-10,-0.669533,-0.825602,,,-0.328533,-0.839734,,,,-0.791583,...,,,,,,,,,,
2017-05-11,-0.741654,-0.823224,,,-0.288398,-0.848144,,,,-0.803355,...,,,,,,,,,,
2017-05-12,,,,,,,,,,,...,,,,,,,,,,


In [313]:
NormalizedTOVD = simpleNormalize(df2)
neutralizedTOVD = neutralizeFactor(NormalizedTOVD, NormalizedFCAP, NormalizedTOVD.index)
neutralizedTOVD.index.name = 'Own_Factor_ADJ_Turnover_Volatility_Deviation_20D'
neutralizedTOVD.to_csv(path+neutralizedTOVD.index.name+'.csv',na_rep='NaN',date_format='%Y%m%d')

In [314]:
neutralizedTOVD .tail()

Unnamed: 0_level_0,000005.SZ,600601.SH,600602.SH,600651.SH,600652.SH,600653.SH,600654.SH,600656.SH,000004.SZ,000002.SZ,...,603985.SH,300651.SZ,603229.SH,603728.SH,603896.SH,603926.SH,002871.SZ,603086.SH,603113.SH,603180.SH
Own_Factor_ADJ_Turnover_Volatility_Deviation_20D,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-08,-0.425492,-0.726569,,,-0.174244,-0.705414,,,,-0.885363,...,,,,,,,,,,
2017-05-09,-0.411459,-0.725081,,,-0.025334,-0.701809,,,,-0.897248,...,,,,,,,,,,
2017-05-10,-0.432598,-0.705105,,,0.073274,-0.701956,,,,-0.875218,...,,,,,,,,,,
2017-05-11,-0.533535,-0.698631,,,0.150445,-0.711468,,,,-0.916507,...,,,,,,,,,,
2017-05-12,,,,,,,,,,,...,,,,,,,,,,
