# In [7]:
from mpl_toolkits.mplot3d import Axes3D
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import h5py ,os
import datetime,time
from scipy import stats
from math import sqrt
from scipy import optimize
import statsmodels.tsa.api as smt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import math,sklearn
import pywt,gc
from statsmodels.stats.diagnostic import acorr_ljungbox
from multiprocessing.dummy import Pool 
from scipy.stats import spearmanr

# In [None]:
# Encapsulates the whole factor-generation and screening pipeline
class intraday_factor_generator(object):
    """Derive rolling-statistic features from a factor panel and screen them.

    ``raw_factor`` is a DataFrame whose rows are time stamps and whose columns
    are instruments.  Features are named

    * ``'raw_factor'``                      -- the input itself,
    * ``'<lag> <transform> <statistic>'``   -- a rolling statistic over ``lag``
      rows of a transformed panel (``i_x``, ``abs_x``, ``diff_x_1``,
      ``diff_x_2``, ``sma_x``, ``ema_x``),
    * ``'<lag> acc-mean'`` / ``'<lag> acc-std'`` -- rolling mean / std of the
      ratio ``ema / sma``,
    * ``'<lag> rd'``                        -- ``i_x std`` divided by
      ``diff_x_1 std``.

    Every feature array is trimmed to the same rows (the warm-up rows at the
    start are dropped) so that row ``k`` of any array belongs to
    ``timeindex[k]``.
    """

    # The warm-up length is rounded up to a whole multiple of this many rows.
    # NOTE(review): presumably the number of bars in one trading day -- confirm.
    _BARS_PER_BLOCK = 240

    # Statistic name -> function of a pandas Rolling object.  Insertion order
    # defines the order of the generated feature names.
    _ROLLING_STATS = {
        'mean': lambda r: r.mean(),
        'median': lambda r: r.median(),
        'std': lambda r: r.std(),
        'IQR': lambda r: r.quantile(0.75) - r.quantile(0.25),
        'min': lambda r: r.min(),
        'max': lambda r: r.max(),
        'P05': lambda r: r.quantile(0.05),
        'P95': lambda r: r.quantile(0.95),
        'skewness': lambda r: r.skew(),
        'kurtosis': lambda r: r.kurt(),
    }

    # Statistics applied to the rolling window of the ema/sma ratio.
    _RATIO_STATS = {
        'acc-mean': lambda r: r.mean(),
        'acc-std': lambda r: r.std(),
    }

    def __init__(self, raw_factor, holdtillclose=None):
        """Store the input panel.

        Args:
            raw_factor: DataFrame, rows = time stamps, columns = instruments.
            holdtillclose: optional DataFrame that ``factor_samples`` rank-
                correlates the features against.  It can also be supplied
                later, directly to ``factor_samples``.
        """
        self.raw_factor = raw_factor
        self.holdtillclose = holdtillclose

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _lag_terms(multiple, interval):
        """Return the rolling-window lengths ``multiple * interval`` as ints."""
        return [int(v) for v in np.asarray(multiple) * interval]

    @classmethod
    def _warmup_rows(cls, lag_index):
        """Return how many leading rows to discard for the given windows.

        The longest window plus its smoothing window (minus one) is rounded up
        to the next whole multiple of ``_BARS_PER_BLOCK``.  The maximum is
        used instead of the last element so unsorted ``multiple`` also works.
        """
        longest = int(max(lag_index))
        smooth = int(np.round(np.sqrt(longest)))
        block = cls._BARS_PER_BLOCK
        return ((longest + smooth - 1) // block + 1) * block

    @staticmethod
    def _base_transforms(dataset):
        """Return the lag-independent transformed panels keyed by name."""
        diff_1 = dataset.diff()
        return {
            'i_x': dataset,
            'abs_x': abs(dataset),
            'diff_x_1': diff_1,
            'diff_x_2': diff_1.diff(),
        }

    @staticmethod
    def _smoothed(dataset, lag):
        """Return ``(sma, ema)`` of ``dataset`` with window ``round(sqrt(lag))``."""
        window = int(np.round(np.sqrt(lag)))
        # No ``axis`` argument: row-wise is the default and the keyword is
        # deprecated in recent pandas releases.
        return dataset.rolling(window).mean(), dataset.ewm(span=window).mean()

    @staticmethod
    def _parse_feature(name):
        """Split a feature name into ``(lag, transform, statistic_or_None)``."""
        parts = name.split(' ')
        stat = parts[2] if len(parts) > 2 else None
        return int(parts[0]), parts[1], stat

    # --------------------------------------------------------------- generation
    def optimized_factor_generator(self,sample = 20000,multiple = np.array([1,2,5,10]),interval = 240):
        """Compute every feature for every lag and store them on the instance.

        Args:
            sample: number of leading rows of ``raw_factor`` to use, or the
                string ``'all'`` for the whole panel.
            multiple: window multipliers; windows are ``multiple * interval``.
            interval: base window length in rows.

        Sets ``self.alldims`` (feature name -> 2-D array), ``self.features``
        (list of names), ``self.timeindex`` (row labels of the arrays) and
        ``self.tics`` (column labels of the arrays).
        """
        lag_index = self._lag_terms(multiple, interval)
        drop = self._warmup_rows(lag_index)

        if str(sample) == 'all':
            dataset = self.raw_factor.copy()
        else:
            dataset = self.raw_factor.iloc[:sample, :]
        dropindex = dataset.index[drop:]

        base = self._base_transforms(dataset)
        smoothed = {lag: self._smoothed(dataset, lag) for lag in lag_index}

        print('all data prepared')

        def kept(frame):
            # Positional trim: identical rows for every feature, and it stays
            # correct even if the index holds duplicate labels.
            return frame.iloc[drop:, :].values

        # The raw factor is trimmed like every other feature so that all
        # arrays share the same row alignment.
        alldims = {'raw_factor': kept(dataset)}

        for lag in lag_index:
            sma, ema = smoothed[lag]
            sources = dict(base)
            sources['sma_x'] = sma
            sources['ema_x'] = ema

            for trans, frame in sources.items():
                window = frame.rolling(lag)
                for stat, compute in self._ROLLING_STATS.items():
                    alldims['{} {} {}'.format(lag, trans, stat)] = kept(compute(window))

            ratio_window = (ema / sma).rolling(lag)
            for trans, compute in self._RATIO_STATS.items():
                alldims['{} {}'.format(lag, trans)] = kept(compute(ratio_window))

            alldims['{} rd'.format(lag)] = (
                alldims['{} i_x std'.format(lag)]
                / alldims['{} diff_x_1 std'.format(lag)]
            )

        self.alldims = alldims
        self.features = list(alldims.keys())
        self.timeindex = dropindex
        # The feature arrays inherit their columns from raw_factor.
        self.tics = self.raw_factor.columns

    def factor_samples(self,sample_step=30,holdtillclose=None):
        """Sample cross-sections in time and build the per-feature IC series.

        For every ``sample_step``-th row of ``self.timeindex`` and every
        feature, the Spearman rank correlation between the feature's
        cross-section and the matching row of ``holdtillclose`` is stored in
        ``self.cs`` (rows = sampled time stamps, columns = features).
        Undefined correlations are stored as 0.

        Args:
            sample_step: distance, in rows, between sampled cross-sections.
            holdtillclose: DataFrame indexed by the same time stamps; its
                columns are matched to the feature columns by position.
                Falls back to the value given to ``__init__`` and then, for
                backward compatibility with notebook usage, to a module-level
                variable named ``holdtillclose``.

        Raises:
            ValueError: if no ``holdtillclose`` panel can be found.
        """
        if holdtillclose is None:
            holdtillclose = getattr(self, 'holdtillclose', None)
        if holdtillclose is None:
            holdtillclose = globals().get('holdtillclose')
        if holdtillclose is None:
            raise ValueError(
                'holdtillclose must be passed to factor_samples() or to the constructor'
            )

        positions = np.arange(len(self.timeindex), step=sample_step)
        stamps = self.timeindex[positions]
        # One lookup per sampled time stamp instead of one per (feature, stamp).
        targets = [holdtillclose.loc[stamp, :] for stamp in stamps]

        ic = np.empty((len(positions), len(self.features)))
        for col, feature in enumerate(self.features):
            values = self.alldims[feature]
            for row, pos in enumerate(positions):
                ic[row, col] = spearmanr(values[pos, :], targets[row])[0]

        self.cs = pd.DataFrame(ic, index=stamps, columns=self.features).fillna(0)

    def factor_filter(self,corlimit = 0.5,ic_limit = 0.07,var_limit=0.3):
        """Pick features with a strong, stable IC and prune correlated ones.

        Args:
            corlimit: features whose average absolute correlation with an
                already selected feature exceeds this are discarded.
            ic_limit: minimum absolute mean IC.
            var_limit: minimum ``|mean| / std`` of the IC series.

        Returns:
            ``[]`` when nothing (or only ``'raw_factor'``) passes the IC
            screen; otherwise a dict with keys ``'correlations'``,
            ``'factors'``, ``'IC'``, ``'all IC'`` plus one DataFrame per
            selected feature name.
        """

        def ic_summary(x, ax=None):
            # Summary statistics of ``x`` along ``ax``.
            if ax == 1:
                idx = x.index
            elif ax == 0:
                idx = x.columns
            else:
                raise ValueError('ax must be 0 or 1')
            s = pd.DataFrame(None,index=idx,columns=['mean','abs_mean','std','m/s','abs(x)>0.03','abs(x)>0.1','abs(x)>0.3'])
            s['mean'] = np.mean(x, axis=ax)
            s['abs_mean'] = abs(np.mean(x, axis=ax))
            s['std'] = np.std(x, axis=ax)
            s['m/s'] = abs(s['mean'] / s['std'])
            valid = x.count(axis=ax)
            for limit in (0.03, 0.1, 0.3):
                s['abs(x)>{}'.format(limit)] = x[abs(x) > limit].count(axis=ax) / valid
            return s

        # Statistics of each feature's IC series.
        compare_cs = ic_summary(self.cs, ax=0)

        # Keep features whose IC is both large and stable.
        passed = (compare_cs['abs_mean'] > ic_limit) & (compare_cs['m/s'] > var_limit)
        alter_s = list(compare_cs[passed].index)

        if (alter_s == ['raw_factor']) or (alter_s == []):
            print('None factor available')
            return []

        # Feature-by-feature correlation, computed per instrument and then
        # averaged over instruments (NaNs ignored).
        co = np.stack([
            np.atleast_2d(np.corrcoef(np.stack([self.alldims[l][:, i] for l in alter_s])))
            for i in range(len(self.tics))
        ])
        average_cor = pd.DataFrame(np.nanmean(co, axis=0), index=alter_s, columns=alter_s)

        # Greedy pruning: visit features from the least to the most correlated
        # on average; each kept feature rejects every other feature whose
        # correlation with it exceeds ``corlimit``.  A feature is excluded from
        # its own comparison by name, not by testing for a correlation of
        # exactly 1, so rounding on the diagonal cannot remove it and exact
        # duplicates are still pruned.
        strength = abs(average_cor)
        ranking = strength.mean().sort_values(kind='mergesort').index
        kept, rejected = [], set()
        for name in ranking:
            if name in rejected:
                continue
            kept.append(name)
            row = strength.loc[name, :]
            rejected.update(
                other for other in row.index[row > corlimit] if other not in kept
            )
        clean_index = pd.Index(kept)

        output = {}
        output['correlations'] = average_cor.loc[clean_index, clean_index]
        output['factors'] = clean_index
        output['IC'] = compare_cs.T[clean_index]
        output['all IC'] = compare_cs

        for k in output['factors']:
            output[k] = pd.DataFrame(self.alldims[k], columns=self.tics, index=self.timeindex)

        return output

    # ------------------------------------------------------------------- saving
    def factor_saving(self,factors,multiple = np.array([1,2,6,12,24,60,120]),interval = 20):
        """Recompute only the named features on the full ``raw_factor`` panel.

        Args:
            factors: iterable of feature names in the format described in the
                class docstring (``'raw_factor'`` is accepted as well).
            multiple: window multipliers; together with ``interval`` they
                determine how many warm-up rows are dropped.
            interval: base window length in rows.

        Returns:
            ``[alldims, dropindex, columns]`` -- a dict feature name -> 2-D
            array, the row labels of those arrays and ``raw_factor.columns``.
        """
        lag_index = self._lag_terms(multiple, interval)
        drop = self._warmup_rows(lag_index)

        dataset = self.raw_factor.copy()
        dropindex = dataset.index[drop:]
        base = self._base_transforms(dataset)

        requests = [
            (name,) + self._parse_feature(name)
            for name in factors if name != 'raw_factor'
        ]

        # Only build the moving averages that the requested features use.
        needs_smoothing = {
            lag for _, lag, trans, _ in requests
            if trans in ('sma_x', 'ema_x') or trans in self._RATIO_STATS
        }
        smoothed = {lag: self._smoothed(dataset, lag) for lag in sorted(needs_smoothing)}

        print('all data prepared')

        def kept(frame):
            return frame.iloc[drop:, :].values

        def rolled_stat(trans, lag, stat):
            # Rolling ``stat`` over ``lag`` rows of the panel named ``trans``.
            if trans == 'sma_x':
                frame = smoothed[lag][0]
            elif trans == 'ema_x':
                frame = smoothed[lag][1]
            else:
                frame = base[trans]
            return kept(self._ROLLING_STATS[stat](frame.rolling(lag)))

        computed = {}
        for name, lag, trans, stat in requests:
            if trans in self._RATIO_STATS:
                sma, ema = smoothed[lag]
                computed[name] = kept(self._RATIO_STATS[trans]((ema / sma).rolling(lag)))
            elif trans == 'rd':
                computed[name] = rolled_stat('i_x', lag, 'std') / rolled_stat('diff_x_1', lag, 'std')
            else:
                computed[name] = rolled_stat(trans, lag, stat)

        # Preserve the caller's ordering of ``factors`` in the result dict.
        alldims = {}
        for name in factors:
            if name == 'raw_factor':
                alldims[name] = kept(dataset)
            else:
                alldims[name] = computed[name]

        return [alldims, dropindex, self.raw_factor.columns]
def new_back_test(test_factor,holdtillclose,g = 5):
    """Quantile-group back-test of a factor, one cross-section per row.

    For every row label of ``test_factor`` the instruments are split into
    ``g`` equal-frequency groups by factor value (group 1 = lowest) and the
    mean of the matching ``holdtillclose`` row is taken per group.  A row is
    skipped when at least ``1/g`` of its non-missing values are exactly 0, or
    at least ``1/g`` have absolute value exactly 1.

    Returns:
        Float DataFrame with columns ``1..g`` plus ``'h-l'`` (group ``g``
        minus group 1) and ``'l-h'`` (the opposite); rows that were skipped
        or contain a missing group mean are removed.
    """
    labels = list(range(1, g + 1))
    group_return = pd.DataFrame(None, index=test_factor.index, columns=labels)

    for stamp in test_factor.index:
        scores = test_factor.loc[stamp, :]
        threshold = scores.count() / g
        zero_count = scores[scores == 0].count()
        unit_count = scores[abs(scores) == 1].count()
        # Too many identical values would make the quantile edges collapse.
        if zero_count >= threshold or unit_count >= threshold:
            continue
        buckets = pd.qcut(scores, g, labels=list(labels))
        group_return.loc[stamp, :] = holdtillclose.loc[stamp, :].groupby(by=buckets).mean()

    group_return.dropna(axis=0, inplace=True)
    top, bottom = group_return[g], group_return[1]
    group_return['h-l'] = top - bottom
    group_return['l-h'] = bottom - top
    return group_return.astype('float')

def show(df,t):
    """Plot cumulative daily group returns and summarise each group.

    ``df`` is averaged per calendar day (it must therefore have a
    datetime-like index), days containing a zero or a missing value in any
    column are discarded, and the cumulative sum of what remains is plotted,
    one line per column, on the current matplotlib figure with title ``t``.

    Returns:
        DataFrame indexed by the columns of ``df`` with
        ``'Annual Sharpe'`` (``mean / std * sqrt(251)`` of the daily values),
        ``'Maxdrawdown'`` (most negative gap between the cumulative sum and
        its running maximum) and ``'Average Daily Return'``.
    """
    # 'D' is the canonical daily alias; the lowercase spelling is deprecated
    # in recent pandas releases.
    daily = df.resample('D').mean()
    # Exact zeros are treated like missing days and removed with them.
    daily = daily.mask(daily == 0).dropna(axis=0)
    cumulative = daily.cumsum()

    positions = range(len(cumulative))
    for column in cumulative.columns:
        plt.plot(positions, cumulative[column], label='group' + str(column))
    plt.legend()
    plt.title(t)

    max_drawdown = (cumulative - cumulative.expanding().max()).min()
    # NOTE(review): 251 is presumably the number of trading days per year.
    sharpe = daily.mean() * 251 / daily.std() / sqrt(251)
    average_daily = daily.mean()
    return pd.DataFrame({
        'Annual Sharpe': sharpe,
        'Maxdrawdown': max_drawdown,
        'Average Daily Return': average_daily,
    })