In [12]:
from multiprocessing import Pool
# from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import time
import os
# import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

def clean_data(iStocks):
    """Load one stock's tick snapshots and aggregate them into 1-minute bars.

    Relies on names defined elsewhere in the notebook: ``stocks_tw``
    (indexable collection of ticker strings), ``stockDataDir`` (path prefix
    that is concatenated directly with the file name) and ``cols`` (columns
    handed to ``pandas.read_csv``).

    Parameters
    ----------
    iStocks : int
        Position of the ticker inside ``stocks_tw``.

    Returns
    -------
    pandas.DataFrame or None
        One row per minute between 09:00 and 13:25 with the columns
        ``nexttick_px``, ``nexttick_size``, ``close``, ``open``, ``high``,
        ``low`` and ``size``.  ``None`` is returned when neither data file
        exists (previously this case crashed with ``UnboundLocalError``).
    """
    ticker = stocks_tw[iStocks]
    print("Processing " + ticker)

    ###load stock tick data (gzip)
    file1Path = stockDataDir + ticker + '_md_201801_201812.csv.gz'
    file2Path = stockDataDir + ticker + '_md_201901_201903.csv.gz'

    frames = []
    if os.path.exists(file1Path):
        frames.append(pd.read_csv(file1Path, compression='gzip', usecols = cols))
        print('Data(First file) for ' + ticker + ' loaded.')
    if os.path.exists(file2Path):
        frames.append(pd.read_csv(file2Path, compression='gzip', usecols = cols))
        print('Data(Second file) for ' + ticker + ' loaded.')
    if not frames:
        print('Skipping snapshots data for ' + ticker + '.')
        return None

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    df = frames[0] if len(frames) == 1 else pd.concat(frames)

    # keep only snapshots with positive SP1/BP1 and SP1 strictly above BP1
    valid = (df['SP1'] > 0) & (df['BP1'] > 0) & (df['SP1'] - df['BP1'] > 0)
    df = df[valid].copy()

    # '%H%M' parses onto 1900-01-01, so (date - 1900-01-01) + parsed time gives
    # date + time-of-day.  The last five characters of 'time' are dropped
    # (presumably seconds and milliseconds -- TODO confirm the raw format).
    df.index = (pd.to_datetime(df['date'], format='%Y-%m-%d')
                - pd.to_datetime('1900', format='%Y')
                + pd.to_datetime(df['time'].astype(str).str.slice(0, -5), format='%H%M'))
    df['lastPx'] = df['lastPx'] / 100
    df['size'] = df['size'] * 1000
    df['volume'] = df['volume'] * 1000

    # value of the tick that follows each snapshot; per minute the first
    # non-missing one is kept
    next_tick = pd.DataFrame({'nexttick_px': df['lastPx'].shift(-1),
                              'nexttick_size': df['size'].shift(-1)})
    data = next_tick.resample('min').first()

    price_by_minute = df['lastPx'].resample('min')
    # 'close' is the last price of the PREVIOUS minute bin (shifted by one row)
    data['close'] = price_by_minute.last().shift(1)
    data['open'] = price_by_minute.first()
    data['high'] = price_by_minute.max()
    data['low'] = price_by_minute.min()
    data['size'] = df['size'].resample('min').sum()

    price_cols = ['nexttick_px', 'close', 'open', 'high', 'low']
    data[price_cols] = data[price_cols].ffill()
    data[['size','nexttick_size']] = data[['size','nexttick_size']].fillna(0)
    data = data.between_time('9:00','13:25')
    return data

In [9]:
def momentum(close,open,high,low,size):
    """Momentum-versus-EMA crossover signal evaluated on 5-minute bars.

    ``close`` is resampled to 5-minute bars (last value, forward filled,
    restricted to 09:00-13:25).  ``mom`` is the ``n``-bar price difference and
    ``ema`` its exponentially weighted mean, restarted on every calendar day.

    * ``+1`` where the previous ``mom`` is <= the current ``ema`` and the
      current ``mom`` is above it,
    * ``-1`` where the previous ``mom`` is >= the current ``ema`` and the
      current ``mom`` is below it,
    * otherwise the last non-zero signal is carried forward.

    The first ``n`` bars of each day are then forced to 0 and the result is
    expanded to a 1-minute grid by forward filling.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} on a 1-minute index within 09:00-13:25.

    n = parameter for momentum
    alpha = decaying parameter for exponential moving average, 0 < alpha <=1
    """
    n = 1
    alpha = 0.01
    close = close.resample('5min').last().ffill().between_time('9:00','13:25')
    day = close.index.date

    mom = close.diff(n)
    # transform() keeps the original index on every pandas version, whereas
    # groupby.apply() prepends the group key on pandas >= 2
    ema = mom.groupby(day).transform(lambda col: col.ewm(alpha=alpha).mean())

    signal = pd.Series(0, index=close.index)
    signal.loc[(mom.shift(1) <= ema) & (mom > ema)] = 1
    signal.loc[(mom.shift(1) >= ema) & (mom < ema)] = -1

    # carry the last non-zero signal forward (vectorised form of the old loop)
    signal = signal.where(signal != 0).ffill().fillna(0).astype(int)

    # reset the first n signals of every day to 0 to avoid the interday effect;
    # a boolean mask is used because chained assignment is not written back
    # under pandas Copy-on-Write
    bar_of_day = signal.groupby(day).cumcount()
    signal.loc[bar_of_day < n] = 0

    signal = signal.resample('1min').asfreq().ffill()
    return signal.between_time('9:00','13:25')

def Ichimuku_ver1a(close,open,high,low,size):
    """Simple Ichimoku base-line crossover on 5-minute bars (window ``n2 = 1``).

    ``close``/``high``/``low`` are resampled to 5-minute bars (last/max/min,
    forward filled, restricted to 09:00-13:25).  ``base`` is the mean of the
    rolling ``n2``-bar highest high and lowest low.

    * ``+1`` where the previous base is >= the previous close and the current
      base is below the current close,
    * ``-1`` where the previous base is <= the previous close and the current
      base is above the current close,
    * otherwise the last non-zero signal is carried forward.

    The first ``n2`` bars of every day are then forced to 0 and the result is
    forward filled onto a 1-minute grid.

    Parameters
    ----------
    close, high, low : pandas.Series
        Series indexed by a ``DatetimeIndex``.
    open, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} on a 1-minute index within 09:00-13:25.
    """
    n2 = 1
    close = close.resample('5min').last().ffill().between_time('9:00','13:25')
    high = high.resample('5min').max().ffill().between_time('9:00','13:25')
    low = low.resample('5min').min().ffill().between_time('9:00','13:25')
    base = (high.rolling(n2).max() + low.rolling(n2).min())/2

    signal = pd.Series(0, index=close.index)
    signal.loc[(base.shift(1) >= close.shift(1)) & (base < close)] = 1
    signal.loc[(base.shift(1) <= close.shift(1)) & (base > close)] = -1

    # carry the last non-zero signal forward (vectorised form of the old loop)
    signal = signal.where(signal != 0).ffill().fillna(0).astype(int)

    # reset the first n2 signals of every day to 0 to avoid the interday effect;
    # a boolean mask is used because chained assignment is not written back
    # under pandas Copy-on-Write
    bar_of_day = signal.groupby(signal.index.date).cumcount()
    signal.loc[bar_of_day < n2] = 0

    signal = signal.resample('1min').asfreq().ffill()
    return signal.between_time('9:00','13:25')

def Ichimuku_ver1b(close,open,high,low,size):
    """Simple Ichimoku base-line crossover on 5-minute bars (window ``n2 = 2``).

    ``close``/``high``/``low`` are resampled to 5-minute bars (last/max/min,
    forward filled, restricted to 09:00-13:25).  ``base`` is the mean of the
    rolling ``n2``-bar highest high and lowest low.

    * ``+1`` where the previous base is >= the previous close and the current
      base is below the current close,
    * ``-1`` where the previous base is <= the previous close and the current
      base is above the current close,
    * otherwise the last non-zero signal is carried forward.

    The first ``n2`` bars of every day are then forced to 0 and the result is
    forward filled onto a 1-minute grid.

    Parameters
    ----------
    close, high, low : pandas.Series
        Series indexed by a ``DatetimeIndex``.
    open, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} on a 1-minute index within 09:00-13:25.
    """
    n2 = 2
    close = close.resample('5min').last().ffill().between_time('9:00','13:25')
    high = high.resample('5min').max().ffill().between_time('9:00','13:25')
    low = low.resample('5min').min().ffill().between_time('9:00','13:25')
    base = (high.rolling(n2).max() + low.rolling(n2).min())/2

    signal = pd.Series(0, index=close.index)
    signal.loc[(base.shift(1) >= close.shift(1)) & (base < close)] = 1
    signal.loc[(base.shift(1) <= close.shift(1)) & (base > close)] = -1

    # carry the last non-zero signal forward (vectorised form of the old loop)
    signal = signal.where(signal != 0).ffill().fillna(0).astype(int)

    # reset the first n2 signals of every day to 0 to avoid the interday effect;
    # a boolean mask is used because chained assignment is not written back
    # under pandas Copy-on-Write
    bar_of_day = signal.groupby(signal.index.date).cumcount()
    signal.loc[bar_of_day < n2] = 0

    signal = signal.resample('1min').asfreq().ffill()
    return signal.between_time('9:00','13:25')

def Ichimuku_ver2(close,open,high,low,size):
    """Ichimoku base-line crossover filtered by the cloud ("strong version").

    ``close``/``high``/``low`` are resampled to 5-minute bars (last/max/min,
    forward filled, restricted to 09:00-13:25).  With windows ``n1 = 2``,
    ``n2 = 4`` and ``n3 = 5``:

    * ``conversion`` / ``base`` / ``leadingSpanB`` are the means of the rolling
      highest high and lowest low over ``n1`` / ``n2`` / ``n3`` bars,
    * ``leadingSpanA`` is the mean of ``conversion`` and ``base``,
    * the cloud bounds are the element-wise max/min of the two leading spans,
      shifted forward by ``n2`` bars.

    ``+1`` requires the base line to cross below the close while the previous
    close is above the upper cloud bound; ``-1`` requires the base line to
    cross above the close while the previous close is below the lower bound.
    Otherwise the last non-zero signal is carried forward.  The first
    ``n2 + n3`` bars of every day are then forced to 0 and the result is
    forward filled onto a 1-minute grid.

    Parameters
    ----------
    close, high, low : pandas.Series
        Series indexed by a ``DatetimeIndex``.
    open, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} on a 1-minute index within 09:00-13:25.
    """
    n1 = 2
    n2 = 4
    n3 = 5

    close = close.resample('5min').last().ffill().between_time('9:00','13:25')
    high = high.resample('5min').max().ffill().between_time('9:00','13:25')
    low = low.resample('5min').min().ffill().between_time('9:00','13:25')

    conversion = (high.rolling(n1).max() + low.rolling(n1).min())/2
    base = (high.rolling(n2).max() + low.rolling(n2).min())/2
    leadingSpanA = (conversion + base) / 2
    leadingSpanB = (high.rolling(n3).max() + low.rolling(n3).min())/2
    kumo_upper = np.maximum(leadingSpanA, leadingSpanB).shift(n2)
    kumo_lower = np.minimum(leadingSpanA, leadingSpanB).shift(n2)

    prev_close = close.shift(1)
    signal = pd.Series(0, index=close.index)
    signal.loc[(base.shift(1) >= prev_close) & (base < close) & (prev_close > kumo_upper)] = 1
    signal.loc[(base.shift(1) <= prev_close) & (base > close) & (prev_close < kumo_lower)] = -1

    # carry the last non-zero signal forward (vectorised form of the old loop)
    signal = signal.where(signal != 0).ffill().fillna(0).astype(int)

    # reset the first n2+n3 signals of every day to 0 to avoid the interday
    # effect; a boolean mask is used because chained assignment is not written
    # back under pandas Copy-on-Write
    bar_of_day = signal.groupby(signal.index.date).cumcount()
    signal.loc[bar_of_day < (n2 + n3)] = 0

    signal = signal.resample('1min').asfreq().ffill()
    return signal.between_time('9:00','13:25')

def MACD_1(close,open,high,low,size):
    """MACD-style signal from two per-day EMAs of the close (spans 4 and 8).

    ``macd`` is the short EMA minus the long EMA, each restarted on every
    calendar day.  ``SignalLine`` is an EMA (span ``n = 3``) of ``macd``
    computed over the whole series, and ``macds = macd - SignalLine``.

    * ``+1`` where the previous ``macd`` is <= ``macds`` and the current
      ``macd`` is above ``macds``,
    * ``-1`` where the previous ``macd`` is >= ``macds`` and the current
      ``macd`` is below ``macds``,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    NOTE(review): the crossover compares ``macd`` with the histogram
    ``macds`` rather than with ``SignalLine``; this is kept exactly as
    written -- confirm it is intended.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    short=4
    long=8
    n=3
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    # transform() keeps the original index on every pandas version, whereas
    # groupby.apply() prepends the group key on pandas >= 2
    short_window = da.groupby(day).transform(
        lambda col: col.ewm(span=short//freq, min_periods=short//freq, adjust=False).mean())
    long_window = da.groupby(day).transform(
        lambda col: col.ewm(span=long//freq, min_periods=long//freq, adjust=False).mean())
    macd = short_window - long_window
    # integer division keeps min_periods an int, consistent with the EMAs above
    SignalLine = macd.ewm(span=n//freq, min_periods=n//freq, adjust=False).mean()
    macds = macd - SignalLine

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(macd.shift(1) <= macds) & (macd > macds)] = 1
    signal.loc[(macd.shift(1) >= macds) & (macd < macds)] = -1

    # hold the latest signal within each day only, then default to 0
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def acceleration_1(close,open,high,low,size):
    """Sign-change signal on the acceleration of the close (lag ``n = 3``).

    ``mom`` is the close minus the close ``n`` bars earlier on the same
    calendar day, and ``acc`` is the one-bar change of ``mom``.

    * ``+1`` where ``acc`` goes from <= 0 to > 0,
    * ``-1`` where ``acc`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    n=3
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    mom = da - da.groupby(day).shift(n//freq)
    acc = mom - mom.shift(1)

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(acc.shift(1) <= 0) & (acc > 0)] = 1
    signal.loc[(acc.shift(1) >= 0) & (acc < 0)] = -1

    # GroupBy.ffill() replaces groupby.fillna(method='ffill'), which newer
    # pandas releases no longer provide
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def acceleration_2(close,open,high,low,size):
    """Sign-change signal on the acceleration of the close (lag ``n = 2``).

    ``mom`` is the close minus the close ``n`` bars earlier on the same
    calendar day, and ``acc`` is the one-bar change of ``mom``.

    * ``+1`` where ``acc`` goes from <= 0 to > 0,
    * ``-1`` where ``acc`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    n=2
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    mom = da - da.groupby(day).shift(n//freq)
    acc = mom - mom.shift(1)

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(acc.shift(1) <= 0) & (acc > 0)] = 1
    signal.loc[(acc.shift(1) >= 0) & (acc < 0)] = -1

    # GroupBy.ffill() replaces groupby.fillna(method='ffill'), which newer
    # pandas releases no longer provide
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def acceleration_3(close,open,high,low,size):
    """Sign-change signal on the acceleration of the close (lag ``n = 5``).

    ``mom`` is the close minus the close ``n`` bars earlier on the same
    calendar day, and ``acc`` is the one-bar change of ``mom``.

    * ``+1`` where ``acc`` goes from <= 0 to > 0,
    * ``-1`` where ``acc`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    n=5
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    mom = da - da.groupby(day).shift(n//freq)
    acc = mom - mom.shift(1)

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(acc.shift(1) <= 0) & (acc > 0)] = 1
    signal.loc[(acc.shift(1) >= 0) & (acc < 0)] = -1

    # GroupBy.ffill() replaces groupby.fillna(method='ffill'), which newer
    # pandas releases no longer provide
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def acceleration_4(close,open,high,low,size):
    """Sign-change signal on the acceleration of the close (lag ``n = 4``).

    ``mom`` is the close minus the close ``n`` bars earlier on the same
    calendar day, and ``acc`` is the one-bar change of ``mom``.

    * ``+1`` where ``acc`` goes from <= 0 to > 0,
    * ``-1`` where ``acc`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    n=4
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    mom = da - da.groupby(day).shift(n//freq)
    acc = mom - mom.shift(1)

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(acc.shift(1) <= 0) & (acc > 0)] = 1
    signal.loc[(acc.shift(1) >= 0) & (acc < 0)] = -1

    # GroupBy.ffill() replaces groupby.fillna(method='ffill'), which newer
    # pandas releases no longer provide
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def EMA_1(close,open,high,low,size):
    """Crossover signal of two per-day EMAs of the close (spans 5 and 10).

    ``dif`` is the short EMA minus the long EMA, each restarted on every
    calendar day and each undefined until ``span`` observations exist.

    * ``+1`` where ``dif`` goes from <= 0 to > 0,
    * ``-1`` where ``dif`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    short=5
    long=10
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    # transform() keeps the original index on every pandas version, whereas
    # groupby.apply() prepends the group key on pandas >= 2
    short_window = da.groupby(day).transform(
        lambda col: col.ewm(span=short//freq, min_periods=short//freq, adjust=False).mean())
    long_window = da.groupby(day).transform(
        lambda col: col.ewm(span=long//freq, min_periods=long//freq, adjust=False).mean())
    dif = short_window - long_window

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(dif.shift(1) <= 0) & (dif > 0)] = 1
    signal.loc[(dif.shift(1) >= 0) & (dif < 0)] = -1

    # hold the latest signal within each day only, then default to 0
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def EMA_2(close,open,high,low,size):
    """Crossover signal of two per-day EMAs of the close (spans 4 and 8).

    ``dif`` is the short EMA minus the long EMA, each restarted on every
    calendar day and each undefined until ``span`` observations exist.

    * ``+1`` where ``dif`` goes from <= 0 to > 0,
    * ``-1`` where ``dif`` goes from >= 0 to < 0,
    * otherwise the last signal of the same day is carried forward; bars
      before the first signal of a day are 0.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a monotonic ``DatetimeIndex``.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (Series name ``0``).
    """
    freq = 1
    short=4
    long=8
    da = close[close.index.minute % freq == 0]
    day = da.index.date

    # transform() keeps the original index on every pandas version, whereas
    # groupby.apply() prepends the group key on pandas >= 2
    short_window = da.groupby(day).transform(
        lambda col: col.ewm(span=short//freq, min_periods=short//freq, adjust=False).mean())
    long_window = da.groupby(day).transform(
        lambda col: col.ewm(span=long//freq, min_periods=long//freq, adjust=False).mean())
    dif = short_window - long_window

    signal = pd.Series(np.nan, index=da.index)
    signal.loc[(dif.shift(1) <= 0) & (dif > 0)] = 1
    signal.loc[(dif.shift(1) >= 0) & (dif < 0)] = -1

    # hold the latest signal within each day only, then default to 0
    signal = signal.groupby(day).ffill().fillna(0)
    # the historical output Series was named 0; kept for backward compatibility
    return signal.reindex(close.index, method='ffill').rename(0)

def chaikin (close,open,high,low,size):
    """Sign of a fast-minus-slow EWM of the close's position in the bar range.

    ``adl = (2*close - low - high) / (high - low)`` is smoothed with two
    exponentially weighted means (spans ``n1 = 3`` and ``n2 = 23``); the
    returned signal is the sign of their difference.

    Bars where ``high == low`` have an undefined ratio and are treated as
    missing.  Previously such a bar could yield ``inf`` when ``close``
    differed from ``high``; the infinite value stays in both EWMs, so every
    later output became NaN.

    Parameters
    ----------
    close, high, low : pandas.Series
        Aligned price series.
    open, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        ``-1``, ``0`` or ``1``; NaN until the slow EWM has ``n2 - 1``
        observations.
    """
    # Set the parameters
    n1 = 3
    n2 = 23

    # Calculate the Chaikin Index
    bar_range = high - low
    bar_range = bar_range.mask(bar_range == 0)   # zero range -> NaN, not inf
    adl = (2*close-low-high)/bar_range
    Chaikin = adl.ewm(span = n1, min_periods = n1-1).mean() - adl.ewm(span = n2, min_periods = n2-1).mean()

    # Calculate the signal
    signal = np.sign(Chaikin)

    return signal

def money_flow (close,open,high,low,size):
    """Band-crossing signal on a volume-based money-flow index.

    The typical price ``(high + low + close) / 3`` is differenced; ``size``
    is summed over a rolling 100-row window separately for rising and falling
    rows, and ``MFI = 100 - 100 / (1 + rising_sum / falling_sum)``.

    Returns ``+1`` where the index moves from below 1 to above 1, ``-1``
    where it moves from above 99 to below 99, and ``0`` elsewhere.
    ``open`` is not used.
    """
    # Parameters
    window, min_obs = 100, 1
    upper_band, lower_band = 99, 1

    # Direction of the typical-price move (NaN counts as "no move")
    typical = (high + low + close) / 3
    delta = typical.diff()
    rising = (delta > 0).astype(int)
    falling = (delta < 0).astype(int)

    # Rolling size totals on rising / falling rows
    rising_sum = (rising * size).rolling(window=window, min_periods=min_obs).sum()
    falling_sum = (falling * size).rolling(window=window, min_periods=min_obs).sum()
    mfi = 100 - 100 / (1 + (rising_sum / falling_sum))

    # Band crossings
    previous = mfi.shift(1)
    leaves_lower = (previous < lower_band) & (mfi > lower_band)
    leaves_upper = (previous > upper_band) & (mfi < upper_band)
    return leaves_lower * 1 + leaves_upper * (-1)

def nvi (close,open,high,low,size):
    """Buy signal from a negative-volume index crossing above its EWM.

    The index starts at ``initial_nvi`` and is multiplied by
    ``1 + close return`` on rows where ``size`` is lower than on the previous
    row; on all other rows it is left unchanged (a multiplier of exactly 0 is
    also replaced by 1).  The signal is ``1`` where the previous index value
    is below the current EWM (span ``n``) and the current value is above it,
    else ``0``.

    Parameters
    ----------
    close, size : pandas.Series
        Aligned price and size series.
    open, high, low
        Not used by this function.

    Returns
    -------
    pandas.Series
        ``0``/``1`` values aligned to ``close.index`` (empty for empty input).
    """
    # Set the parameters and package
    initial_nvi = 1000
    n = 88751

    if len(close) == 0:
        return pd.Series(index=close.index, dtype=int)

    # Calculate the Negative Volume Index
    size_fell = (size.diff() < 0).astype(int)
    prev_close = close.shift(1)
    multi = (1 + (close - prev_close) / prev_close) * size_fell
    multipler = multi.where(multi != 0, 1.0)
    # .iloc makes the positional write explicit; integer keys on a
    # DatetimeIndex are deprecated as positions
    multipler.iloc[0] = initial_nvi
    nvi_line = multipler.cumprod()

    # Calculate the signal
    ema = nvi_line.ewm(span = n, min_periods = 1).mean()
    signal = ((nvi_line.shift(1) < ema) & (nvi_line > ema)) * 1

    return signal

def pvi (close,open,high,low,size):
    """Crossover signal of a positive-volume index against its EWM.

    The index starts at ``initial_pvi`` and is multiplied by
    ``1 + close return`` on rows where ``size`` is higher than on the
    previous row; on all other rows it is left unchanged (a multiplier of
    exactly 0 is also replaced by 1).

    * ``+1`` where the previous index value is below the current EWM
      (span ``n``) and the current value is above it,
    * ``-1`` where the previous value is above the EWM and the current value
      is below it,
    * ``0`` elsewhere.

    Parameters
    ----------
    close, size : pandas.Series
        Aligned price and size series.
    open, high, low
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index`` (empty for empty
        input).
    """
    # Set the parameters and package
    initial_pvi = 1000
    n=168000

    if len(close) == 0:
        return pd.Series(index=close.index, dtype=int)

    # Calculate the Positive Volume Index
    size_rose = (size.diff() > 0).astype(int)
    prev_close = close.shift(1)
    multi = (1 + (close - prev_close) / prev_close) * size_rose
    multipler = multi.where(multi != 0, 1.0)
    # .iloc makes the positional write explicit; integer keys on a
    # DatetimeIndex are deprecated as positions
    multipler.iloc[0] = initial_pvi
    pvi_line = multipler.cumprod()

    # Calculate the signal
    ema = pvi_line.ewm(span = n, min_periods = 1).mean()
    previous = pvi_line.shift(1)
    signal = ((previous < ema) & (pvi_line > ema)) * 1 + ((previous > ema) & (pvi_line < ema)) * (-1)

    return signal

def fast_stochastic_1(close, open, high, low, size):
    """Fast stochastic %K / %D crossover computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, %K is ``(close - lowest low) / (highest high - lowest low)`` using
    rolling windows of 8606 (high) and 2781 (low) rows, and %D is the rolling
    mean of %K over 424 rows; all windows use ``min_periods=1``.

    Returns ``+1`` where the previous %K is below %D and the current %K is
    above it, ``-1`` for the opposite crossing, ``0`` elsewhere.  The inputs
    must be named ``high``, ``low`` and ``close``; ``open`` and ``size`` are
    not used.
    """
    high_window, low_window, d_window = 8606, 2781, 424

    def _day_labels(series):
        # date part of every timestamp, used as the grouping column
        return series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)

    def _with_day(series):
        return pd.concat([_day_labels(series), series], axis=1)

    highest = _with_day(high).groupby('date').high.rolling(window=high_window, min_periods=1).max()
    lowest = _with_day(low).groupby('date').low.rolling(window=low_window, min_periods=1).min()
    # the grouped results carry a (date, timestamp) index; put the plain one back
    highest.index = close.index
    lowest.index = close.index
    fast_k = (close - lowest) / (highest - lowest)

    k_frame = _with_day(fast_k)
    k_frame.columns = ['date', 'close']
    k_by_day = k_frame.groupby('date')

    prev_k = k_by_day['close'].shift(1)
    prev_k.index = close.index
    fast_d = k_by_day.close.rolling(window=d_window, min_periods=1).mean()
    fast_d.index = close.index

    crossed_up = (prev_k < fast_d) & (fast_k > fast_d)
    crossed_down = (prev_k > fast_d) & (fast_k < fast_d)
    return crossed_up * (1) + crossed_down * (-1)


def fast_stochastic_2(close, open, high, low, size):
    """Fast stochastic %K / %D crossover computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, %K is ``(close - lowest low) / (highest high - lowest low)`` using
    rolling windows of 8600 (high) and 2781 (low) rows, and %D is the rolling
    mean of %K over 424 rows; all windows use ``min_periods=1``.

    Returns ``+1`` where the previous %K is below %D and the current %K is
    above it, ``-1`` for the opposite crossing, ``0`` elsewhere.  The inputs
    must be named ``high``, ``low`` and ``close``; ``open`` and ``size`` are
    not used.
    """
    high_window, low_window, d_window = 8600, 2781, 424

    def _day_labels(series):
        # date part of every timestamp, used as the grouping column
        return series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)

    def _with_day(series):
        return pd.concat([_day_labels(series), series], axis=1)

    highest = _with_day(high).groupby('date').high.rolling(window=high_window, min_periods=1).max()
    lowest = _with_day(low).groupby('date').low.rolling(window=low_window, min_periods=1).min()
    # the grouped results carry a (date, timestamp) index; put the plain one back
    highest.index = close.index
    lowest.index = close.index
    fast_k = (close - lowest) / (highest - lowest)

    k_frame = _with_day(fast_k)
    k_frame.columns = ['date', 'close']
    k_by_day = k_frame.groupby('date')

    prev_k = k_by_day['close'].shift(1)
    prev_k.index = close.index
    fast_d = k_by_day.close.rolling(window=d_window, min_periods=1).mean()
    fast_d.index = close.index

    crossed_up = (prev_k < fast_d) & (fast_k > fast_d)
    crossed_down = (prev_k > fast_d) & (fast_k < fast_d)
    return crossed_up * (1) + crossed_down * (-1)


def fast_stochastic_3(close, open, high, low, size):
    """Fast stochastic %K / %D crossover computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, %K is ``(close - lowest low) / (highest high - lowest low)`` using
    rolling windows of 8606 (high) and 2780 (low) rows, and %D is the rolling
    mean of %K over 424 rows; all windows use ``min_periods=1``.

    Returns ``+1`` where the previous %K is below %D and the current %K is
    above it, ``-1`` for the opposite crossing, ``0`` elsewhere.  The inputs
    must be named ``high``, ``low`` and ``close``; ``open`` and ``size`` are
    not used.
    """
    high_window, low_window, d_window = 8606, 2780, 424

    def _day_labels(series):
        # date part of every timestamp, used as the grouping column
        return series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)

    def _with_day(series):
        return pd.concat([_day_labels(series), series], axis=1)

    highest = _with_day(high).groupby('date').high.rolling(window=high_window, min_periods=1).max()
    lowest = _with_day(low).groupby('date').low.rolling(window=low_window, min_periods=1).min()
    # the grouped results carry a (date, timestamp) index; put the plain one back
    highest.index = close.index
    lowest.index = close.index
    fast_k = (close - lowest) / (highest - lowest)

    k_frame = _with_day(fast_k)
    k_frame.columns = ['date', 'close']
    k_by_day = k_frame.groupby('date')

    prev_k = k_by_day['close'].shift(1)
    prev_k.index = close.index
    fast_d = k_by_day.close.rolling(window=d_window, min_periods=1).mean()
    fast_d.index = close.index

    crossed_up = (prev_k < fast_d) & (fast_k > fast_d)
    crossed_down = (prev_k > fast_d) & (fast_k < fast_d)
    return crossed_up * (1) + crossed_down * (-1)


def fast_stochastic_4(close, open, high, low, size):
    """Fast stochastic %K / %D crossover computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, %K is ``(close - lowest low) / (highest high - lowest low)`` using
    rolling windows of 8600 (high) and 2780 (low) rows, and %D is the rolling
    mean of %K over 424 rows; all windows use ``min_periods=1``.

    Returns ``+1`` where the previous %K is below %D and the current %K is
    above it, ``-1`` for the opposite crossing, ``0`` elsewhere.  The inputs
    must be named ``high``, ``low`` and ``close``; ``open`` and ``size`` are
    not used.
    """
    high_window, low_window, d_window = 8600, 2780, 424

    def _day_labels(series):
        # date part of every timestamp, used as the grouping column
        return series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)

    def _with_day(series):
        return pd.concat([_day_labels(series), series], axis=1)

    highest = _with_day(high).groupby('date').high.rolling(window=high_window, min_periods=1).max()
    lowest = _with_day(low).groupby('date').low.rolling(window=low_window, min_periods=1).min()
    # the grouped results carry a (date, timestamp) index; put the plain one back
    highest.index = close.index
    lowest.index = close.index
    fast_k = (close - lowest) / (highest - lowest)

    k_frame = _with_day(fast_k)
    k_frame.columns = ['date', 'close']
    k_by_day = k_frame.groupby('date')

    prev_k = k_by_day['close'].shift(1)
    prev_k.index = close.index
    fast_d = k_by_day.close.rolling(window=d_window, min_periods=1).mean()
    fast_d.index = close.index

    crossed_up = (prev_k < fast_d) & (fast_k > fast_d)
    crossed_down = (prev_k > fast_d) & (fast_k < fast_d)
    return crossed_up * (1) + crossed_down * (-1)


def slow_stochastic(close, open, high, low, size):
    """Slow stochastic crossover computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day the raw %K is ``(close - lowest low) / (highest high - lowest low)``
    with rolling windows of 2837 (high) and 3483 (low) rows; the slow %K is
    its rolling mean over 4547 rows and %D the rolling mean of the slow %K
    over 9497 rows (all with ``min_periods=1``).

    Returns ``+1`` where the previous raw %K is below %D and the current slow
    %K is above it, ``-1`` for the opposite case, ``0`` elsewhere.

    NOTE(review): the "previous" value is the lag of the RAW %K, not of the
    smoothed one -- confirm this is intended.

    The inputs must be named ``high``, ``low`` and ``close``; ``open`` and
    ``size`` are not used.
    """
    high_window, low_window = 2837, 3483
    k_window, d_window = 4547, 9497

    def _with_day(series):
        # frame of [date label, values]; the label is the timestamp's date part
        labels = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([labels, series], axis=1)

    highest = _with_day(high).groupby('date').high.rolling(window=high_window, min_periods=1).max()
    lowest = _with_day(low).groupby('date').low.rolling(window=low_window, min_periods=1).min()
    # the grouped results carry a (date, timestamp) index; put the plain one back
    highest.index = close.index
    lowest.index = close.index
    raw_k = (close - lowest) / (highest - lowest)

    raw_frame = _with_day(raw_k)
    raw_frame.columns = ['date', 'close']
    raw_by_day = raw_frame.groupby('date')
    prev_raw_k = raw_by_day['close'].shift(1)
    slow_k = raw_by_day.close.rolling(window=k_window, min_periods=1).mean()
    prev_raw_k.index = close.index
    slow_k.index = close.index

    slow_d = _with_day(slow_k).groupby('date').close.rolling(window=d_window, min_periods=1).mean()
    slow_d.index = close.index

    crossed_up = (prev_raw_k < slow_d) & (slow_k > slow_d)
    crossed_down = (prev_raw_k > slow_d) & (slow_k < slow_d)
    return crossed_up * (1) + crossed_down * (-1)


def RSI_1(close, open, high, low, size):
    """RSI-style threshold-crossing signal computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, up-moves and down-moves of the close are counted over rolling
    windows (``para1`` / ``para2`` rows), the down count is negated, both
    counts are averaged (``para3`` / ``para4`` rows) and
    ``rsi = 100 - 100 / (1 + up_sma / down_sma)``.

    Returns ``+1`` where the previous ``rsi`` is <= 20 and the current one is
    above 20, ``-1`` where the previous ``rsi`` is >= 80 and the current one
    is below 80, ``0`` elsewhere.

    The "previous" value is now the lagged ``rsi``; it used to be built from
    ``down_count`` by mistake, so the sell leg could never fire.

    NOTE(review): ``down_sma`` is <= 0 because the down count is negated, so
    ``rsi`` does not stay inside 0-100 -- confirm the sign is intended.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a ``DatetimeIndex``, ordered so that rows of one
        day are contiguous.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index``.
    """
    para1 = 8469
    para2 = 341
    para3 = 6013
    para4 = 459

    def _by_day(series):
        # group a series by the date part of its timestamps
        day = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([day, series.rename('close')], axis=1).groupby('date')['close']

    def _on_close_index(series):
        # grouped rolling results carry a (date, timestamp) index; put the plain one back
        series.index = close.index
        return series

    difference = _by_day(close).diff()
    up = (difference > 0).astype(int)
    down = (difference < 0).astype(int)

    up_count = _on_close_index(_by_day(up).rolling(window=para1, min_periods=1).sum())
    down_count = _on_close_index(_by_day(down).rolling(window=para2, min_periods=1).sum()) * (-1)

    up_sma = _on_close_index(_by_day(up_count).rolling(window=para3, min_periods=1).mean())
    down_sma = _on_close_index(_by_day(down_count).rolling(window=para4, min_periods=1).mean())

    rsi_value = 100 - 100 / (1 + up_sma / down_sma)
    # previous bar's RSI within the same day
    rsi_value_1 = _by_day(rsi_value).shift(1)

    signal = ((rsi_value_1 <= 20) & (rsi_value > 20)) * (1) + ((rsi_value_1 >= 80) & (rsi_value < 80)) * (-1)
    return signal


def RSI_2(close, open, high, low, size):
    """RSI-style threshold-crossing signal computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, up-moves and down-moves of the close are counted over rolling
    windows (``para1`` / ``para2`` rows), the down count is negated, both
    counts are averaged (``para3`` / ``para4`` rows) and
    ``rsi = 100 - 100 / (1 + up_sma / down_sma)``.

    Returns ``+1`` where the previous ``rsi`` is <= 10 and the current one is
    above 10, ``-1`` where the previous ``rsi`` is >= 90 and the current one
    is below 90, ``0`` elsewhere.

    The "previous" value is now the lagged ``rsi``; it used to be built from
    ``down_count`` by mistake, so the sell leg could never fire.

    NOTE(review): ``down_sma`` is <= 0 because the down count is negated, so
    ``rsi`` does not stay inside 0-100 -- confirm the sign is intended.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a ``DatetimeIndex``, ordered so that rows of one
        day are contiguous.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index``.
    """
    para1 = 8008
    para2 = 5933
    para3 = 4670
    para4 = 997

    def _by_day(series):
        # group a series by the date part of its timestamps
        day = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([day, series.rename('close')], axis=1).groupby('date')['close']

    def _on_close_index(series):
        # grouped rolling results carry a (date, timestamp) index; put the plain one back
        series.index = close.index
        return series

    difference = _by_day(close).diff()
    up = (difference > 0).astype(int)
    down = (difference < 0).astype(int)

    up_count = _on_close_index(_by_day(up).rolling(window=para1, min_periods=1).sum())
    down_count = _on_close_index(_by_day(down).rolling(window=para2, min_periods=1).sum()) * (-1)

    up_sma = _on_close_index(_by_day(up_count).rolling(window=para3, min_periods=1).mean())
    down_sma = _on_close_index(_by_day(down_count).rolling(window=para4, min_periods=1).mean())

    rsi_value = 100 - 100 / (1 + up_sma / down_sma)
    # previous bar's RSI within the same day
    rsi_value_1 = _by_day(rsi_value).shift(1)

    signal = ((rsi_value_1 <= 10) & (rsi_value > 10)) * (1) + ((rsi_value_1 >= 90) & (rsi_value < 90)) * (-1)
    return signal


def RSI_3(close, open, high, low, size):
    """RSI-style threshold-crossing signal computed separately for each day.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, up-moves and down-moves of the close are counted over rolling
    windows (``para1`` / ``para2`` rows), the down count is negated, both
    counts are averaged (``para3`` / ``para4`` rows) and
    ``rsi = 100 - 100 / (1 + up_sma / down_sma)``.

    Returns ``+1`` where the previous ``rsi`` is <= 15 and the current one is
    above 15, ``-1`` where the previous ``rsi`` is >= 85 and the current one
    is below 85, ``0`` elsewhere.

    The "previous" value is now the lagged ``rsi``; it used to be built from
    ``down_count`` by mistake, so the sell leg could never fire.

    NOTE(review): ``down_sma`` is <= 0 because the down count is negated, so
    ``rsi`` does not stay inside 0-100 -- confirm the sign is intended.

    Parameters
    ----------
    close : pandas.Series
        Prices indexed by a ``DatetimeIndex``, ordered so that rows of one
        day are contiguous.
    open, high, low, size
        Not used by this function.

    Returns
    -------
    pandas.Series
        Values in {-1, 0, 1} aligned to ``close.index``.
    """
    para1 = 2816
    para2 = 1204
    para3 = 2314
    para4 = 820

    def _by_day(series):
        # group a series by the date part of its timestamps
        day = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([day, series.rename('close')], axis=1).groupby('date')['close']

    def _on_close_index(series):
        # grouped rolling results carry a (date, timestamp) index; put the plain one back
        series.index = close.index
        return series

    difference = _by_day(close).diff()
    up = (difference > 0).astype(int)
    down = (difference < 0).astype(int)

    up_count = _on_close_index(_by_day(up).rolling(window=para1, min_periods=1).sum())
    down_count = _on_close_index(_by_day(down).rolling(window=para2, min_periods=1).sum()) * (-1)

    up_sma = _on_close_index(_by_day(up_count).rolling(window=para3, min_periods=1).mean())
    down_sma = _on_close_index(_by_day(down_count).rolling(window=para4, min_periods=1).mean())

    rsi_value = 100 - 100 / (1 + up_sma / down_sma)
    # previous bar's RSI within the same day
    rsi_value_1 = _by_day(rsi_value).shift(1)

    signal = ((rsi_value_1 <= 15) & (rsi_value > 15)) * (1) + ((rsi_value_1 >= 85) & (rsi_value < 85)) * (-1)
    return signal


def MARSI_1(close, open, high, low, size):
    """Threshold-crossing signal on a moving average of an RSI-style ratio.

    Rows are grouped by the ``YYYY-MM-DD`` prefix of their timestamp.  Within
    a day, up-moves and down-moves of the close are counted over rolling
    windows (6482 / 9145 rows), the down count is negated, the counts are
    averaged (348 / 2701 rows), ``rsi = 100 - 100 / (1 + up / down)`` is
    formed and finally averaged over 4672 rows (``marsi``).  All windows use
    ``min_periods=1``.

    Returns ``+1`` where the previous ``marsi`` is <= 30 and the current one
    is above 30, ``-1`` where the previous one is >= 70 and the current one
    is below 70, ``0`` elsewhere.  ``close`` must be named ``close``; the
    other arguments are not used.
    """
    up_window, down_window = 6482, 9145
    up_avg_window, down_avg_window = 348, 2701
    rsi_avg_window = 4672

    def _daily(series):
        # frame of [date label, values] grouped by the label
        labels = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([labels, series], axis=1).groupby('date')

    def _restore_index(series):
        # grouped results carry a (date, timestamp) index; put the plain one back
        series.index = close.index
        return series

    change = _daily(close).close.diff()
    rises = (change > 0).astype(int)
    falls = (change < 0).astype(int)

    rise_total = _restore_index(
        _daily(rises).close.rolling(window=up_window, min_periods=1).sum())
    fall_total = _restore_index(
        _daily(falls).close.rolling(window=down_window, min_periods=1).sum()) * (-1)

    rise_avg = _daily(rise_total).close.rolling(window=up_avg_window, min_periods=1).mean()
    fall_avg = _daily(fall_total).close.rolling(window=down_avg_window, min_periods=1).mean()
    rise_avg.index = close.index
    fall_avg.index = close.index

    rsi = 100 - 100 / (1 + rise_avg / fall_avg)
    marsi = _restore_index(
        _daily(rsi).close.rolling(window=rsi_avg_window, min_periods=1).mean())
    prev_marsi = _restore_index(_daily(marsi)['close'].shift(1))

    leaves_low = (prev_marsi <= 30) & (marsi > 30)
    leaves_high = (prev_marsi >= 70) & (marsi < 70)
    return leaves_low * (1) + leaves_high * (-1)


def MARSI_2(close, open, high, low, size):
    """Intraday "moving average of RSI" crossover signal using 20/80 bands.

    Every step is computed separately per calendar day, where the day key is
    the first ten characters of each index label rendered as a string
    (assumes a datetime-like index that prints as ``YYYY-MM-DD ...`` -- TODO
    confirm for other index types).

    Steps:
      1. per-day first difference of ``close``;
      2. rolling count of up moves and (negated) rolling count of down moves;
      3. rolling means of both counts;
      4. ``100 - 100 / (1 + up_mean / down_mean)``;
      5. rolling mean of step 4 (``marsi``) and its one-bar lag within the day;
      6. +1 where ``marsi`` moves from <= 20 to > 20, -1 where it moves from
         >= 80 to < 80, 0 otherwise.

    Parameters
    ----------
    close : pandas.Series
        Price series. It must be named ``'close'`` because the grouped column
        is looked up under that name, and it should be in chronological order
        so that the per-day results line up with ``close.index`` when they are
        re-indexed.
    open, high, low, size :
        Unused; accepted so every strategy shares one call signature.

    Returns
    -------
    pandas.Series
        Integer signal in {-1, 0, 1} indexed like ``close``.

    NOTE(review): the down count is multiplied by -1 before the ratio is
    formed, so ``up_mean / down_mean`` is never positive and step 4 does not
    stay inside the conventional 0-100 RSI range. Confirm this is intended
    before relying on the 20/80 thresholds.
    """
    up_count_window = 5434
    down_count_window = 9886
    up_mean_window = 5987
    down_mean_window = 4685
    rsi_mean_window = 9077

    def by_day(series):
        # Attach the 'YYYY-MM-DD' day key and expose the values grouped by it.
        day_key = series.index.to_series(name='date').astype(str).str.slice(start=0, stop=10)
        return pd.concat([day_key, series], axis=1).groupby('date')['close']

    def on_close_index(series):
        # Grouped rolling results carry a (day, timestamp) index; put the
        # original index back so later steps align row by row.
        series.index = close.index
        return series

    price_step = by_day(close).diff()
    rose = price_step.apply(lambda step: 1 if step > 0 else 0)
    fell = price_step.apply(lambda step: 1 if step < 0 else 0)

    up_count = on_close_index(
        by_day(rose).rolling(window=up_count_window, min_periods=1).sum())
    down_count = on_close_index(
        by_day(fell).rolling(window=down_count_window, min_periods=1).sum())
    down_count = down_count * (-1)

    up_mean = on_close_index(
        by_day(up_count).rolling(window=up_mean_window, min_periods=1).mean())
    down_mean = on_close_index(
        by_day(down_count).rolling(window=down_mean_window, min_periods=1).mean())

    rsi_value = 100 - 100 / (1 + up_mean / down_mean)

    marsi = on_close_index(
        by_day(rsi_value).rolling(window=rsi_mean_window, min_periods=1).mean())
    previous_marsi = on_close_index(by_day(marsi).shift(1))

    crossed_above_lower = (previous_marsi <= 20) & (marsi > 20)
    crossed_below_upper = (previous_marsi >= 80) & (marsi < 80)
    return crossed_above_lower * (1) + crossed_below_upper * (-1)



In [10]:
# --- Universe definitions -------------------------------------------------
# Stock codes are kept as strings so leading zeros (e.g. '0050') survive.
stocks_tw_top50 = [
    '2317', '2330', '2337', '2492', '2344', '2303', '2409', '2891', '2327', '3481',
    '2448', '6153', '2408', '2883', '2888', '2353', '2313', '2371', '2377', '2456',
    '2002', '3026', '2886', '3037', '2481', '2884', '1101', '2345', '3406', '3231',
    '2882', '2885', '1605', '6456', '3443', '2892', '2881', '2376', '2887', '2027',
    '2412', '2049', '2454', '3016', '8163', '3019', '2455', '2367', '2323', '1216',
]
# The top-100 list is the top-50 list followed by fifty further codes.
stocks_tw_top100 = stocks_tw_top50 + [
    '1314', '2308', '0050', '2498', '2356', '1402', '3673', '4938', '2834', '1102',
    '2474', '2478', '1301', '5880', '1326', '2439', '3090', '2324', '2603', '3532',
    '2890', '2382', '6116', '1303', '1312', '2880', '3504', '2375', '2301', '6176',
    '3035', '3661', '5871', '2610', '4958', '2340', '6120', '3059', '9958', '2014',
    '2823', '3034', '6269', '9904', '2347', '2105', '2349', '2618', '3596', '0058',
]

# --- Run configuration ----------------------------------------------------
# Stocks actually processed by this run.
stocks_tw = ['2327']
# Names of the strategy functions to evaluate. Previously listed alongside
# MACD_1: 'MACD_2', 'acceleration_1', 'acceleration_2', 'acceleration_3',
# 'acceleration_4', 'EMA_1', 'EMA_2'.
strategy = ['MACD_1']
# Directory holding the gzipped per-stock tick files (machine-specific path).
stockDataDir = '/Users/kuen/Desktop/CTA/data/'
# Columns read from each tick file: trade fields, then five levels of
# SP/BP prices and SV/BV volumes.
cols = [
    "date", "time", "lastPx", "size", "volume",
    "SP5", "SP4", "SP3", "SP2", "SP1",
    "BP1", "BP2", "BP3", "BP4", "BP5",
    "SV5", "SV4", "SV3", "SV2", "SV1",
    "BV1", "BV2", "BV3", "BV4", "BV5",
]

# --- Containers -----------------------------------------------------------
# Empty per-stock frames, one column per processed stock.
close = pd.DataFrame(columns=stocks_tw)
size = pd.DataFrame(columns=stocks_tw)
volume = pd.DataFrame(columns=stocks_tw)

# Result tables (stock x strategy), filled in by test_all().
ret = pd.DataFrame(None, index=stocks_tw, columns=strategy)
drawdown = pd.DataFrame(None, index=stocks_tw, columns=strategy)
# Max position is one lot. The position on each bar equals that bar's signal,
# so a signal of 0 means flat (an earlier variant that held the previous
# position on a 0 signal is no longer active).
class backtest(object):
    """Mark-to-market backtest of a per-bar signal on one stock.

    Parameters
    ----------
    data : pandas.DataFrame
        Bars on a datetime index with at least the columns ``nexttick_px``
        (price used to execute trades) and ``close`` (price used to value the
        open position). The frame is modified in place: the columns
        ``signal``, ``position``, ``trades``, ``balance`` and ``value`` are
        added or overwritten, and the same object is kept as ``self.data``.
    signal : pandas.Series
        Desired position in lots per bar, aligned to ``data.index``. Missing
        values are treated as 0. The caller's Series is not modified.
    """

    # Bars (HH:MM) on which the position is forced to zero.
    FLAT_MINUTES = ('13:21', '13:22', '13:23', '13:24', '13:25')
    # Multiplier applied to the signal to obtain the position size.
    LOT_SIZE = 1000
    # Proportional cost charged on the absolute traded notional.
    FEE_RATE = 0.00095

    def __init__(self,data,signal):
        # Work on a filled copy so the caller's signal keeps its NaNs.
        signal = signal.fillna(0)
        self.signal = signal
        data['signal'] = signal

        # Force the position flat inside the FLAT_MINUTES window. The very
        # first bar is exempt and always takes its own signal.
        minute_of_day = data.index.strftime('%H:%M')
        forced_flat = minute_of_day.isin(list(self.FLAT_MINUTES)).copy()
        forced_flat[:1] = False
        data['position'] = data['signal'].where(~forced_flat, 0) * self.LOT_SIZE

        # Change in position per bar; the first bar trades from a flat book,
        # so its trade is the full opening position (in position units).
        data['trades'] = data['position'] - data['position'].shift(1, fill_value=0)

        # Cash balance: pay/receive nexttick_px per unit traded, minus fees.
        traded_px = data['nexttick_px']
        cash_flow = -traded_px * data['trades'] - (self.FEE_RATE * traded_px * data['trades']).abs()
        data['balance'] = cash_flow.cumsum()

        # Account value = cash + open position marked at close. Bars with a
        # missing price carry the last known value forward.
        data['value'] = (data['balance'] + data['position'] * data['close']).ffill()
        self.data = data

    def wealth(self):
        """Return the account value on the last bar."""
        return self.data['value'].iloc[-1]

    def maxdrawdown(self):
        """Return the largest drop of ``value`` below its running maximum.

        The result is in the same units as ``value`` and is never negative.
        Leading missing values are ignored; NaN is returned when there is no
        value at all.
        """
        value = self.data['value']
        return (value.cummax() - value).max()

    def pnl_day(self):
        """Return, per calendar day, the last ``value`` minus the first."""
        daily = self.data['value'].resample('D')
        return daily.last() - daily.first()
def test_all():
    """Backtest every configured strategy on every configured stock.

    For each position in ``stocks_tw`` the cleaned bars are loaded with
    ``clean_data``; each name in ``strategy`` is then resolved to the function
    of that name, called with the close/open/high/low/size columns, and the
    resulting signal is run through ``backtest``. Final wealth and maximum
    drawdown are written into the module-level ``ret`` and ``drawdown``
    tables at the matching (stock, strategy) position.

    Strategy names are resolved with ``eval``, so the ``strategy`` list must
    only ever contain trusted, hard-coded names.
    """
    for stock_pos, _ in enumerate(stocks_tw):
        bars = clean_data(stock_pos)
        for strategy_pos, strategy_name in enumerate(strategy):
            build_signal = eval(strategy_name)
            fac = build_signal(bars.close, bars.open, bars.high, bars.low, bars['size'])
            outcome = backtest(bars, fac)
            ret.iloc[stock_pos, strategy_pos] = outcome.wealth()
            drawdown.iloc[stock_pos, strategy_pos] = outcome.maxdrawdown()


In [11]:
test_all()

Processing 2327
Data(First file) for 2327 loaded.
Data(Second file) for 2327 loaded.
