In [1]:
import tushare as ts
import pandas as pd
import time
from statsmodels.tsa.stattools import adfuller

# Setup Tushare API
ts.set_token('756a1f4aba6dd90ced168a81497f46697c8499ccfa76b317daf6e874')
pro = ts.pro_api()

# Fetch all ETF tickers
etf_info = pro.fund_basic(market='E')
tickers = etf_info['ts_code'].tolist()
tickers

['508002.SH',
 '159301.SZ',
 '520830.SH',
 '159579.SZ',
 '508015.SH',
 '159329.SZ',
 '159583.SZ',
 '159800.SZ',
 '588680.SH',
 '588500.SH',
 '520990.SH',
 '520900.SH',
 '520660.SH',
 '180302.SZ',
 '508089.SH',
 '159552.SZ',
 '508086.SH',
 '159303.SZ',
 '159318.SZ',
 '560360.SH',
 '159558.SZ',
 '159322.SZ',
 '159309.SZ',
 '159315.SZ',
 '159578.SZ',
 '159542.SZ',
 '159533.SZ',
 '563090.SH',
 '159300.SZ',
 '159588.SZ',
 '563150.SH',
 '159321.SZ',
 '159597.SZ',
 '159525.SZ',
 '159557.SZ',
 '159528.SZ',
 '562700.SH',
 '588450.SH',
 '563180.SH',
 '510720.SH',
 '159306.SZ',
 '562340.SH',
 '510950.SH',
 '513820.SH',
 '517400.SH',
 '562570.SH',
 '159599.SZ',
 '562260.SH',
 '561760.SH',
 '159310.SZ',
 '159526.SZ',
 '159307.SZ',
 '180602.SZ',
 '588890.SH',
 '562820.SH',
 '513210.SH',
 '513170.SH',
 '560520.SH',
 '159582.SZ',
 '159561.SZ',
 '563280.SH',
 '560690.SH',
 '560850.SH',
 '159553.SZ',
 '562560.SH',
 '159545.SZ',
 '511130.SH',
 '562580.SH',
 '560150.SH',
 '508026.SH',
 '159589.SZ',
 '5080

In [2]:
# Define analysis period
start_date = '20240101'
end_date = '20240601'

# Collect tickers which data can be retrieved
retrieved_tickers = []


In [3]:
# Fetch data in batches to avoid API retrieval rate limit
def fetch_data_in_batches(batch_size=250, delay=40):
    etf_data_dict = {}
    total_tickers = len(tickers)
    for i in range(0, total_tickers, batch_size):
        batch_tickers = tickers[i:i + batch_size]
        for ticker in batch_tickers:
            df = pro.fund_daily(ts_code=ticker, start_date=start_date, end_date=end_date)
            if len(df) > 5:
                retrieved_tickers.append(ticker)
                etf_data_dict[ticker] = df[['trade_date', 'open', 'close']]
        print(f"Fetched batch {i // batch_size + 1}/{(total_tickers // batch_size) + 1}")
        time.sleep(delay)
    return etf_data_dict

etf_dict = fetch_data_in_batches()
etf_dict

Fetched batch 1/9
Fetched batch 2/9
Fetched batch 3/9
Fetched batch 4/9
Fetched batch 5/9
Fetched batch 6/9
Fetched batch 7/9
Fetched batch 8/9
Fetched batch 9/9


{'159525.SZ':   trade_date   open  close
 0   20240531  1.011  1.013
 1   20240530  1.017  1.012
 2   20240529  1.014  1.017
 3   20240528  1.019  1.017
 4   20240527  1.002  1.021
 5   20240524  1.003  1.003
 6   20240523  1.012  1.004
 7   20240522  1.015  1.015
 8   20240521  1.005  1.017
 9   20240520  1.006  1.015,
 '159557.SZ':   trade_date   open  close
 0   20240531  0.919  0.913
 1   20240530  0.913  0.909
 2   20240529  0.931  0.913
 3   20240528  0.937  0.931
 4   20240527  0.911  0.930
 5   20240524  0.940  0.911
 6   20240523  0.960  0.950
 7   20240522  0.965  0.964,
 '159528.SZ':   trade_date   open  close
 0   20240531  0.984  0.987
 1   20240530  0.989  0.988
 2   20240529  0.994  0.992
 3   20240528  0.998  0.992
 4   20240527  0.989  0.999
 5   20240524  0.995  0.989
 6   20240523  1.006  0.997
 7   20240522  1.008  1.010
 8   20240521  1.011  1.008
 9   20240520  1.010  1.014,
 '562700.SH':    trade_date   open  close
 0    20240531  0.942  0.955
 1    20240530  0.9

In [4]:
etf_dict['515750.SH']['close']

0     1.081
1     1.081
2     1.072
3     1.067
4     1.079
      ...  
93    1.066
94    1.089
95    1.106
96    1.123
97    1.139
Name: close, Length: 98, dtype: float64

In [5]:
# Filter datas through autocorrelation
def autocorrelation_filter(threshold):
    high_autocorr = {'Ticker': [], 'Autocorrelation': [], 'Start Time': [], 'End Time': []}
    for ticker in retrieved_tickers:
        data = etf_dict[ticker]
        autocorr = data['close'].autocorr()
        if autocorr >= threshold:
            # Find start & end date for each ETF, which are potentially different due to missing data
            start_time = data['trade_date'][len(data)-1] 
            end_time = data['trade_date'][0]
            
            # Add ETF properties to Dataframe
            high_autocorr['Ticker'].append(ticker)
            high_autocorr['Autocorrelation'].append(autocorr)
            high_autocorr['Start Time'].append(start_time)
            high_autocorr['End Time'].append(end_time)
    return pd.DataFrame(high_autocorr)

autocorr_threshold = 0.95
autocorr_etfs = autocorrelation_filter(autocorr_threshold)

print(len(autocorr_etfs))
autocorr_etfs

629


Unnamed: 0,Ticker,Autocorrelation,Start Time,End Time
0,513210.SH,0.950638,20240422,20240531
1,513170.SH,0.968082,20240419,20240531
2,159545.SZ,0.971167,20240415,20240531
3,508026.SH,0.974976,20240328,20240531
4,159581.SZ,0.955494,20240315,20240531
...,...,...,...,...
624,161903.SZ,0.957555,20240102,20240531
625,161607.SZ,0.965682,20240103,20240531
626,162605.SZ,0.953809,20240102,20240531
627,160505.SZ,0.975800,20240102,20240531


In [6]:
# Filter datas through stationarity
def stationarity_filter(threshold):
    low_stationarity = {'Ticker': [], 'Start Time': [], 'End Time': []}
    for ticker in retrieved_tickers:
        data = etf_dict[ticker]
        adf_result = adfuller(data['close'])
        p_value = adf_result[1]
        if p_value > threshold:
            # Find start & end date for each ETF
            start_time = data['trade_date'][len(data) - 1]
            end_time = data['trade_date'][0]

            # Add ETF properties to Dataframe
            low_stationarity['Ticker'].append(ticker)
            low_stationarity['Start Time'].append(start_time)
            low_stationarity['End Time'].append(end_time)
    return pd.DataFrame(low_stationarity)

significance_level = 0.6
stationarity_etfs = stationarity_filter(significance_level)

print(len(stationarity_etfs))
stationarity_etfs

610


Unnamed: 0,Ticker,Start Time,End Time
0,159557.SZ,20240522,20240531
1,159528.SZ,20240520,20240531
2,562700.SH,20240514,20240531
3,588450.SH,20240522,20240531
4,563180.SH,20240514,20240531
...,...,...,...
605,161903.SZ,20240102,20240531
606,161607.SZ,20240103,20240531
607,162703.SZ,20240102,20240531
608,160505.SZ,20240102,20240531


In [7]:
# Find intersection between previously filtered datas
def intersection_filter(autocorr_df, stationarity_df):
    # Get sets of tickers from both DataFrames
    autocorr_tickers = set(autocorr_df['Ticker'])
    stationarity_tickers = set(stationarity_df['Ticker'])

    # Find the intersection of the two sets
    common_tickers = autocorr_tickers & stationarity_tickers
    return common_tickers

shared_tickers = intersection_filter(autocorr_etfs, stationarity_etfs)
print(len(shared_tickers))
shared_tickers

434


{'159150.SZ',
 '159350.SZ',
 '159508.SZ',
 '159509.SZ',
 '159510.SZ',
 '159511.SZ',
 '159512.SZ',
 '159515.SZ',
 '159518.SZ',
 '159519.SZ',
 '159520.SZ',
 '159523.SZ',
 '159545.SZ',
 '159562.SZ',
 '159568.SZ',
 '159576.SZ',
 '159581.SZ',
 '159601.SZ',
 '159602.SZ',
 '159606.SZ',
 '159611.SZ',
 '159612.SZ',
 '159619.SZ',
 '159621.SZ',
 '159622.SZ',
 '159627.SZ',
 '159630.SZ',
 '159631.SZ',
 '159632.SZ',
 '159635.SZ',
 '159636.SZ',
 '159639.SZ',
 '159640.SZ',
 '159641.SZ',
 '159642.SZ',
 '159643.SZ',
 '159645.SZ',
 '159646.SZ',
 '159649.SZ',
 '159650.SZ',
 '159651.SZ',
 '159652.SZ',
 '159653.SZ',
 '159657.SZ',
 '159660.SZ',
 '159661.SZ',
 '159662.SZ',
 '159670.SZ',
 '159672.SZ',
 '159673.SZ',
 '159686.SZ',
 '159687.SZ',
 '159689.SZ',
 '159690.SZ',
 '159693.SZ',
 '159697.SZ',
 '159699.SZ',
 '159706.SZ',
 '159708.SZ',
 '159709.SZ',
 '159717.SZ',
 '159719.SZ',
 '159723.SZ',
 '159730.SZ',
 '159731.SZ',
 '159735.SZ',
 '159748.SZ',
 '159758.SZ',
 '159788.SZ',
 '159791.SZ',
 '159792.SZ',
 '1597