In [None]:
import random, scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from deap import base, creator, tools, algorithms

In [None]:
"""
Data import
"""
# Raw end-of-day rows for every index; TRADE_DT is left exactly as read
# because a later cell parses it again from this frame.
asset_index = pd.read_csv("data/aidx_eod_prices.csv")

# Minimum number of rows a code must have to be kept in asset_dfs.
MIN_HISTORY_ROWS = 800

# One frame per S_IRDCODE, with TRADE_DT parsed from YYYYMMDD and rows sorted
# by date. Each frame is built with assign()/sort_values() rather than by
# assigning into the group yielded by groupby: writing into those groups is
# the chained-assignment pattern that raises SettingWithCopyWarning.
grouped_asset = asset_index.groupby("S_IRDCODE")
asset_dfs = {
    ird_code: group.assign(
        TRADE_DT=pd.to_datetime(group['TRADE_DT'], format='%Y%m%d')
    ).sort_values(by='TRADE_DT')
    for ird_code, group in grouped_asset
    if len(group) >= MIN_HISTORY_ROWS
}

In [None]:
"""
Parameters
"""

NUM_LIMIT = 100 # number of assets to draw for the pool (the num_limit argument of sample())
CORR_LIMIT = 0.5 # upper bound on pairwise asset correlation (the corr_limit argument of sample())


In [None]:
"""
Asset sampling
"""

def sample(num_limit, asset_dfs, corr_limit, max_attempts=None):
    """Draw a random set of assets whose pairwise correlations stay under a cap.

    Candidate sets of ``num_limit`` codes are drawn with ``random.sample``
    until one is found in which no pair has a Spearman rank correlation of
    ``PCHG`` strictly greater than ``corr_limit``.

    Parameters
    ----------
    num_limit : int
        Number of asset codes to return.
    asset_dfs : dict
        Maps an asset code to a DataFrame with a ``PCHG`` column. When both
        frames of a pair also have a ``TRADE_DT`` column, rows are matched on
        it so that only observations from the same date are correlated;
        otherwise the first ``min(len_a, len_b)`` rows are compared by
        position.
    corr_limit : float
        A pair is rejected when its correlation exceeds this value. A NaN
        correlation never exceeds it, so such a pair is not rejected.
    max_attempts : int, optional
        Maximum number of candidate sets to try. ``None`` (the default)
        keeps retrying until a valid set is found.

    Returns
    -------
    list
        ``num_limit`` distinct keys of ``asset_dfs``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``num_limit`` is larger than the
        number of available assets.
    RuntimeError
        When ``max_attempts`` candidate sets were drawn and all rejected.
    """
    # A bare ``import scipy`` does not guarantee that the ``stats`` submodule
    # is loaded, so it is imported explicitly where it is used.
    from scipy.stats import spearmanr

    codes = list(asset_dfs.keys())
    # Unordered code pair -> correlation. Retries redraw from the same pool,
    # so each pair only has to be computed once.
    corr_cache = {}

    def paired_returns(code_a, code_b):
        """Return the two PCHG series restricted to comparable rows."""
        df_a, df_b = asset_dfs[code_a], asset_dfs[code_b]
        if all('TRADE_DT' in getattr(df, 'columns', ()) for df in (df_a, df_b)):
            # Inner join on the trade date: truncating by position would pair
            # up returns from different days whenever histories start on
            # different dates.
            merged = df_a[['TRADE_DT', 'PCHG']].merge(
                df_b[['TRADE_DT', 'PCHG']], on='TRADE_DT', suffixes=('_a', '_b')
            )
            return merged['PCHG_a'], merged['PCHG_b']
        min_length = min(len(df_a['PCHG']), len(df_b['PCHG']))
        return df_a['PCHG'].iloc[:min_length], df_b['PCHG'].iloc[:min_length]

    def pair_corr(code_a, code_b):
        """Spearman correlation of one pair, computed once and then cached."""
        key = frozenset((code_a, code_b))
        if key not in corr_cache:
            returns_a, returns_b = paired_returns(code_a, code_b)
            if len(returns_a) < 2:
                # Fewer than two shared observations: correlation is undefined.
                corr_cache[key] = float('nan')
            else:
                corr, _ = spearmanr(returns_a, returns_b)
                corr_cache[key] = corr
        return corr_cache[key]

    def is_non_related(index_list):
        """True when no pair in index_list is correlated above corr_limit."""
        return not any(
            pair_corr(index_list[i], index_list[j]) > corr_limit
            for i in range(len(index_list))
            for j in range(i + 1, len(index_list))
        )

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        index_list = random.sample(codes, num_limit)
        if is_non_related(index_list):
            return index_list
        attempts += 1

    raise RuntimeError(
        f"no set of {num_limit} assets with pairwise correlation <= {corr_limit} "
        f"found in {max_attempts} attempts"
    )

In [None]:
"""
Asset data process
"""

# Asset pool: a fixed list of index codes is used here in place of a fresh
# random draw from sample().
# unique_assets = sample(NUM_LIMIT, asset_dfs, CORR_LIMIT)
unique_assets = [
    '930963.CSI',
    '483051.CNI',
    '930923.CSI',
    'CN5122.CNI',
    'h20856.CSI',
    'h30467.CSI',
    'h30188.CSI',
    'h01179.CSI',
    '931722USD210.CSI',
    'h21094.CSI',
]

# Wide table of closing prices: one row per trade date, one column per pooled
# asset. Gaps are forward-filled, rows that still contain a missing value are
# dropped, and only the last BACKTEST_DAY rows are kept.
# NOTE(review): BACKTEST_DAY is not defined in the cells shown above - confirm
# it is set before this cell runs.
asset_index_copy = (
    asset_index
    .assign(TRADE_DT=pd.to_datetime(asset_index['TRADE_DT'], format='%Y%m%d'))
    .sort_values(by='TRADE_DT')
    .set_index('TRADE_DT')
    .pivot(columns='S_IRDCODE', values='CLOSE')
    .ffill()[unique_assets]
    .dropna()
    .tail(BACKTEST_DAY)
)

print(unique_assets)
print(asset_index_copy)