In [1]:
import pandas as pd 
import numpy as np 
import datetime 

from CV import cross_validation as CV
from CV import combinatorial as CB

# Order 
1. X, Y generation 
2. Train(valid) / Test splitting
3. CPCV
   어차피 여기서 뒤에서 개수만큼 잘라주는 거면 데이터 포인트를 자르는 것과 다를바 없음
   


In [2]:
df = pd.read_csv("./data/data_input_demo.csv", index_col = [0])
df = df.set_index(['date'])

In [3]:
df.head()

Unnamed: 0_level_0,13ty_index,interty_index,lty_index,mbs_index,13cy_index,intercy_index,lcy_index,ty_index,cy_index,agg_index,real_known,cat_obs1,cat_obs2,cat_knwon1,cat_knwon2,static
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1997-05-19,133.46,784.22,840.64,751.37,668.1,893.58,912.47,824.23,861.0,715.66,215.632652,2,2,4,3,1.0
1997-05-20,133.58,785.01,840.34,751.82,668.74,894.56,912.29,824.78,861.51,716.09,980.112727,2,1,5,4,1.0
1997-05-21,133.59,784.63,836.47,751.97,668.77,893.58,908.76,823.54,859.71,715.31,247.766286,2,1,4,4,1.0
1997-05-22,133.58,784.33,834.66,751.97,668.71,892.96,906.86,822.86,858.69,714.88,306.038065,2,1,3,4,1.0
1997-05-23,133.58,784.7,835.59,752.27,668.71,893.58,907.77,823.37,859.37,715.31,944.644883,0,2,3,3,1.0


# X, Y generation 

In [4]:

def generate_xy_seq(df: pd.DataFrame, x_seq = 66, y_seq = 22):
    """
    Generate samples from
    :param df:
    :param x_seq:
    :param y_seq:
    :param scaler:
    :return:
    # x: (epoch_size, input_length, num_nodes, input_dim)
    # y: (epoch_size, output_length, num_nodes, output_dim)
    """
    num_samples, num_nodes = df.shape
    dates_arr = np.array(df.index)
    data = np.expand_dims(df.values, axis = -1) # df -> array [N, F, 1]

    x_offsets = np.arange(-x_seq+1, 1)  
    y_offsets = np.arange(1, y_seq+1)

    # feature_list = [data]

    x, y = [], []
    x_date, y_date = [],[]

    min_t = abs(min(x_offsets))
    max_t = abs(num_samples - abs(max(y_offsets)))

    for t in range(min_t, max_t):
        # value seperation
        x.append(data[t+x_offsets, ...])
        y.append(data[t+y_offsets, ...])
        # date seperation
        x_date.append(dates_arr[t+x_offsets])
        y_date.append(dates_arr[t+y_offsets])
        
    x = np.stack(x, axis = 0)
    y = np.stack(y, axis = 0)

    x_date = np.stack(x_date, axis = 0)
    y_date = np.stack(y_date, axis = 0)

    return x, y, x_date, y_date

In [5]:
x, y, x_date, y_date = generate_xy_seq(df)

In [6]:
x.shape

(6298, 66, 16, 1)

In [7]:
x_date.shape

(6298, 66)

# Splitting

In [8]:
num_samples = df.shape[0]
num_train = round(num_samples * 0.8)
# num_val = round(num_train * 0.2)
# num_train = num_train - num_val   
num_test = num_samples - (num_train)

In [9]:
TRAIN = df.iloc[:num_train]
TEST = df.iloc[num_train : ]

# CPCV
- 6C2
- purging
- embargo 

In [10]:
def initialize(asset_prices):
    '''
    필요한 값들 생성

    :param asset_prices: (pd.DataFrame) Asset prices
    '''
    asset_name       = asset_prices.columns
    number_of_assets = asset_name.size
    time             = asset_prices.index
    length_of_time   = time.size
    first_weights    = np.ones(number_of_assets) / number_of_assets 
    all_weights      = np.zeros((length_of_time + 1, number_of_assets))

    return asset_name, number_of_assets, time, length_of_time, first_weights, all_weights


def calculate_return(asset_prices, resample_by=None):
    """
    수익률 계산 , 기간 resample 가능하게 만들기

    :param asset_prices: (pd.DataFrame) Asset prices
    :param resample_by: (str) Period to resample data, None for no resampling
    :return: (pd.DataFrame) Returns per asset
    """
    if resample_by:
        asset_prices = asset_prices.resample(resample_by).last()
    asset_returns = asset_prices.pct_change().fillna(0)
    return asset_returns

In [11]:
asset_prices = df.copy()

In [12]:
asset_name = None
number_of_assets = None
time = None
length_of_time = None
first_weights = None
all_weights = None

asset_name, number_of_assets, time, length_of_time, first_weights, all_weights = initialize(asset_prices)
monthly_return = calculate_return(asset_prices)

In [13]:
training_data = monthly_return[1:-12].copy() # 마지막 12 제외
test_data = monthly_return[-24:].copy() # 마지막 24 부터 시작
# test_data.drop(['B','N','P'],axis=1 ,inplace=True)
training_data_array = np.array(training_data)
test_data_array = np.array(test_data)

# 데이터분리 

In [14]:
from typing import Callable
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.base import ClassifierMixin
from sklearn.model_selection import BaseCrossValidator


def ml_get_train_times(samples_info_sets: pd.Series, test_times: pd.Series) -> pd.Series:
    # pylint: disable=invalid-name
    """
    Advances in Financial Machine Learning, Snippet 7.1, page 106.

    Purging observations in the training set

    This function find the training set indexes given the information on which each record is based
    and the range for the test set.
    Given test_times, find the times of the training observations.

    :param samples_info_sets: (pd.Series) The information range on which each record is constructed from
        *samples_info_sets.index*: Time when the information extraction started.
        *samples_info_sets.value*: Time when the information extraction ended.
    :param test_times: (pd.Series) Times for the test dataset.
    :return: (pd.Series) Training set
    """
    train = samples_info_sets.copy(deep=True)
    # train.index : train start index 
    # train : train end index
    
    for start_ix, end_ix in test_times.iteritems():
        df0 = train[(start_ix <= train.index) & (train.index <= end_ix)].index  # Train starts within test
        df1 = train[(start_ix <= train) & (train <= end_ix)].index  # Train ends within test
        df2 = train[(train.index <= start_ix) & (end_ix <= train)].index  # Train envelops test
        train = train.drop(df0.union(df1).union(df2))
        
    return train


In [15]:
"""
Implements the Combinatorial Purged Cross-Validation class from Chapter 12
"""
import sys 
from itertools import combinations
from typing import List

import pandas as pd
import numpy as np

from scipy.special import comb
from sklearn.model_selection import KFold
# from .cross_validation import ml_get_train_times


def _get_number_of_backtest_paths(n_train_splits: int, n_test_splits: int) -> float:
    """
    Number of combinatorial paths for CPCV(N,K)
    :param n_train_splits: (int) number of train splits
    :param n_test_splits: (int) number of test splits
    :return: (int) number of backtest paths for CPCV(N,k)
    """
    return int(comb(n_train_splits, n_train_splits - n_test_splits) * n_test_splits / n_train_splits)


class CombinatorialPurgedKFold(KFold):
    """
    Advances in Financial Machine Learning, Chapter 12.

    Implements Combinatial Purged Cross Validation (CPCV)

    The train is purged of observations overlapping test-label intervals
    Test set is assumed contiguous (shuffle=False), w/o training samples in between

    :param n_splits: (int) The number of splits. Default to 3
    :param samples_info_sets: (pd.Series) The information range on which each record is constructed from
        *samples_info_sets.index*: Time when the information extraction started.
        *samples_info_sets.value*: Time when the information extraction ended.
    :param pct_embargo: (float) Percent that determines the embargo size.
    """

    def __init__(self,
                 n_splits: int = 3,
                 n_test_splits: int = 2,
                 samples_info_sets: pd.Series = None,
                 pct_embargo: float = 0.):

        if not isinstance(samples_info_sets, pd.Series):
            raise ValueError('The samples_info_sets param must be a pd.Series')
        super(CombinatorialPurgedKFold, self).__init__(n_splits, shuffle=False, random_state=None)

        self.samples_info_sets = samples_info_sets
        self.pct_embargo = pct_embargo
        self.n_test_splits = n_test_splits
        self.num_backtest_paths = _get_number_of_backtest_paths(self.n_splits, self.n_test_splits)
        self.backtest_paths = []  # Array of backtest paths

    def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List:
        """
        Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits),
        generates combinatorial test ranges splits

        :param splits_indices: (dict) Test fold integer index: [start test index, end test index]
        :return: (list) Combinatorial test splits ([start index, end index])
        """

        # Possible test splits for each fold
        combinatorial_splits = list(combinations(list(splits_indices.keys()), self.n_test_splits))
        combinatorial_test_ranges = []  # List of test indices formed from combinatorial splits
        for combination in combinatorial_splits:
            temp_test_indices = []  # Array of test indices for current split combination
            for int_index in combination:
                temp_test_indices.append(splits_indices[int_index])
            combinatorial_test_ranges.append(temp_test_indices)
        return combinatorial_test_ranges

    def _fill_backtest_paths(self, train_indices: list, test_splits: list):
        """
        Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and
        place in the path where these indices should be used.

        :param test_splits: (list) of lists with first element corresponding to test start index and second - test end
        """
        # Fill backtest paths using train/test splits from CPCV
        for split in test_splits:
            found = False  # Flag indicating that split was found and filled in one of backtest paths
            for path in self.backtest_paths:
                for path_el in path:
                    if path_el['train'] is None and split == path_el['test'] and found is False:
                        path_el['train'] = np.array(train_indices)
                        path_el['test'] = list(range(split[0], split[-1]))
                        found = True

    # noinspection PyPep8Naming
    def split(self,
              X: pd.DataFrame,
              y: pd.Series = None,
              groups=None):
        """
        The main method to call for the PurgedKFold class

        :param X: (pd.DataFrame) Samples dataset that is to be split
        :param y: (pd.Series) Sample labels series
        :param groups: (array-like), with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        :return: (tuple) [train list of sample indices, and test list of sample indices]
        """
        if X.shape[0] != self.samples_info_sets.shape[0]:
            raise ValueError("X and the 'samples_info_sets' series param must be the same length")

        test_ranges: [(int, int)] = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(X.shape[0]), self.n_splits)]
        print ("test_ranges: ", test_ranges)
        print ("test_ranges len: ", len(test_ranges))
        splits_indices = {}
        splits_indices_reverse = {}
        for index, [start_ix, end_ix] in enumerate(test_ranges):
            splits_indices[index] = [start_ix, end_ix]
            splits_indices_reverse[start_ix, end_ix] = [index]
        print ("splits_indices: ", splits_indices)
        print ("splits_indices len : ", len(splits_indices))
        print ("splits_indices_reverse: ", splits_indices_reverse)
        combinatorial_test_ranges = self._generate_combinatorial_test_ranges(splits_indices)
        
        print ("combinatorial_test_ranges: ", combinatorial_test_ranges)
        print ("combinatorial_test_ranges len: ", len(combinatorial_test_ranges))
        
        # Prepare backtest paths
        for _ in range(self.num_backtest_paths):
            path = []
            for split_idx in splits_indices.values():
                path.append({'train': None, 'test': split_idx})
            self.backtest_paths.append(path)

        print ('X.shape: ',X.shape)
        embargo: int = int(X.shape[0] * self.pct_embargo)
        print ("comb test ranges len: ", len(combinatorial_test_ranges))
        for test_splits in combinatorial_test_ranges:
            
            print ("current test_splits: ", test_splits)
            print (splits_indices_reverse[tuple(test_splits[0])], splits_indices_reverse[tuple(test_splits[1])])
            
            # print (   )
            
            # Embargo
            test_times = pd.Series(index=[self.samples_info_sets[ix[0]] for ix in test_splits], data=[
                self.samples_info_sets[ix[1] - 1] if ix[1] - 1 + embargo >= X.shape[0] else self.samples_info_sets[
                    ix[1] - 1 + embargo]
                for ix in test_splits]) # test time list 에 train 으로부터 소거할 만큼

            test_indices = []
            for [start_ix, end_ix] in test_splits:
                test_indices.extend(list(range(start_ix, end_ix)))

            # Purge
            train_times = ml_get_train_times(self.samples_info_sets, test_times)

            # Get indices
            train_indices = []
            for train_ix in train_times.index:
                train_indices.append(self.samples_info_sets.index.get_loc(train_ix))

            self._fill_backtest_paths(train_indices, test_splits)

            yield np.array(train_indices), np.array(test_indices)


In [16]:
from itertools import combinations

In [17]:
numFold = 6 
combNum = 2
folds = [i for i in range(numFold)]
val_comb = list(combinations(folds, combNum))
fold_set = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(TRAIN.shape[0]), 6)]

# Splitting

In [18]:
history_points = 12
sample_info_sets = pd.Series(index=training_data[:-history_points].index, data=training_data[history_points:].index)
    # history_points 간격을 유지하면서 진행 

pct_embargo = 0.01

cv_gen_purged = CombinatorialPurgedKFold(n_splits=6, n_test_splits= 2,samples_info_sets=sample_info_sets, pct_embargo=pct_embargo)

In [19]:
i = 0 

all_X_training_data = np.array([training_data_array[i:i+history_points].copy() for i in range(len(training_data_array) - history_points)])
all_y_training_data = np.array([training_data_array[i + history_points].copy() for i in range(len(training_data_array) - history_points)])
gen = cv_gen_purged.split(X=all_X_training_data, y=all_y_training_data)

In [20]:
train_shape_ls = []
valid_shape_ls = []


In [21]:
i +=1
print ('i:', i)
train, valid = next(gen)
print ("\n")
print ("train shape: ", train.shape, "valid shape: ", valid.shape)
train_shape_ls.append(train.shape)
valid_shape_ls.append(valid.shape)

i: 1
test_ranges:  [(0, 1060), (1060, 2120), (2120, 3180), (3180, 4240), (4240, 5300), (5300, 6360)]
test_ranges len:  6
splits_indices:  {0: [0, 1060], 1: [1060, 2120], 2: [2120, 3180], 3: [3180, 4240], 4: [4240, 5300], 5: [5300, 6360]}
splits_indices len :  6
splits_indices_reverse:  {(0, 1060): [0], (1060, 2120): [1], (2120, 3180): [2], (3180, 4240): [3], (4240, 5300): [4], (5300, 6360): [5]}
combinatorial_test_ranges:  [[[0, 1060], [1060, 2120]], [[0, 1060], [2120, 3180]], [[0, 1060], [3180, 4240]], [[0, 1060], [4240, 5300]], [[0, 1060], [5300, 6360]], [[1060, 2120], [2120, 3180]], [[1060, 2120], [3180, 4240]], [[1060, 2120], [4240, 5300]], [[1060, 2120], [5300, 6360]], [[2120, 3180], [3180, 4240]], [[2120, 3180], [4240, 5300]], [[2120, 3180], [5300, 6360]], [[3180, 4240], [4240, 5300]], [[3180, 4240], [5300, 6360]], [[4240, 5300], [5300, 6360]]]
combinatorial_test_ranges len:  15
X.shape:  (6360, 12, 16)
comb test ranges len:  15
current test_splits:  [[0, 1060], [1060, 2120]]
[0]

In [22]:
i +=1
print ('i:', i)
train, valid = next(gen)
print ("\n")
print ("train shape: ", train.shape, "valid shape: ", valid.shape)
train_shape_ls.append(train.shape)
valid_shape_ls.append(valid.shape)

i: 2
current test_splits:  [[0, 1060], [2120, 3180]]
[0] [2]


train shape:  (4090,) valid shape:  (2120,)


In [23]:
print ((1060-0)+(6360 - 5300))

2120


In [24]:
train_shape_ls

[(4165,), (4090,)]

In [25]:
valid_shape_ls

[(2120,), (2120,)]

In [26]:
fold_dict = {}

for key, value in enumerate(fold_set):
    fold_dict[key] = value

In [27]:
fold_dict

{0: (0, 852),
 1: (852, 1704),
 2: (1704, 2555),
 3: (2555, 3406),
 4: (3406, 4257),
 5: (4257, 5108)}

In [28]:
test_splits

NameError: name 'test_splits' is not defined

In [None]:
combinatorial_test_ranges = cv_gen_purged._generate_combinatorial_test_ranges(fold_dict)

In [None]:
combinatorial_test_ranges

[[(0, 852), (852, 1704)],
 [(0, 852), (1704, 2555)],
 [(0, 852), (2555, 3406)],
 [(0, 852), (3406, 4257)],
 [(0, 852), (4257, 5108)],
 [(852, 1704), (1704, 2555)],
 [(852, 1704), (2555, 3406)],
 [(852, 1704), (3406, 4257)],
 [(852, 1704), (4257, 5108)],
 [(1704, 2555), (2555, 3406)],
 [(1704, 2555), (3406, 4257)],
 [(1704, 2555), (4257, 5108)],
 [(2555, 3406), (3406, 4257)],
 [(2555, 3406), (4257, 5108)],
 [(3406, 4257), (4257, 5108)]]

In [None]:
len(combinatorial_test_ranges)

15

In [None]:
sample_info_sets[0]

'1997-06-05'

In [None]:
sample_info_sets

date
1997-05-20    1997-06-05
1997-05-21    1997-06-06
1997-05-22    1997-06-09
1997-05-23    1997-06-10
1997-05-26    1997-06-11
                 ...    
2021-09-28    2021-10-14
2021-09-29    2021-10-15
2021-09-30    2021-10-18
2021-10-01    2021-10-19
2021-10-04    2021-10-20
Name: date, Length: 6360, dtype: object

In [None]:
def yield_def():
    for i in [1,2,3]:
        yield (i)

In [None]:
a = yield_def()
next(a)

1

In [None]:
a = 0

train_id = []
valid_id = []
all_X_training_data = np.array([training_data_array[i:i+history_points].copy() for i in range(len(training_data_array) - history_points)])
all_y_training_data = np.array([training_data_array[i + history_points].copy() for i in range(len(training_data_array) - history_points)])
gen = cv_gen_purged.split(X=all_X_training_data, y=all_y_training_data)


In [None]:
embargo = all_X_training_data.shape[0] * pct_embargo

test_splits = fold_set

test_times = pd.Series(index=[sample_info_sets[ix[0]] for ix in test_splits], data=[
                sample_info_sets[ix[1] - 1] if ix[1] - 1 + embargo >= all_X_training_data.shape[0] else sample_info_sets[
                    ix[1] - 1 + embargo]
                for ix in test_splits])

KeyError: 914.6

In [None]:
history_points = 12
sample_info_sets = pd.Series(index=training_data[:-history_points].index, data=training_data[history_points:].index)

pct_embargo = 0.01
cv_gen_purged = CB.CombinatorialPurgedKFold(n_splits=4, samples_info_sets=sample_info_sets, pct_embargo=pct_embargo)

In [None]:
df.shape

(6385, 16)

In [None]:
sample_info_sets.shape

(6360,)

In [None]:
cv_gen_purged

In [None]:
combinatorial_test_ranges

[[(0, 852), (852, 1704)],
 [(0, 852), (1704, 2555)],
 [(0, 852), (2555, 3406)],
 [(0, 852), (3406, 4257)],
 [(0, 852), (4257, 5108)],
 [(852, 1704), (1704, 2555)],
 [(852, 1704), (2555, 3406)],
 [(852, 1704), (3406, 4257)],
 [(852, 1704), (4257, 5108)],
 [(1704, 2555), (2555, 3406)],
 [(1704, 2555), (3406, 4257)],
 [(1704, 2555), (4257, 5108)],
 [(2555, 3406), (3406, 4257)],
 [(2555, 3406), (4257, 5108)],
 [(3406, 4257), (4257, 5108)]]