In [1]:
import pandas as pd 
import numpy as np 
import datetime 

from CV import cross_validation as CV
from CV import combinatorial as CB

# Order 
1. X, Y generation 
2. Train(valid) / Test splitting
3. CPCV
   어차피 여기서 뒤에서 개수만큼 잘라주는 거면 데이터 포인트를 자르는 것과 다를바 없음
   


In [2]:
df = pd.read_csv("./data/data_input_demo.csv", index_col = [0])
df = df.set_index(['date'])

In [3]:
df.head()

Unnamed: 0_level_0,13ty_index,interty_index,lty_index,mbs_index,13cy_index,intercy_index,lcy_index,ty_index,cy_index,agg_index,real_known,cat_obs1,cat_obs2,cat_knwon1,cat_knwon2,static
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1997-05-19,133.46,784.22,840.64,751.37,668.1,893.58,912.47,824.23,861.0,715.66,215.632652,2,2,4,3,1.0
1997-05-20,133.58,785.01,840.34,751.82,668.74,894.56,912.29,824.78,861.51,716.09,980.112727,2,1,5,4,1.0
1997-05-21,133.59,784.63,836.47,751.97,668.77,893.58,908.76,823.54,859.71,715.31,247.766286,2,1,4,4,1.0
1997-05-22,133.58,784.33,834.66,751.97,668.71,892.96,906.86,822.86,858.69,714.88,306.038065,2,1,3,4,1.0
1997-05-23,133.58,784.7,835.59,752.27,668.71,893.58,907.77,823.37,859.37,715.31,944.644883,0,2,3,3,1.0


In [4]:

def generate_xy_seq(df: pd.DataFrame, x_seq = 66, y_seq = 22):
    """
    Generate samples from
    :param df:
    :param x_seq:
    :param y_seq:
    :param scaler:
    :return:
    # x: (epoch_size, input_length, num_nodes, input_dim)
    # y: (epoch_size, output_length, num_nodes, output_dim)
    """
    num_samples, num_nodes = df.shape
    dates_arr = np.array(df.index)
    data = np.expand_dims(df.values, axis = -1) # df -> array [N, F, 1]

    x_offsets = np.arange(-x_seq+1, 1)  
    y_offsets = np.arange(1, y_seq+1)

    # feature_list = [data]

    x, y = [], []
    x_date, y_date = [],[]

    min_t = abs(min(x_offsets))
    max_t = abs(num_samples - abs(max(y_offsets)))

    for t in range(min_t, max_t):
        # value seperation
        x.append(data[t+x_offsets, ...])
        y.append(data[t+y_offsets, ...])
        # date seperation
        x_date.append(dates_arr[t+x_offsets])
        y_date.append(dates_arr[t+y_offsets])
        
    x = np.stack(x, axis = 0)
    y = np.stack(y, axis = 0)

    x_date = np.stack(x_date, axis = 0)
    y_date = np.stack(y_date, axis = 0)

    return x, y, x_date, y_date

In [5]:
X, Y, x_date, y_date = generate_xy_seq(df)

In [6]:
X.shape

(6298, 66, 16, 1)

In [7]:
x_date.shape

(6298, 66)

# Search Path Num by train/test index

In [8]:
from itertools import combinations

In [12]:
# combination 
total_split_num = 10
val_split_num = 2

folds = [i for i in range(total_split_num)]
val_comb = list(combinations(folds, val_split_num))
fold_set = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(X.shape[0]), 6)]

In [13]:
len(val_comb)

45

In [251]:
# Path Generation 
train_split_num = total_split_num - val_split_num

path_fold_num  = train_split_num + 1 # 한 path 에 존재하는 fold 의 개수 / train_split_num + 1  = 5
path_num = int(len(val_comb) * val_split_num / total_split_num) # 전체 path 의 개수 = path_fold_num 

model_num = len(val_comb) 


[[0], [1], [2], [3], [4], [5], [6], [7], [8]]

In [15]:
model_num

45

In [258]:
total_split_num

10

In [257]:
train_split_num

8

In [191]:


'''
Path 별 Train model index

1. Path 별 Train model index 정하기 
    1) path 별 block 개수를 정한다. (전체 split 개수 - test split 개수)
    2) 전체 path 개수를 정한다. 
    3) path 별 model 의 index 를 결정해준다. 
        - i-th path 의 first value : i 
        - 각 path 의 j-th value : (j-1)th value + train_split_num - (j-1)
        - i-th path 의 (i+1)-th value 부터는 path_i[i] + 1

2. Path 별 하나의 Train 별 test set index 

    하나의 train 에 대해서 test set 두개 존재 
    두개중 어떤 것이 해당 path 에 해당되는 것인지 확인 
    
3. 
'''
total_path_train = {}

for path_ind in range(path_num): # 
    train_model_ind = {}
    path_ls = []
    if path_ind == 0 :   
        firstfold = [i for i in range(path_fold_num)]
        firstfold = [x  for x in firstfold]
        total_path_train[path_ind] = (firstfold)
        
    else:
        path_ls = [0 for _ in range(path_fold_num)] # [0, 0, 0, 0, 0]
        path_ls[0] = path_ind
        
        for minus in range(train_split_num):
            ind = minus + 1
            if ind <= path_ind: # ind : 채워 넣고자 하는 위치
                path_ls[ind] = path_ls[ind-1] + train_split_num - minus
            else: # ind > path_ind 
                path_ls[ind] = path_ls[ind-1] + 1
        path_ls = [x for x in path_ls]
        total_path_train[path_ind] = path_ls

        

In [192]:
total_path_train

{0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
 1: [1, 9, 10, 11, 12, 13, 14, 15, 16],
 2: [2, 10, 17, 18, 19, 20, 21, 22, 23],
 3: [3, 11, 18, 24, 25, 26, 27, 28, 29],
 4: [4, 12, 19, 25, 30, 31, 32, 33, 34],
 5: [5, 13, 20, 26, 31, 35, 36, 37, 38],
 6: [6, 14, 21, 27, 32, 36, 39, 40, 41],
 7: [7, 15, 22, 28, 33, 37, 40, 42, 43],
 8: [8, 16, 23, 29, 34, 38, 41, 43, 44]}

In [234]:
# path 별 TEST set 
total_path_test = {}

for path_ind in range(path_num):
    a = [0 for i in range(total_split_num-1)] # 각 path 에 대해서 test set 의 index 저장

    for split_ind in range(total_split_num-1):
        
        if split_ind < path_ind:
            a[split_ind] = 0        
        elif path_ind == split_ind:
            a[split_ind] = [0,1]
            
        else: # split_ind > path_ind
            a[split_ind] = 1
            
    total_path_test[path_ind] = a

In [235]:
total_path_train

{0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
 1: [1, 9, 10, 11, 12, 13, 14, 15, 16],
 2: [2, 10, 17, 18, 19, 20, 21, 22, 23],
 3: [3, 11, 18, 24, 25, 26, 27, 28, 29],
 4: [4, 12, 19, 25, 30, 31, 32, 33, 34],
 5: [5, 13, 20, 26, 31, 35, 36, 37, 38],
 6: [6, 14, 21, 27, 32, 36, 39, 40, 41],
 7: [7, 15, 22, 28, 33, 37, 40, 42, 43],
 8: [8, 16, 23, 29, 34, 38, 41, 43, 44]}

In [240]:
total_path_test

{0: [[0, 1], 1, 1, 1, 1, 1, 1, 1, 1],
 1: [0, [0, 1], 1, 1, 1, 1, 1, 1, 1],
 2: [0, 0, [0, 1], 1, 1, 1, 1, 1, 1],
 3: [0, 0, 0, [0, 1], 1, 1, 1, 1, 1],
 4: [0, 0, 0, 0, [0, 1], 1, 1, 1, 1],
 5: [0, 0, 0, 0, 0, [0, 1], 1, 1, 1],
 6: [0, 0, 0, 0, 0, 0, [0, 1], 1, 1],
 7: [0, 0, 0, 0, 0, 0, 0, [0, 1], 1],
 8: [0, 0, 0, 0, 0, 0, 0, 0, [0, 1]]}

In [237]:
list(total_path_train.values())

[[0, 1, 2, 3, 4, 5, 6, 7, 8],
 [1, 9, 10, 11, 12, 13, 14, 15, 16],
 [2, 10, 17, 18, 19, 20, 21, 22, 23],
 [3, 11, 18, 24, 25, 26, 27, 28, 29],
 [4, 12, 19, 25, 30, 31, 32, 33, 34],
 [5, 13, 20, 26, 31, 35, 36, 37, 38],
 [6, 14, 21, 27, 32, 36, 39, 40, 41],
 [7, 15, 22, 28, 33, 37, 40, 42, 43],
 [8, 16, 23, 29, 34, 38, 41, 43, 44]]

In [231]:
list(total_path_test.values())

[[(0, 1), 1, 1, 1, 1, 1, 1, 1, 1],
 [0, (0, 1), 1, 1, 1, 1, 1, 1, 1],
 [0, 0, (0, 1), 1, 1, 1, 1, 1, 1],
 [0, 0, 0, (0, 1), 1, 1, 1, 1, 1],
 [0, 0, 0, 0, (0, 1), 1, 1, 1, 1],
 [0, 0, 0, 0, 0, (0, 1), 1, 1, 1],
 [0, 0, 0, 0, 0, 0, (0, 1), 1, 1],
 [0, 0, 0, 0, 0, 0, 0, (0, 1), 1],
 [0, 0, 0, 0, 0, 0, 0, 0, (0, 1)]]

In [248]:
# train, test index 별 path 번호 
tr_ts_ind_path = {} 

for path_ind in range(path_num):
    for split_ind in range(9):
        tr_ind = list(total_path_train.values())[path_ind][split_ind]
        ts_ind = list(total_path_test.values())[path_ind][split_ind]
        if type(ts_ind) == list:
            for ts in ts_ind:
                tr_ts_ind_path[tr_ind, ts] = path_ind
        else:
            tr_ts_ind_path[tr_ind, ts_ind] = path_ind
        

In [250]:
tr_ts_ind_path

{(0, 0): 0,
 (0, 1): 0,
 (1, 1): 0,
 (2, 1): 0,
 (3, 1): 0,
 (4, 1): 0,
 (5, 1): 0,
 (6, 1): 0,
 (7, 1): 0,
 (8, 1): 0,
 (1, 0): 1,
 (9, 0): 1,
 (9, 1): 1,
 (10, 1): 1,
 (11, 1): 1,
 (12, 1): 1,
 (13, 1): 1,
 (14, 1): 1,
 (15, 1): 1,
 (16, 1): 1,
 (2, 0): 2,
 (10, 0): 2,
 (17, 0): 2,
 (17, 1): 2,
 (18, 1): 2,
 (19, 1): 2,
 (20, 1): 2,
 (21, 1): 2,
 (22, 1): 2,
 (23, 1): 2,
 (3, 0): 3,
 (11, 0): 3,
 (18, 0): 3,
 (24, 0): 3,
 (24, 1): 3,
 (25, 1): 3,
 (26, 1): 3,
 (27, 1): 3,
 (28, 1): 3,
 (29, 1): 3,
 (4, 0): 4,
 (12, 0): 4,
 (19, 0): 4,
 (25, 0): 4,
 (30, 0): 4,
 (30, 1): 4,
 (31, 1): 4,
 (32, 1): 4,
 (33, 1): 4,
 (34, 1): 4,
 (5, 0): 5,
 (13, 0): 5,
 (20, 0): 5,
 (26, 0): 5,
 (31, 0): 5,
 (35, 0): 5,
 (35, 1): 5,
 (36, 1): 5,
 (37, 1): 5,
 (38, 1): 5,
 (6, 0): 6,
 (14, 0): 6,
 (21, 0): 6,
 (27, 0): 6,
 (32, 0): 6,
 (36, 0): 6,
 (39, 0): 6,
 (39, 1): 6,
 (40, 1): 6,
 (41, 1): 6,
 (7, 0): 7,
 (15, 0): 7,
 (22, 0): 7,
 (28, 0): 7,
 (33, 0): 7,
 (37, 0): 7,
 (40, 0): 7,
 (42, 0): 7,
 (42,

## Test set 3 개 일 때

In [446]:

import pandas as pd
import numpy as np 
from itertools import combinations

# combination 
total_split_num = 6
val_split_num = 3

folds = [i for i in range(total_split_num)]
val_comb = list(combinations(folds, val_split_num))
fold_set = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(X.shape[0]), 6)]


In [447]:
val_comb

[(0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 2, 3),
 (0, 2, 4),
 (0, 2, 5),
 (0, 3, 4),
 (0, 3, 5),
 (0, 4, 5),
 (1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 3, 4),
 (1, 3, 5),
 (1, 4, 5),
 (2, 3, 4),
 (2, 3, 5),
 (2, 4, 5),
 (3, 4, 5)]

In [448]:
# Path Generation 
train_split_num = total_split_num - val_split_num

path_fold_num  = train_split_num + 1 # 한 path 에 존재하는 fold 의 개수 / train_split_num + 1  = 5
path_num = int(len(val_comb) * val_split_num / total_split_num) # 전체 path 의 개수 = path_fold_num 

In [456]:
path_fold_num

4

In [450]:
path_num

10

In [460]:

# total_path_train 
    # Path 별 train number
'''
Path 별 Train model index

1. Path 별 Train model index 정하기 
    1) path 별 block 개수를 정한다. (전체 split 개수 - test split 개수)
    2) 전체 path 개수를 정한다. 
    3) path 별 model 의 index 를 결정해준다. 
        - i-th path 의 first value : i 
        - 각 path 의 j-th value : (j-1)th value + train_split_num - (j-1)
        - i-th path 의 (i+1)-th value 부터는 path_i[i] + 1

2. Path 별 하나의 Train 별 test set index 

    하나의 train 에 대해서 test set 두개 존재 
    두개중 어떤 것이 해당 path 에 해당되는 것인지 확인 
    
3. 
'''
total_path_train = {}

for path_ind in range(path_num): # 
    train_model_ind = {}
    path_ls = []
    if path_ind == 0 :   
        firstfold = [i for i in range(path_fold_num)]
        firstfold = [x  for x in firstfold]
        total_path_train[path_ind] = (firstfold)
    
    elif path_ind <= (path_fold_num-1):
        path_ls = [0 for _ in range(path_fold_num)] # [0, 0, 0, 0, 0]
        path_ls[0] = path_ind
        for minus in range(train_split_num):
            ind = minus + 1
            if ind <= path_ind: # ind : 채워 넣고자 하는 위치
                path_ls[ind] = path_ls[ind-1] + train_split_num - minus
            else: # ind > path_ind 
                if ind > path_fold_num:
                  path_ls[ind] = path_ls[ind-1] + 6  
                # path_ls[ind] = path_ls[ind-1] + 1
                path_ls[ind] = path_ls[ind-1] + 1
    
    elif path_ind >= path_fold_num:
        path_ls = [0 for _ in range(path_fold_num)] # [0, 0, 0, 0, 0]
        
        
        
        
                    
        path_ls = [x for x in path_ls]
        total_path_train[path_ind] = path_ls


In [464]:
train_split_num

3

In [461]:
path_fold_num

4

In [462]:
total_path_train # 6C3 d일경우 0~3까지 작동 ok

{0: [0, 1, 2, 3],
 1: [1, 4, 5, 6],
 2: [2, 5, 7, 8],
 3: [3, 6, 8, 9],
 4: [4, 7, 9, 10],
 5: [5, 8, 10, 11],
 6: [6, 9, 11, 12],
 7: [7, 10, 12, 13],
 8: [8, 11, 13, 14],
 9: [9, 12, 14, 15]}

In [463]:
{4: [4, 10, 11, 12]}

{4: [4, 10, 11, 12]}

In [453]:
# path 별 TEST set 
total_path_test = {}

for path_ind in range(path_num):
    a = [0 for i in range(total_split_num-1)] # 각 path 에 대해서 test set 의 index 저장
    for split_ind in range(total_split_num-1):
        if split_ind < path_ind:
            a[split_ind] = 0        
        elif path_ind == split_ind:
            a[split_ind] = [0,1,2]
        else: # split_ind > path_ind
            a[split_ind] = 1
    total_path_test[path_ind] = a

In [454]:
total_path_test

{0: [[0, 1, 2], 1, 1, 1, 1],
 1: [0, [0, 1, 2], 1, 1, 1],
 2: [0, 0, [0, 1, 2], 1, 1],
 3: [0, 0, 0, [0, 1, 2], 1],
 4: [0, 0, 0, 0, [0, 1, 2]],
 5: [0, 0, 0, 0, 0],
 6: [0, 0, 0, 0, 0],
 7: [0, 0, 0, 0, 0],
 8: [0, 0, 0, 0, 0],
 9: [0, 0, 0, 0, 0]}

In [None]:
# train, test index 별 path 번호 
# search_path_num
tr_ts_ind_path = {} 
for path_ind in range(path_num):
    for split_ind in range(total_split_num - 1):
        tr_ind = list(total_path_train.values())[path_ind][split_ind]
        ts_ind = list(total_path_test.values())[path_ind][split_ind]
        if type(ts_ind) == list:
            for ts in ts_ind:
                tr_ts_ind_path[tr_ind, ts] = path_ind
        else:
            tr_ts_ind_path[tr_ind, ts_ind] = path_ind
        # tr_ts_ind_path = {(0,0): 0, (0,1): 0, (1,1): 0, ..., (43,1):8}

In [311]:
pathmap = TrainValidPathNum(total_split_num, 2)

In [336]:
train_valid_path = pathmap.train_valid_path()

In [337]:
train_valid_path

{(0, 0): 0,
 (0, 1): 0,
 (1, 1): 0,
 (2, 1): 0,
 (3, 1): 0,
 (4, 1): 0,
 (5, 1): 0,
 (6, 1): 0,
 (7, 1): 0,
 (8, 1): 0,
 (1, 0): 1,
 (9, 0): 1,
 (9, 1): 1,
 (10, 1): 1,
 (11, 1): 1,
 (12, 1): 1,
 (13, 1): 1,
 (14, 1): 1,
 (15, 1): 1,
 (16, 1): 1,
 (2, 0): 2,
 (10, 0): 2,
 (17, 0): 2,
 (17, 1): 2,
 (18, 1): 2,
 (19, 1): 2,
 (20, 1): 2,
 (21, 1): 2,
 (22, 1): 2,
 (23, 1): 2,
 (3, 0): 3,
 (11, 0): 3,
 (18, 0): 3,
 (24, 0): 3,
 (24, 1): 3,
 (25, 1): 3,
 (26, 1): 3,
 (27, 1): 3,
 (28, 1): 3,
 (29, 1): 3,
 (4, 0): 4,
 (12, 0): 4,
 (19, 0): 4,
 (25, 0): 4,
 (30, 0): 4,
 (30, 1): 4,
 (31, 1): 4,
 (32, 1): 4,
 (33, 1): 4,
 (34, 1): 4,
 (5, 0): 5,
 (13, 0): 5,
 (20, 0): 5,
 (26, 0): 5,
 (31, 0): 5,
 (35, 0): 5,
 (35, 1): 5,
 (36, 1): 5,
 (37, 1): 5,
 (38, 1): 5,
 (6, 0): 6,
 (14, 0): 6,
 (21, 0): 6,
 (27, 0): 6,
 (32, 0): 6,
 (36, 0): 6,
 (39, 0): 6,
 (39, 1): 6,
 (40, 1): 6,
 (41, 1): 6,
 (7, 0): 7,
 (15, 0): 7,
 (22, 0): 7,
 (28, 0): 7,
 (33, 0): 7,
 (37, 0): 7,
 (40, 0): 7,
 (42, 0): 7,
 (42,

In [489]:
# CPCV Path 
    # block 별로 path counting 하기 

n_splits = 6
n_val = 2

splits = [i for i in range(n_splits)]
val_comb = list(combinations(splits, n_val))
total_train_num = len(val_comb)

train_path_count = [0 for _ in range(n_splits)]
train_path_pair = {}
for val_group in val_comb:
    path_comb = []
    for split_ind in val_group:
        path_comb.append(train_path_count[split_ind])
        train_path_count[split_ind]+=1
    
    train_path_pair[val_group] = tuple(path_comb)

In [490]:
train_path_pair

{(0, 1): (0, 0),
 (0, 2): (1, 0),
 (0, 3): (2, 0),
 (0, 4): (3, 0),
 (0, 5): (4, 0),
 (1, 2): (1, 1),
 (1, 3): (2, 1),
 (1, 4): (3, 1),
 (1, 5): (4, 1),
 (2, 3): (2, 2),
 (2, 4): (3, 2),
 (2, 5): (4, 2),
 (3, 4): (3, 3),
 (3, 5): (4, 3),
 (4, 5): (4, 4)}

In [477]:
class CPCVPath:
    def __init__(self, n_groups, n_test_groups):
        self.n_groups = n_groups
        self.n_test_groups = n_test_groups
        # calculate test_groups combinations
        self.combination = list(combinations([x for x in range(n_groups)], n_test_groups))
        self._set_path_indexes()
    def _set_path_indexes(self):
        cnt_path = [0 for _ in range(self.n_groups)]
        print ("cnt_path: ", cnt_path)
        self.pairs = {}
        for group in self.combination:
            print ("-----------------------")
            print ("Group: ", group)
            temp_path = [0 for _ in range(len(group))]
            print ("temp_path: ", temp_path)
            for idx, group_index in enumerate(group):
                temp_path[idx] = cnt_path[group_index]
                print ("group_index : ", group_index)
                print ("cnt_path[group_index]: ", cnt_path[group_index])
                cnt_path[group_index] += 1
            self.pairs[group] = tuple(temp_path)
    def get_path(self, test_group_indexes):
        return self.pairs[test_group_indexes]

In [478]:
cppath = CPCVPath(6, 3)

cnt_path:  [0, 0, 0, 0, 0, 0]
-----------------------
Group:  (0, 1, 2)
temp_path:  [0, 0, 0]
group_index :  0
cnt_path[group_index]:  0
group_index :  1
cnt_path[group_index]:  0
group_index :  2
cnt_path[group_index]:  0
-----------------------
Group:  (0, 1, 3)
temp_path:  [0, 0, 0]
group_index :  0
cnt_path[group_index]:  1
group_index :  1
cnt_path[group_index]:  1
group_index :  3
cnt_path[group_index]:  0
-----------------------
Group:  (0, 1, 4)
temp_path:  [0, 0, 0]
group_index :  0
cnt_path[group_index]:  2
group_index :  1
cnt_path[group_index]:  2
group_index :  4
cnt_path[group_index]:  0
-----------------------
Group:  (0, 1, 5)
temp_path:  [0, 0, 0]
group_index :  0
cnt_path[group_index]:  3
group_index :  1
cnt_path[group_index]:  3
group_index :  5
cnt_path[group_index]:  0
-----------------------
Group:  (0, 2, 3)
temp_path:  [0, 0, 0]
group_index :  0
cnt_path[group_index]:  4
group_index :  2
cnt_path[group_index]:  1
group_index :  3
cnt_path[group_index]:  1
-----

In [472]:
cppath.pairs

{(0, 1, 2): (0, 0, 0),
 (0, 1, 3): (1, 1, 0),
 (0, 1, 4): (2, 2, 0),
 (0, 1, 5): (3, 3, 0),
 (0, 2, 3): (4, 1, 1),
 (0, 2, 4): (5, 2, 1),
 (0, 2, 5): (6, 3, 1),
 (0, 3, 4): (7, 2, 2),
 (0, 3, 5): (8, 3, 2),
 (0, 4, 5): (9, 3, 3),
 (1, 2, 3): (4, 4, 4),
 (1, 2, 4): (5, 5, 4),
 (1, 2, 5): (6, 6, 4),
 (1, 3, 4): (7, 5, 5),
 (1, 3, 5): (8, 6, 5),
 (1, 4, 5): (9, 6, 6),
 (2, 3, 4): (7, 7, 7),
 (2, 3, 5): (8, 8, 7),
 (2, 4, 5): (9, 8, 8),
 (3, 4, 5): (9, 9, 9)}

In [68]:
from typing import Callable
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.base import ClassifierMixin
from sklearn.model_selection import BaseCrossValidator


def ml_get_train_times(samples_info_sets: pd.Series, test_times: pd.Series) -> pd.Series:
    # pylint: disable=invalid-name
    """
    Advances in Financial Machine Learning, Snippet 7.1, page 106.

    Purging observations in the training set

    This function find the training set indexes given the information on which each record is based
    and the range for the test set.
    Given test_times, find the times of the training observations.

    :param samples_info_sets: (pd.Series) The information range on which each record is constructed from
        *samples_info_sets.index*: Time when the information extraction started.
        *samples_info_sets.value*: Time when the information extraction ended.
    :param test_times: (pd.Series) Times for the test dataset.
    :return: (pd.Series) Training set
    """
    train = samples_info_sets.copy(deep=True)
    # train.index : train start index 
    # train : train end index
    
    for start_ix, end_ix in test_times.iteritems():
        df0 = train[(start_ix <= train.index) & (train.index <= end_ix)].index  # Train starts within test
        df1 = train[(start_ix <= train) & (train <= end_ix)].index  # Train ends within test
        df2 = train[(train.index <= start_ix) & (end_ix <= train)].index  # Train envelops test
        train = train.drop(df0.union(df1).union(df2))
        
    return train


In [252]:

import pandas as pd
import numpy as np 
from itertools import combinations

# combination 
total_split_num = 10
val_split_num = 2

folds = [i for i in range(total_split_num)]
val_comb = list(combinations(folds, val_split_num))
fold_set = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(X.shape[0]), 6)]


# Path Generation 
train_split_num = total_split_num - val_split_num

path_fold_num  = train_split_num + 1 # 한 path 에 존재하는 fold 의 개수 / train_split_num + 1  = 5
path_num = int(len(val_comb) * val_split_num / total_split_num) # 전체 path 의 개수 = path_fold_num 

model_num = len(val_comb) 


In [254]:
model_num

45

# CPCV
- Combination 
- purging
- embargo 

In [389]:
"""
Implements the Combinatorial Purged Cross-Validation class from Chapter 12
"""
import sys 


from itertools import combinations
from typing import List

import pandas as pd
import numpy as np

from scipy.special import comb
from sklearn.model_selection import KFold
# from .cross_validation import ml_get_train_times


def _get_number_of_backtest_paths(n_train_splits: int, n_test_splits: int) -> float:
    """
    Number of combinatorial paths for CPCV(N,K)
    :param n_train_splits: (int) number of train splits
    :param n_test_splits: (int) number of test splits
    :return: (int) number of backtest paths for CPCV(N,k)
    """
    return int(comb(n_train_splits, n_train_splits - n_test_splits) * n_test_splits / n_train_splits)


class CombinatorialPurgedKFold(KFold):
    """
    Advances in Financial Machine Learning, Chapter 12.

    Implements Combinatial Purged Cross Validation (CPCV)

    The train is purged of observations overlapping test-label intervals
    Test set is assumed contiguous (shuffle=False), w/o training samples in between

    :param n_splits: (int) The number of splits. Default to 3
    :param samples_info_sets: (pd.Series) The information range on which each record is constructed from
        *samples_info_sets.index*: Time when the information extraction started.
        *samples_info_sets.value*: Time when the information extraction ended.
    :param pct_embargo: (float) Percent that determines the embargo size.
    """

    def __init__(self,
                 n_splits: int = 3,
                 n_test_splits: int = 2,
                 samples_info_sets: pd.Series = None,
                 pct_embargo: float = 0.):

        if not isinstance(samples_info_sets, pd.Series):
            raise ValueError('The samples_info_sets param must be a pd.Series')
        super(CombinatorialPurgedKFold, self).__init__(n_splits, shuffle=False, random_state=None)

        self.samples_info_sets = samples_info_sets
        self.pct_embargo = pct_embargo
        self.n_test_splits = n_test_splits
        self.num_backtest_paths = _get_number_of_backtest_paths(self.n_splits, self.n_test_splits)
        self.backtest_paths = []  # Array of backtest paths

    def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List:
        """
        Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits),
        generates combinatorial test ranges splits

        :param splits_indices: (dict) Test fold integer index: [start test index, end test index]
        :return: (list) Combinatorial test splits ([start index, end index])
        """

        # Possible test splits for each fold
        combinatorial_splits = list(combinations(list(splits_indices.keys()), self.n_test_splits))
        combinatorial_test_ranges = []  # List of test indices formed from combinatorial splits
        for combination in combinatorial_splits:
            temp_test_indices = []  # Array of test indices for current split combination
            for int_index in combination:
                temp_test_indices.append(splits_indices[int_index])
            combinatorial_test_ranges.append(temp_test_indices)
        return combinatorial_test_ranges

    def _fill_backtest_paths(self, train_indices: list, test_splits: list):
        """
        Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and
        place in the path where these indices should be used.

        :param test_splits: (list) of lists with first element corresponding to test start index and second - test end
        """
        # Fill backtest paths using train/test splits from CPCV
        for split in test_splits:
            found = False  # Flag indicating that split was found and filled in one of backtest paths
            for path in self.backtest_paths:
                for path_el in path:
                    if path_el['train'] is None and split == path_el['test'] and found is False:
                        path_el['train'] = np.array(train_indices)
                        path_el['test'] = list(range(split[0], split[-1]))
                        found = True

    # noinspection PyPep8Naming
    def split(self,
              X: pd.DataFrame,
              y: pd.Series = None,
              groups=None):
        """
        The main method to call for the PurgedKFold class

        :param X: (pd.DataFrame) Samples dataset that is to be split
        :param y: (pd.Series) Sample labels series
        :param groups: (array-like), with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        :return: (tuple) [train list of sample indices, and test list of sample indices]
        """
        if X.shape[0] != self.samples_info_sets.shape[0]:
            raise ValueError("X and the 'samples_info_sets' series param must be the same length")

        test_ranges: [(int, int)] = [(ix[0], ix[-1] + 1) for ix in np.array_split(np.arange(X.shape[0]), self.n_splits)]
        splits_indices = {}
        for index, [start_ix, end_ix] in enumerate(test_ranges):
            splits_indices[index] = [start_ix, end_ix]

        combinatorial_test_ranges = self._generate_combinatorial_test_ranges(splits_indices)
        # Prepare backtest paths
        for _ in range(self.num_backtest_paths):
            path = []
            for split_idx in splits_indices.values():
                path.append({'train': None, 'test': split_idx})
            self.backtest_paths.append(path)

        embargo: int = int(X.shape[0] * self.pct_embargo)
        
        for test_splits in combinatorial_test_ranges:
            
            # Embargo
            test_times = pd.Series(index=[self.samples_info_sets[ix[0]] for ix in test_splits], data=[
                self.samples_info_sets[ix[1] - 1] if ix[1] - 1 + embargo >= X.shape[0] else self.samples_info_sets[
                    ix[1] - 1 + embargo]
                for ix in test_splits])

            test_indices = []
            for [start_ix, end_ix] in test_splits:
                test_indices.append(list(range(start_ix, end_ix)))

            # Purge
            train_times = ml_get_train_times(self.samples_info_sets, test_times)

            # Get indices
            train_indices = []
            for train_ix in train_times.index:
                train_indices.append(self.samples_info_sets.index.get_loc(train_ix))

            self._fill_backtest_paths(train_indices, test_splits)

            yield np.array(train_indices), [np.array(x) for x in test_indices] 


In [500]:
combinatorial_test_ranges

NameError: name 'combinatorial_test_ranges' is not defined

In [390]:
asset_prices = df.copy()

In [391]:
asset_name = None
number_of_assets = None
time = None
length_of_time = None
first_weights = None
all_weights = None

asset_name, number_of_assets, time, length_of_time, first_weights, all_weights = initialize(asset_prices)
monthly_return = calculate_return(asset_prices)

In [497]:
all_weights

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [392]:
training_data = monthly_return[1:-12].copy() # 마지막 12 제외
test_data = monthly_return[-24:].copy() # 마지막 24 부터 시작
# test_data.drop(['B','N','P'],axis=1 ,inplace=True)
training_data_array = np.array(training_data)
test_data_array = np.array(test_data)

In [424]:
history_points = 12

sample_info_sets = pd.Series(index=training_data[:-history_points].index, data=training_data[history_points:].index)
    # history_points 간격을 유지하면서 진행 

pct_embargo = 0.01

cv_gen_purged = CombinatorialPurgedKFold(n_splits=total_split_num, n_test_splits= val_split_num, samples_info_sets=sample_info_sets, pct_embargo=pct_embargo)

In [501]:
sample_info_sets

date
1997-05-20    1997-06-05
1997-05-21    1997-06-06
1997-05-22    1997-06-09
1997-05-23    1997-06-10
1997-05-26    1997-06-11
                 ...    
2021-09-28    2021-10-14
2021-09-29    2021-10-15
2021-09-30    2021-10-18
2021-10-01    2021-10-19
2021-10-04    2021-10-20
Name: date, Length: 6360, dtype: object

In [499]:
training_data.head()

Unnamed: 0_level_0,13ty_index,interty_index,lty_index,mbs_index,13cy_index,intercy_index,lcy_index,ty_index,cy_index,agg_index,real_known,cat_obs1,cat_obs2,cat_knwon1,cat_knwon2,static
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1997-05-20,0.000899,0.001007,-0.000357,0.000599,0.000958,0.001097,-0.000197,0.000667,0.000592,0.000601,3.545289,0.0,-0.5,0.25,0.333333,0.0
1997-05-21,7.5e-05,-0.000484,-0.004605,0.0002,4.5e-05,-0.001096,-0.003869,-0.001503,-0.002089,-0.001089,-0.747206,0.0,0.0,-0.2,0.0,0.0
1997-05-22,-7.5e-05,-0.000382,-0.002164,0.0,-9e-05,-0.000694,-0.002091,-0.000826,-0.001186,-0.000601,0.235188,0.0,0.0,-0.25,0.0,0.0
1997-05-23,0.0,0.000472,0.001114,0.000399,0.0,0.000694,0.001003,0.00062,0.000792,0.000601,2.086691,-1.0,1.0,0.0,-0.25,0.0
1997-05-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.749731,0.0,-0.5,0.0,0.0,0.0


In [498]:
sample_info_sets

date
1997-05-20    1997-06-05
1997-05-21    1997-06-06
1997-05-22    1997-06-09
1997-05-23    1997-06-10
1997-05-26    1997-06-11
                 ...    
2021-09-28    2021-10-14
2021-09-29    2021-10-15
2021-09-30    2021-10-18
2021-10-01    2021-10-19
2021-10-04    2021-10-20
Name: date, Length: 6360, dtype: object

In [425]:
i = 0 

all_X_training_data = np.array([training_data_array[i:i+history_points].copy() for i in range(len(training_data_array) - history_points)])
all_y_training_data = np.array([training_data_array[i + history_points].copy() for i in range(len(training_data_array) - history_points)])
gen = cv_gen_purged.split(X=all_X_training_data, y=all_y_training_data)

In [426]:
all_X_training_data.shape

(6360, 12, 16)

In [427]:
all_X_training_data.shape

(6360, 12, 16)

# CPCV with Training Process

In [415]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import torch 
from torch import nn 


# Base Model 

In [None]:
batch_num = 32
epoch_num = 10

criterion = nn.L1Loss()
optimizer = torch.optim.Adam(Model.parameters(), lr = 0.01)

In [418]:
all_X_training_data.shape

(6360, 12, 16)

In [423]:
all_y_training_data.shape

(6360, 16)

In [422]:
np.array(df.index)[train]

array(['1997-05-19', '1997-05-20', '1997-05-21', ..., '2016-11-14',
       '2016-11-15', '2016-11-16'], dtype=object)

In [416]:
n_test = 2

total_loss = [0 for i in range(path_num)]

for train_ind, (train, valid_set) in enumerate(gen): # 결국 순서대로 -> 순서 자체가 train model number
    i +=1

    # print ('i:', i)
    # MODEL INIT

    # TRAIN     
    # TRAIN Dataloader generation 

    train_ds = TensorDataset(all_X_training_data[train], all_y_training_data[train])
    train_dl = DataLoader(train_ds, batch_size = batch_num, shuffle = True)
    
    # Epoch 
    for epoch in range(epoch_num):
        Model.train()
        epoch_train_loss = []
        
        for train_x, train_y in train_dl:

            train_pred = Model(train_x)
            
            loss_train = criterion(train_pred, train_y)
            loss_train.backward()
            optimizer.step()
            optimizer.zero_grad()
            

    # pred_tr = Model(TRAIN)
    # loss_tr = criterion(pred_tr, label)
    
    # TEST 
        # TEST Dataloader generation 
    Model.eval()
    for val_ind, valid in enumerate(valid_set):
        # print ('len vald: ', len(valid_set))
        valid_ds = TensorDataset(all_X_training_data[valid], all_y_training_data[valid])
        valid_dl = DataLoader(valid_ds, batch_size = batch_num, shuffle = True)
        
        for valid_x, valid_y in valid_dl:
            
            val_pred = Model(valid_x)
            
            criterion(val_pred, valid_y)
            
            
        path_ind = train_valid_path[(train_ind, val_ind)]
        # print (train_ind, val_ind)
        # print ("path_ind: ", path_ind)    
        # total_loss[path_ind] += loss_val
        
for i in range(len(total_loss)):
    total_loss[i] /= n_test



# print ("\n")
# print ("train shape: ", train.shape, "valid shape: ", valid.shape)
# train_shape_ls.append(train.shape)
# valid_shape_ls.append(valid.shape)

In [407]:
train

array([   0,    1,    2, ..., 5085, 5086, 5087])

In [410]:
X[train].shape

(5088, 66, 16, 1)

In [400]:
train.shape


(5088,)

In [None]:
train_ind, val_ind

(0, 0)

In [358]:
train_valid_path[train_ind, val_ind]

0

In [55]:
# Train - Test -Path 로 이어지는 dictionary 만들기 

path_ind = 0

train_test_path = {
    [i] for i in range(model_num)
}

train_ls = total_path_train[path_ind]
for train_ind in train_ls: 
    train_test_path[]
    


[1, 2, 3, 4, 5, 6, 7, 8, 9]

# X, Y generation 

In [20]:

def generate_xy_seq(df: pd.DataFrame, x_seq = 66, y_seq = 22):
    """
    Generate samples from
    :param df:
    :param x_seq:
    :param y_seq:
    :param scaler:
    :return:
    # x: (epoch_size, input_length, num_nodes, input_dim)
    # y: (epoch_size, output_length, num_nodes, output_dim)
    """
    num_samples, num_nodes = df.shape
    dates_arr = np.array(df.index)
    data = np.expand_dims(df.values, axis = -1) # df -> array [N, F, 1]

    x_offsets = np.arange(-x_seq+1, 1)  
    y_offsets = np.arange(1, y_seq+1)

    # feature_list = [data]

    x, y = [], []
    x_date, y_date = [],[]

    min_t = abs(min(x_offsets))
    max_t = abs(num_samples - abs(max(y_offsets)))

    for t in range(min_t, max_t):
        # value seperation
        x.append(data[t+x_offsets, ...])
        y.append(data[t+y_offsets, ...])
        # date seperation
        x_date.append(dates_arr[t+x_offsets])
        y_date.append(dates_arr[t+y_offsets])
        
    x = np.stack(x, axis = 0)
    y = np.stack(y, axis = 0)

    x_date = np.stack(x_date, axis = 0)
    y_date = np.stack(y_date, axis = 0)

    return x, y, x_date, y_date

In [21]:
x, y, x_date, y_date = generate_xy_seq(df)

In [22]:
x.shape

(6298, 66, 16, 1)

In [23]:
x_date.shape

(6298, 66)

# CPCV
- 6C2
- purging
- embargo 

In [24]:
def initialize(asset_prices):
    '''
    필요한 값들 생성

    :param asset_prices: (pd.DataFrame) Asset prices
    '''
    asset_name       = asset_prices.columns
    number_of_assets = asset_name.size
    time             = asset_prices.index
    length_of_time   = time.size
    first_weights    = np.ones(number_of_assets) / number_of_assets 
    all_weights      = np.zeros((length_of_time + 1, number_of_assets))

    return asset_name, number_of_assets, time, length_of_time, first_weights, all_weights


def calculate_return(asset_prices, resample_by=None):
    """
    수익률 계산 , 기간 resample 가능하게 만들기

    :param asset_prices: (pd.DataFrame) Asset prices
    :param resample_by: (str) Period to resample data, None for no resampling
    :return: (pd.DataFrame) Returns per asset
    """
    if resample_by:
        asset_prices = asset_prices.resample(resample_by).last()
    asset_returns = asset_prices.pct_change().fillna(0)
    return asset_returns

# 데이터분리 

In [70]:
a = []
# for i in range(10):
a.append(list(range(1,10)))
a.append(list(range(45,85)))

In [71]:
[np.array(x) for x in a]

[array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
        62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
        79, 80, 81, 82, 83, 84])]

In [72]:
from itertools import combinations

# Splitting

In [None]:
i +=1
print ('i:', i)
train, valid = next(gen)
print ("\n")
print ("train shape: ", train.shape, "valid shape: ", valid.shape)
train_shape_ls.append(train.shape)
valid_shape_ls.append(valid.shape)

i: 2
current test_splits:  [[0, 1060], [2120, 3180]]
[0] [2]


train shape:  (4090,) valid shape:  (2120,)


In [None]:
print ((1060-0)+(6360 - 5300))

2120


In [None]:
train_shape_ls

[(4165,), (4090,)]

In [None]:
valid_shape_ls

[(2120,), (2120,)]

In [None]:
fold_dict = {}

for key, value in enumerate(fold_set):
    fold_dict[key] = value

In [None]:
fold_dict

{0: (0, 852),
 1: (852, 1704),
 2: (1704, 2555),
 3: (2555, 3406),
 4: (3406, 4257),
 5: (4257, 5108)}

In [None]:
test_splits

NameError: name 'test_splits' is not defined

In [None]:
combinatorial_test_ranges = cv_gen_purged._generate_combinatorial_test_ranges(fold_dict)

In [None]:
combinatorial_test_ranges

[[(0, 852), (852, 1704)],
 [(0, 852), (1704, 2555)],
 [(0, 852), (2555, 3406)],
 [(0, 852), (3406, 4257)],
 [(0, 852), (4257, 5108)],
 [(852, 1704), (1704, 2555)],
 [(852, 1704), (2555, 3406)],
 [(852, 1704), (3406, 4257)],
 [(852, 1704), (4257, 5108)],
 [(1704, 2555), (2555, 3406)],
 [(1704, 2555), (3406, 4257)],
 [(1704, 2555), (4257, 5108)],
 [(2555, 3406), (3406, 4257)],
 [(2555, 3406), (4257, 5108)],
 [(3406, 4257), (4257, 5108)]]

In [None]:
len(combinatorial_test_ranges)

15

In [None]:
sample_info_sets[0]

'1997-06-05'

In [None]:
sample_info_sets

date
1997-05-20    1997-06-05
1997-05-21    1997-06-06
1997-05-22    1997-06-09
1997-05-23    1997-06-10
1997-05-26    1997-06-11
                 ...    
2021-09-28    2021-10-14
2021-09-29    2021-10-15
2021-09-30    2021-10-18
2021-10-01    2021-10-19
2021-10-04    2021-10-20
Name: date, Length: 6360, dtype: object