In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import load, dump, Parallel, delayed

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.mixture import GaussianMixture

from scipy.stats import pearsonr
from scipy.special import comb
from itertools import combinations

# load data

In [2]:
# Directory that holds the competition input pickles.
data_path = "../../inputs/ubiquant-market-prediction"

# Other pickles that were previously loaded here (kept for reference):
#   "train_new.pkl", "train_normalized.pkl"
train_pickle_path = os.path.join(data_path, "train_demodel_cnn.pkl")
train = pd.read_pickle(train_pickle_path)

# group by time id then normalize

In [None]:
# col = "target_demodel"
# col_mean = train[[col, "time_id"]].groupby("time_id").mean().rename(columns={col : "{}_mean".format(col)})
# col_std = train[[col, "time_id"]].groupby("time_id").std().rename(columns={col : "{}_std".format(col)})

# train = train.merge(col_mean, on="time_id")
# train = train.merge(col_std, on="time_id")

# train[col] = (train[col] - train["{}_mean".format(col)]) / train["{}_std".format(col)]

# train = train.drop(["{}_mean".format(col), "{}_std".format(col)], axis=1)

In [None]:
# Columns standardised within each time_id: f_0 ... f_299 plus four
# target-related columns.
feature_cols = ["f_{}".format(feature_idx) for feature_idx in range(300)] + ["target_normalized", "avg_target_normalized", "target_demean_normalized", "avg_target_demean_normalized"]

# Grouping key, looked up once; it is not modified inside the loop.
time_ids = train["time_id"]

for col in tqdm(feature_cols):

    # transform() broadcasts each time_id's statistic back onto that group's
    # rows. This yields the same z-score as merging per-group mean/std frames
    # (std uses the pandas default ddof=1 in both cases), but without copying
    # the whole frame several times per column, and it leaves the row order
    # and the index of `train` untouched.
    per_time_id = train[col].groupby(time_ids)

    # A group with zero or undefined std produces inf/NaN here, exactly as
    # the merge-based computation did.
    train[col] = (train[col] - per_time_id.transform("mean")) / per_time_id.transform("std")

# eda

In [None]:
# train.columns.tolist()

In [None]:
# feature_idx = 296
# feature_name = "f_{}".format(feature_idx)
# feature_arr = train[feature_name].values.flatten()

# gmm = GaussianMixture(n_components=3).fit(np.expand_dims(feature_arr, axis=1))

In [None]:
# Fit a StandardScaler on the single target_demodel column; only its fitted
# mean_ and var_ are used below.
demodel_column = train["target_demodel"].values.reshape(-1, 1)
demodel_scaler = StandardScaler().fit(demodel_column)

train_demodel_mean = demodel_scaler.mean_
train_demodel_var = demodel_scaler.var_

# Cap values lying more than three standard deviations from the mean.
demodel_sigma = np.sqrt(train_demodel_var[0])
demodel_lower = train_demodel_mean[0] - 3 * demodel_sigma
demodel_upper = train_demodel_mean[0] + 3 * demodel_sigma

train["target_demodel"] = train["target_demodel"].clip(demodel_lower, demodel_upper)

In [None]:
# Pearson correlation between the target_demodel column and the target column.
demodel_arr = train["target_demodel"].values
target_arr = train["target"].values

corr, _ = pearsonr(demodel_arr, target_arr)
print(corr)

In [None]:
# Columns covered by the scaler and by the clipping below:
# f_0 ... f_299 followed by four target-related columns.
target_like_cols = ["target_normalized", "avg_target_normalized", "target_demean_normalized", "avg_target_demean_normalized"]
feature_cols = list(map("f_{}".format, range(300))) + target_like_cols

scaler = StandardScaler().fit(train[feature_cols])

# Persist the fitted scaler together with the column order it was fitted on.
dump(scaler, "train_std_scaler.bin", compress=True)
dump(feature_cols, "train_std_scaler_columns.pkl")

# Per-column mean / variance from the fitted scaler, in feature_cols order.
train_mean, train_var = scaler.mean_, scaler.var_

# Clip every column (target-related ones included) to mean +/- 3 standard
# deviations; the bounds are computed once for all columns.
clip_halfwidth = 3 * np.sqrt(train_var)
clip_lower = train_mean - clip_halfwidth
clip_upper = train_mean + clip_halfwidth

for col_idx, col in enumerate(feature_cols):
    train[col] = train[col].clip(clip_lower[col_idx], clip_upper[col_idx])

In [None]:
# Write the per-column mean and variance arrays to the working directory.
for stats_array, stats_file in ((train_mean, "train_mean.pkl"), (train_var, "train_var.pkl")):
    dump(stats_array, stats_file)

In [None]:
# Print mean/std and draw a 200-bin histogram for every column in feature_cols.
for feature_col in feature_cols:

    # A single column is already one-dimensional, so no flatten() copy is needed.
    feature_arr = train[feature_col].to_numpy()

    print(feature_col, feature_arr.mean(), feature_arr.std())

    fig, ax = plt.subplots()
    ax.hist(feature_arr, bins=200)
    # Label each figure so it can be identified without the printed line above it.
    ax.set(title=feature_col, xlabel="value", ylabel="count")
    plt.show()
    # Release the figure explicitly; otherwise one figure per column stays
    # open on backends that do not close figures after show().
    plt.close(fig)

In [None]:
# Write the current frame back into the input directory.
# (A previous variant of this cell wrote "train_normalized.pkl" instead.)
demodel_pickle_path = os.path.join(data_path, "train_demodel.pkl")
train.to_pickle(demodel_pickle_path)

# group kfold

In [None]:
class CombinatorialPurgedGroupKFold():
    """Combinatorial group K-fold splitter with purging and embargo.

    The distinct values of ``groups`` (ordered by first appearance) are cut
    into ``n_splits`` contiguous blocks; the last block absorbs any remainder.
    Every combination of ``n_test_splits`` blocks is used once as the test
    set. Groups adjacent to a test block are excluded from training:
    ``purge`` groups immediately before it and ``purge + embargo`` groups
    immediately after it, where ``embargo = int(n_groups * pctEmbargo)``.

    Parameters
    ----------
    n_splits : int
        Number of contiguous group blocks.
    n_test_splits : int
        Number of blocks that form the test set of each fold.
    purge : int
        Number of groups dropped from training on each side of a test block.
    pctEmbargo : float
        Fraction of the number of groups additionally dropped after each
        test block.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    def get_n_splits(self, X = None, y = None, groups = None):
        """Return the number of folds ``split`` yields: C(n_splits, n_test_splits)."""
        return comb(self.n_splits, self.n_test_splits, exact = True)

    def split(self, X, y = None, groups = None):
        """Yield ``(train_idx, test_idx)`` lists of positional row indices.

        Parameters
        ----------
        X : sized object
            Only ``len(X)`` is used.
        y : ignored
        groups : array-like
            Group label of every row; must not be None.

        Raises
        ------
        ValueError
            If ``groups`` is None, if the number of folds exceeds the number
            of distinct groups, or if the embargo size is negative.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")

        # Convert once so that groups[idx] is positional even when a pandas
        # Series with a non-default index is passed in.
        groups = np.asarray(groups)

        # Distinct groups in order of first appearance.
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)

        # Map each group label to the row positions that carry it.
        group_dict = {}
        for idx in range(len(X)):
            group_dict.setdefault(groups[idx], []).append(idx)

        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        # Block boundaries as half-open [start, stop) POSITIONS into
        # unique_groups. Working with positions (not label values) keeps the
        # purge/embargo windows correct for arbitrary group labels.
        group_test_size = n_groups // self.n_splits
        bounds = []
        for split in range(self.n_splits):
            start = split * group_test_size
            stop = n_groups if split == self.n_splits - 1 else start + group_test_size
            bounds.append((start, stop))

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_pos = []
            banned_pos = set()
            for split in test_splits:
                start, stop = bounds[split]
                test_pos.extend(range(start, stop))
                # Purge window before the block; clamp at 0 so a negative
                # start cannot wrap around.
                banned_pos.update(range(max(0, start - self.purge), start))
                # Purge + embargo window after the block.
                banned_pos.update(range(stop, min(n_groups, stop + self.purge + mbrg)))

            # Set membership keeps the train-group filter linear in n_groups.
            excluded = banned_pos.union(test_pos)

            train_idx = []
            test_idx = []
            for pos in range(n_groups):
                if pos not in excluded:
                    train_idx += group_dict[unique_groups[pos]]
            for pos in test_pos:
                test_idx += group_dict[unique_groups[pos]]
            yield train_idx, test_idx

In [3]:
# Split whatever frame is currently bound to `train` into 4 folds grouped by
# time_id and write each fold's train / validation part to its own pickle.
# NOTE(review): this cell carries execution count 3, i.e. it was last run
# right after the load cell; confirm which state of `train` the
# "train_normalized_GroupKFold_*" files are meant to contain.
kfold = GroupKFold(n_splits=4)
fold_indices = kfold.split(X=train, y=train["target"], groups=train["time_id"])

for fold_id, (trn_idx, val_idx) in enumerate(fold_indices):

    df_train, df_val = train.iloc[trn_idx], train.iloc[val_idx]

    train_file = "train_normalized_GroupKFold_{}_train.pkl".format(fold_id)
    val_file = "train_normalized_GroupKFold_{}_val.pkl".format(fold_id)

    df_train.to_pickle(os.path.join(data_path, train_file))
    df_val.to_pickle(os.path.join(data_path, val_file))

In [None]:
# Reload "train_new.pkl" and export a 4-fold GroupKFold split of it,
# grouped by time_id, as one train and one validation pickle per fold.
data_path = "../../inputs/ubiquant-market-prediction"
train_new_path = os.path.join(data_path, "train_new.pkl")
train = pd.read_pickle(train_new_path)

kfold = GroupKFold(n_splits=4)
fold_indices = kfold.split(X=train, y=train["target"], groups=train["time_id"])

for fold_id, (trn_idx, val_idx) in enumerate(fold_indices):

    df_train = train.iloc[trn_idx]
    train_out = os.path.join(data_path, "train_new_GroupKFold_{}_train.pkl".format(fold_id))
    df_train.to_pickle(train_out)

    df_val = train.iloc[val_idx]
    val_out = os.path.join(data_path, "train_new_GroupKFold_{}_val.pkl".format(fold_id))
    df_val.to_pickle(val_out)

In [None]:
# Reload "train_demodel.pkl" and export a 4-fold GroupKFold split of it,
# grouped by time_id, as one train and one validation pickle per fold.
data_path = "../../inputs/ubiquant-market-prediction"
train_demodel_path = os.path.join(data_path, "train_demodel.pkl")
train = pd.read_pickle(train_demodel_path)

kfold = GroupKFold(n_splits=4)
fold_indices = kfold.split(X=train, y=train["target"], groups=train["time_id"])

for fold_id, (trn_idx, val_idx) in enumerate(fold_indices):

    df_train = train.iloc[trn_idx]
    train_out = os.path.join(data_path, "train_demodel_GroupKFold_{}_train.pkl".format(fold_id))
    df_train.to_pickle(train_out)

    df_val = train.iloc[val_idx]
    val_out = os.path.join(data_path, "train_demodel_GroupKFold_{}_val.pkl".format(fold_id))
    df_val.to_pickle(val_out)