# Run hyper-parameter tuning for data preparation process

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir("..")

'/home/ludvigwgerdin/courses/Financial Big Data/FinanicalBigData-EPFL/code'

In [3]:
import pandas as pd
import numpy as np
import pyarrow.feather as feather

from sklearn.linear_model import LogisticRegression
from utils.cv import PurgedGroupTimeSeriesSplit
from lightgbm import LGBMClassifier

from utils.metrics import utility_score
from utils.data_preparation import preprocess

from sklearn import metrics

In [4]:
os.cpu_count()

8

In [5]:
df = feather.read_feather("../data/raw/train.feather")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float64(135), int64(3)
memory usage: 2.5 GB


Reduce the memory usage by half when importing the data. 
Source: https://www.kaggle.com/jorijnsmit/one-liner-to-halve-your-memory-usage

In [6]:
float64_cols = df.select_dtypes(include='float64').columns
mapper = {col_name: np.float32 for col_name in float64_cols}
df = df.astype(mapper, copy=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float32(135), int64(3)
memory usage: 1.3 GB


In [7]:
# Define label
df["action"] = np.where(df["resp"]*df["weight"]>0, 1, 0)
# Get feature names
features = df.columns[df.columns.str.contains("feature")]

### Split train and test sets. 
Test set is used to compare the different methods.  There are 500 days in total.  We use 400 days to train and 90 days to test. 

In [8]:
gap = 10
split = 400
train = df[df.date <= split]
test = df[df.date > split+gap]

### Cross-Validation loop.

In [9]:
# Initialize Gap group TS CV.
K = 5
tscv = PurgedGroupTimeSeriesSplit(n_splits=K, 
                                  max_train_group_size=189,
                                  group_gap=gap, 
                                  max_test_group_size=63
                                  )

In [31]:
# Define parameter grid for feature selection method.  
param_grid_selection = {
    'impute_strategy': ['mean', 'median'],
    'K': ['n_clusters', 10, 20, 30, 40]
}
combinations_selection = generate_all_combinations(param_grid_selection)

In [11]:
# Define parameter grid for merging features method.
param_grid_subcluster = {
    'impute_strategy': ['mean', 'median'],
    'agg': [np.mean, np.prod, np.sum]
}
combinations_subcluster = generate_all_combinations(param_grid_subcluster)

In [12]:
print("# Combinations for intra cluster feature selection",len(combinations_selection),
      "\n# Combination for merging subclusters",len(combinations_subcluster))

# Combinations for intra cluster feature selection 8 
# Combination for merging subclusters 6


In [13]:
def predict_and_evaluate(df, X_tr_sc, X_te_sc, 
                         X_te, y_tr, 
                         tr, te,
                         lgbm_all_f):
    # Train LGBM with new features
    lgbm_sc = LGBMClassifier().fit(X_tr_sc, y_tr)
    
    # Compute predictions for both LGBM.
    y_pred_all_f = lgbm_all_f.predict(X_te)
    y_pred_sc = lgbm_sc.predict(X_te_sc)

    # Compute scoring metrics.
    score_te_all_f = utility_score(
        date=df["date"].iloc[te].values,
        weight=df["weight"].iloc[te].values,
        resp=df["resp"].iloc[te].values,
        action=y_pred_all_f
    )
    
    score_te_sc = utility_score(
        date=df["date"].iloc[te].values,
        weight=df["weight"].iloc[te].values,
        resp=df["resp"].iloc[te].values,
        action=y_pred_sc
    )
    
    return score_te_all_f, score_te_sc

In [1]:
def compute_comb_performance(train, comb, method=1):
    """
    method=1 is cluster weighted feature selection.
    method=2 is subcluster feature merging
    """
    utility = []
    for fold, (tr, te) in enumerate(tscv.split(X=train, y=train["action"], groups=train["date"])):    
        print(f"Fold: {fold}")
        
        # Split train and test for X and y in CV.
        X_tr, y_tr = train[features].iloc[tr], train["action"].iloc[tr]
        X_te, y_te = train[features].iloc[te], train["action"].iloc[te]
    
        # Standardize, community detection, and feature selection
        X_tr_sc, X_te_sc, lgbm_all_f = preprocess(
            df = train,
            X_tr=X_tr,
            y_tr=y_tr,
            X_te=X_te,
            tr=tr,
            comb=comb,
            method=method
        )
        
        # Model fitting and evaluation
        score_te_all_f, score_te_sc = predict_and_evaluate(
            train, X_tr_sc, X_te_sc, X_te, y_tr, tr, te, lgbm_all_f
        )
        
        utility.append({"all_features":score_te_all_f,
                        "reduced_features":score_te_sc})
    utility_df = pd.DataFrame(utility)
    return utility_df.mean()

In [15]:
from joblib import Parallel, delayed
from tqdm import tqdm
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [32]:
results_selection = Parallel(n_jobs=2)(delayed(compute_comb_performance)(train, comb, method=1) for comb in combinations_selection)

In [None]:
scores_selection = pd.concat(results_selection, axis=1)
best_parameters_selection = combinations_selection[np.argmax(scores_selection.loc["reduced_features", :])]
best_parameters_selection

In [35]:
# Save results
import pickle
#pickle.dump(results_selection, open("../data/clean/results_selection.pickle", "wb"))
#pickle.dump(best_parameters_selection, open("../data/clean/best_parameters_selection.pickle", "wb"))

For the selection method, the gridsearch was accidentally run without one of the options. The best params are determined by merging results.

In [37]:
results_selection_n_clusters = pickle.load(open("../data/clean/results_selection_n_clusters.pickle", "rb"))
results_selection_wo_n_clusters = pickle.load(open("../data/clean/results_selection_wo_n_clusters.pickle", "rb"))

In [38]:
scores_selection = pd.concat(results_selection_n_clusters + results_selection_wo_n_clusters, axis=1)

In [39]:
scores_selection

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
all_features,191.738808,205.998504,191.738808,191.738808,191.738808,191.738808,205.998504,205.998504,205.998504,205.998504
reduced_features,39.292231,32.728847,41.968317,194.776724,131.787935,150.14435,109.495302,181.436749,179.904207,185.57016


The first two columns correspond to the gridsearch with only the n_clusters option for K. We see that an approach from the gridsearch without the option "n_clusters" performed best, so we use that in the results notebook

In [19]:
scores_subcluster = pd.concat(results_subcluster, axis=1)
best_parameters_subcluster = combinations_subcluster[np.argmax(scores_subcluster.loc["reduced_features", :])]
best_parameters_subcluster

{'impute_strategy': 'mean',
 'agg': <function numpy.sum(a, axis=None, dtype=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)>}

In [20]:
scores_subcluster

Unnamed: 0,0,1,2,3,4,5
all_features,188.069953,188.069953,188.069953,184.00506,184.00506,184.00506
reduced_features,177.831777,99.927195,188.796126,170.065785,84.249073,150.080256


In [21]:
# Save results
pickle.dump(best_parameters_subcluster, open("../data/clean/best_parameters_subclusters.pickle", "wb"))