# Hyper-parameter Tuning with Cross-Validation

### Loading Libraries

In [2]:
# Randomness
import random

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas import Timestamp

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline

# Date & Time
from datetime import datetime, timedelta

# Typing
from typing import Tuple, List, Dict, Union, Optional, Any, Generator

# Scikit-Learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve

# Scientific Statistical Python
from scipy.stats import jarque_bera

ImportError: cannot import name 'plot_roc_curve' from 'sklearn.metrics' (/Users/isisromero/anaconda3/envs/OOP/lib/python3.11/site-packages/sklearn/metrics/__init__.py)

In [3]:
class MyPipeline(Pipeline):
    
    def fit(self, X: pd.DataFrame, y: pd.Series, sample_weight: Optional[pd.Series] = None, **fit_params) -> 'MyPipeline':
        if sample_weight is not None:
            fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight
        return super().fit(X, y, **fit_params)

NameError: name 'Pipeline' is not defined

In [None]:
def clf_hyper_fit_base(
    feat: pd.DataFrame, lbl: pd.Series, t1: pd.Series, pipe_clf: Any, param_grid: Dict[str, list],
    cv: int = 3, bagging: list = [0, None, 1.0], n_jobs: int = -1, pctEmbargo: float = 0.0, **fit_params) -> Any:
   
    if set(lbl.values) == {0, 1}:
        scoring='f1'    # f1 for meta-labeling
    else:
        scoring='neg_log_loss'    # symmetric towards all cases
    inner_cv = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)    # purged
    gs=GridSearchCV(estimator=pipe_clf ,param_grid=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs)
    gs = gs.fit(feat, lbl, **fit_params).best_estimator_    # pipeline
    if bagging[1] is not None and bagging[1] > 0:
        gs = BaggingClassifier(base_estimator=MyPipeline(gs.steps), n_estimators=int(bagging[0]),
                               max_samples=float(bagging[1]), max_features=float(bagging[2]), n_jobs=n_jobs)
        gs = gs.fit(feat, lbl, sample_weight=fit_params[gs.base_estimator.steps[-1][0]+'__sample_weight'])
        gs = Pipeline([('bag', gs)])
    return gs

In [None]:

def clf_hyper_fit(
    feat: pd.DataFrame, lbl: pd.Series, t1: pd.Series, pipe_clf: Any, param_grid: Dict[str, list],
    cv: int = 3, bagging: list = [0, None, 1.0], rndSearchIter: int = 0,
    n_jobs: int = -1, pctEmbargo: float = 0.0, **fit_params) -> Any:
    
    if set(lbl.values) == {0, 1}:
        scoring='f1'    # f1 for meta-labeling
    else:
        scoring='neg_log_loss'    # symmetric towards all cases
    inner_cv = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)    # purged
    
    if rndSearchIter == 0:
        gs = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs)
    else:
        gs = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_grid, scoring=scoring,
                                cv=inner_cv, n_jobs=n_jobs, n_iter=rndSearchIter)
    gs = gs.fit(feat, lbl, **fit_params).best_estimator_    # pipeline
    
    if bagging[1] is not None and bagging[1] > 0:
        gs = BaggingClassifier(base_estimator=MyPipeline(gs.steps), n_estimators=int(bagging[0]),
                               max_samples=float(bagging[1]), max_features=float(bagging[2]), n_jobs=n_jobs)
        gs = gs.fit(feat, lbl, sample_weight=fit_params[gs.base_estimator.steps[-1][0]+'__sample_weight'])
        gs = Pipeline([('bag', gs)])
    return gs

In [None]:
class logUniform_gen(rv_continuous):
    def _cdf(self, x: float) -> float:
        return np.log(x / self.a) / np.log(self.b / self.a)


def log_uniform(a: float = 1.0, b: float = np.exp(1.0)) -> 'logUniform_gen':
    return logUniform_gen(a=a, b=b, name='log_uniform')

In [None]:
def get_IS_sharpe_ratio(clf: Any) -> float:
    best_estimator_ind = np.argmin(clf.cv_results_['rank_test_score'])
    mean_score = clf.cv_results_['mean_test_score'][best_estimator_ind]
    std_score = clf.cv_results_['std_test_score'][best_estimator_ind]
    if mean_score < 0:
        return -mean_score / std_score
    else:
        return mean_score / std_score