In [2]:
import rpwf
from rpwf import database, rpwf
from pathlib import Path
from typing import Dict
from dataclasses import dataclass
import pandas
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    GridSearchCV,
    cross_val_score
)

In [3]:
# Directory used below as the rpwf project root.
# NOTE(review): absolute, user-specific Windows temp path; it has to be edited
# before this notebook can run on another machine.
tmp_dir = Path("C:\\Users\\hp458\\AppData\\Local\\Temp\\1\\Rtmp0kP0ze\\vignetteab4c80a84")
# as_posix() already returns a str, so no extra str() conversion is required.
tmp_dir_posix = tmp_dir.as_posix()
print(tmp_dir_posix)

C:/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84


In [None]:
@dataclass
class TestArgs:
    """Container for the settings consumed by the cells of this notebook.

    NOTE(review): presumably mirrors the command-line options of
    ``rpwf.script.nested_resampling`` (``-db``, ``-c``, ``-icv``, ``-icr``,
    ``-ocv``, ``-ocr``) so the script body can be run interactively; confirm.
    """

    db_name: str  # passed as the second argument to database.Base
    project_root: str  # passed as the first argument to database.Base
    cores: int  # forwarded as n_jobs to the scikit-learn calls
    inner_n_cv: int  # n_splits of the inner RepeatedStratifiedKFold
    inner_n_repeats: int  # n_repeats of the inner RepeatedStratifiedKFold
    outer_n_cv: int  # n_splits of the outer RepeatedStratifiedKFold
    outer_n_repeats: int  # n_repeats of the outer RepeatedStratifiedKFold

In [None]:
# Small, fast settings for interactive use: 2 splits and 1 repeat on both the
# inner and the outer resampling level, 3 parallel jobs.
args = TestArgs(
    db_name="db.SQLite",
    project_root=tmp_dir_posix,
    cores=3,
    inner_n_cv=2,
    inner_n_repeats=1,
    outer_n_cv=2,
    outer_n_repeats=1,
)
# Id of the single workflow exercised by the cells below.
wflow_id = 1

In [None]:
# Database handle built from the project root and the database file name.
db_obj = database.Base(args.project_root, args.db_name)
# NOTE(review): presumably the table of all workflows stored in the database
# (compare the `-s` output further down); confirm. Not referenced again in the
# cells shown here.
wflow_df = db_obj.all_wflow()

In [None]:
# Workflow object for `wflow_id`, backed by the database handle.
wflow_obj = rpwf.Wflow(db_obj, wflow_id)
# Number of parallel jobs handed to scikit-learn as n_jobs further down.
n_cores = args.cores

# Tuning grid for this workflow. Later cells index it, take its len() and also
# compare it against None, so it is presumably a list of dicts or None.
# TODO confirm against RGrid.get_grid().
p_grid = rpwf.RGrid(db_obj, wflow_obj).get_grid()

# Training data: X and y are passed to cross_val_score below.
df_obj = rpwf.TrainDf(db_obj, wflow_obj)
X, y = df_obj.get_df_X(), df_obj.get_df_y()

# Estimator that is tuned and scored by the nested resampling cell.
model_type_obj = rpwf.Model(db_obj, wflow_obj)
base_learner = rpwf.BaseLearner(wflow_obj, model_type_obj).base_learner
# Used as the scikit-learn `scoring` argument and as the result column name.
# NOTE(review): relies on the private method Wflow._get_par; prefer a public
# accessor if the package offers one.
score = wflow_obj._get_par("costs")

In [None]:
# Inspect the tuning grid returned by RGrid.get_grid().
p_grid

In [None]:
# Pick the second grid entry for inspection. If p_grid is a plain list this
# binds the very same object, not a copy, so later in-place changes to the
# grid entries are visible through `test` as well.
test = p_grid[1]

In [None]:
# Show the selected grid entry.
test

In [None]:
def val_to_list(d: Dict):
    """Wrap every value of ``d`` in a one-element list and return ``d``.

    The dictionary is modified in place: the returned object is the same
    object that was passed in, not a copy. Calling the function again on the
    same dictionary nests the values one level deeper.

    Args:
        d: Mapping whose values are to be wrapped.

    Returns:
        The same dictionary ``d``, with each value ``v`` replaced by ``[v]``.
    """
    # Iterate over a snapshot of the items while the values are reassigned.
    for key, value in list(d.items()):
        d[key] = [value]
    return d

In [None]:
# Placeholder list with one empty slot per entry of the tuning grid.
wrapped_grid = [None for _ in p_grid]

In [None]:
# Fill every slot with its grid entry after wrapping the values in lists.
# val_to_list works in place, so the entries of p_grid are changed as well and
# re-running this cell wraps the values a second time.
for idx, grid_entry in enumerate(p_grid):
    wrapped_grid[idx] = val_to_list(grid_entry)

In [None]:
# Inspect the grid after every value has been wrapped in a list.
wrapped_grid

In [None]:
# Nested resampling: the inner splitter drives the hyper-parameter search, the
# outer splitter scores the tuned (or default) learner.
inner_cv = RepeatedStratifiedKFold(
    n_splits=args.inner_n_cv,
    n_repeats=args.inner_n_repeats,
    random_state=wflow_obj.random_state,
)
outer_cv = RepeatedStratifiedKFold(
    n_splits=args.outer_n_cv,
    n_repeats=args.outer_n_repeats,
    random_state=wflow_obj.random_state,
)

if p_grid is None:
    # No grid: plain cross-validation of the base learner on the outer splits.
    print("No tune grid specified, running with default params")
    nested_score = cross_val_score(
        base_learner, X=X, y=y, cv=outer_cv, n_jobs=n_cores, scoring=score
    )

else:
    print("Performing nested-cv using provided Rgrid")
    param_tuner = GridSearchCV(
        estimator=base_learner,
        # Pass the explicitly wrapped grid built in the cells above. It holds
        # the same dict objects as p_grid (val_to_list modifies them in place),
        # so the search is unchanged, but the data flow no longer depends on
        # that hidden side effect.
        param_grid=wrapped_grid,
        cv=inner_cv,
        n_jobs=n_cores,
        scoring=score,
    )
    # No `scoring` is given here, so cross_val_score uses param_tuner.score
    # (scikit-learn's documented default when scoring is None).
    nested_score = cross_val_score(param_tuner, X=X, y=y, cv=outer_cv)

In [None]:
# Export the results. The `if args.export:` guard is disabled in this notebook:
# TestArgs has no `export` field, so the export below runs unconditionally.
exporter = rpwf.Export(db_obj, wflow_obj)
# Single column named after the scoring metric, one row per entry of nested_score.
nested_score_df = pandas.DataFrame(nested_score, columns=[score])
exporter.export_cv(nested_score_df, "nested_cv")
exporter.export_db()

In [None]:
# Run the packaged nested-resampling script with `-h` (presumably prints its
# command-line help; confirm).
%run -m rpwf.script.nested_resampling -h

In [4]:
# Run the script against the project database with `-s`; in the captured output
# this prints the table of workflows stored in the database.
%run -m rpwf.script.nested_resampling $tmp_dir_posix -db db.SQLite -s

db is at C:\/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84/rpwfDb/db.SQLite
Connecting to sqlite:///C:\/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84/rpwfDb/db.SQLite
   wflow_id      model_tag   recipe_tag  \
0         1  XGBClassifier     xgb_base   
1         2  XGBClassifier      xgb_pca   
2         3        svm_rbf  scaled_base   
3         4           enet  scaled_base   
4         5        svm_rbf   scaled_pca   
5         6           enet   scaled_pca   

                                py_base_learner_args result_path  
0  {"eval_metric":"logloss","n_estimators":50,"us...        None  
1  {"eval_metric":"logloss","n_estimators":50,"us...        None  
2                  {"kernel":"rbf","cache_size":500}        None  
3  {"solver":"saga","penalty":"elasticnet","warm_...        None  
4                  {"kernel":"rbf","cache_size":500}        None  
5  {"solver":"saga","penalty":"elasticnet","warm_...        None  


In [6]:
# Run the script with `-af`; the captured output reports workflows 1-6 being run.
# NOTE(review): -c / -icv / -icr / -ocv / -ocr presumably set the core count and
# the inner/outer CV splits and repeats (cf. TestArgs); confirm with `-h`.
%run -m rpwf.script.nested_resampling $tmp_dir_posix -db db.SQLite -af -c 7 -icv 5 -icr 1 -ocv 5 -ocr 1

running [1, 2, 3, 4, 5, 6]
running wflow 1
Running <class 'xgboost.sklearn.XGBClassifier'>
Performing nested-cv using provided Rgrid


  schema = {
