In [4]:
import rpwf
from rpwf import database, rpwf
from pathlib import Path
from dataclasses import dataclass
import pandas
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    GridSearchCV,
    StratifiedKFold,
    cross_val_score
)

In [5]:
# Project root used by the rest of this notebook.
# NOTE(review): machine-specific absolute path (looks like an R session temp
# directory) -- it has to be edited before running on another machine.
tmp_dir = Path(
    "C:\\Users\\hp458\\AppData\\Local\\Temp\\1\\Rtmp0kP0ze\\vignetteab4c80a84"
)
# Forward-slash spelling of the same path; `as_posix()` already returns a `str`.
tmp_dir_posix = tmp_dir.as_posix()
print(tmp_dir_posix)

C:/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84


In [None]:
@dataclass
class TestArgs:
    """Plain container for the run settings the notebook cells read as `args.<field>`.

    NOTE(review): presumably mirrors the command-line options of
    `rpwf.script.nested_resampling` so the script logic can be stepped through
    interactively -- confirm against the script.
    """

    # Name of the database file (e.g. "db.SQLite"); passed to `database.Base`.
    db_name: str
    # Project directory; passed to `database.Base` together with `db_name`.
    project_root: str
    # Used as `n_jobs` for the scikit-learn cross-validation calls.
    cores: int
    # `n_splits` of the inner `StratifiedKFold` (hyper-parameter tuning).
    inner_n_cv: int
    # `n_splits` of the outer `RepeatedStratifiedKFold`.
    outer_n_cv: int
    # `n_repeats` of the outer `RepeatedStratifiedKFold`.
    outer_n_repeats: int

In [None]:
# Scratch settings for stepping through one workflow by hand.
# Keyword arguments keep every value tied to its field name (the original
# positional call passed four bare `1`s in a row).
# Both fold counts are 2 because scikit-learn's k-fold splitters reject
# n_splits < 2; the previous value of 1 made the cross-validation cell raise.
args = TestArgs(
    db_name="db.SQLite",
    project_root=tmp_dir_posix,
    cores=1,
    inner_n_cv=2,
    outer_n_cv=2,
    outer_n_repeats=1,
)
# Id of the workflow to run; passed to `rpwf.Wflow` in a later cell.
wflow_id = 1

In [None]:
# Database handle built from the project root and the database file name;
# every rpwf object created below receives it.
db_obj = database.Base(args.project_root, args.db_name)
# NOTE(review): presumably the same workflow table the script prints with
# `-s`; it is not used again in the cells visible here -- confirm.
wflow_df = db_obj.all_wflow()

In [None]:
# Workflow object for the selected `wflow_id`; the objects below are all
# built from it.
wflow_obj = rpwf.Wflow(db_obj, wflow_id)
# Passed as `n_jobs` to the scikit-learn cross-validation calls.
n_cores = args.cores

# Generate the parameters
# `p_grid` may be None; the cross-validation cell then runs the learner
# without tuning.
p_grid = rpwf.RGrid(db_obj, wflow_obj).get_grid()

# Training data: X and y are handed to `cross_val_score` as features/target.
df_obj = rpwf.TrainDf(db_obj, wflow_obj)
X, y = df_obj.get_df_X(), df_obj.get_df_y()

# Estimator that is cross-validated (and tuned when a grid is available).
model_type_obj = rpwf.Model(db_obj, wflow_obj)
base_learner = rpwf.BaseLearner(wflow_obj, model_type_obj).base_learner
# The workflow's "costs" parameter: used as the scikit-learn `scoring`
# argument and as the column name of the exported score table.
# NOTE(review): `_get_par` is a private method of the workflow object.
score = wflow_obj._get_par("costs")

In [None]:
# Nested resampling: the inner CV is used by GridSearchCV to pick
# hyper-parameters, the outer CV scores the resulting (tuned) learner.
inner_cv = StratifiedKFold(
    n_splits=args.inner_n_cv, shuffle=True, random_state=wflow_obj.random_state
)
outer_cv = RepeatedStratifiedKFold(
    n_splits=args.outer_n_cv,
    n_repeats=args.outer_n_repeats,
    random_state=wflow_obj.random_state,
)


def _wrap_scalar_grid_values(grid):
    """Return `grid` with every scalar parameter value wrapped in a list.

    GridSearchCV requires each parameter's candidates to be a list or numpy
    array and raises ValueError for a bare scalar ("Single values need to be
    wrapped in a list with one element").  A grid holding a single candidate
    for a parameter can arrive as a bare float or string, so those are
    wrapped here.

    Args:
        grid: a mapping of parameter name -> candidate values, or an
            iterable of such mappings.

    Returns:
        A dict (or list of dicts) with the same keys; list-like values are
        passed through untouched, anything else becomes a one-element list.
    """
    if hasattr(grid, "items"):
        return {
            name: values if pandas.api.types.is_list_like(values) else [values]
            for name, values in grid.items()
        }
    # Not a mapping: treat it as a sequence of parameter grids.
    return [_wrap_scalar_grid_values(sub_grid) for sub_grid in grid]


if p_grid is None:
    print("No tune grid specified, running with default params")
    nested_score = cross_val_score(
        base_learner, X=X, y=y, cv=outer_cv, n_jobs=n_cores, scoring=score
    )

else:
    print("Performing nested-cv using provided Rgrid")
    param_tuner = GridSearchCV(
        estimator=base_learner,
        param_grid=_wrap_scalar_grid_values(p_grid),
        cv=inner_cv,
        n_jobs=n_cores,
        scoring=score,
    )
    # The outer loop is left sequential (parallelism lives in the inner
    # search).  `scoring` is passed explicitly so both branches state the
    # metric the same way; a refitted GridSearchCV already scores with it.
    nested_score = cross_val_score(
        param_tuner, X=X, y=y, cv=outer_cv, scoring=score
    )

# Export the outer-fold scores: one row per outer split, in a single column
# named after the metric.
exporter = rpwf.Export(db_obj, wflow_obj)
nested_score_df = pandas.DataFrame(nested_score, columns=[score])
exporter.export_cv(nested_score_df, "nested_cv")
exporter.export_db()

In [1]:
# Show the command-line help of the nested-resampling script.
%run -m rpwf.script.nested_resampling -h

usage: nested_resampling.py [-h] [-db db-name]
                            (-s | -a | -w wflow-id [wflow-id ...]) [-f]
                            [-c cores] [-icv inner-n-cv]
                            [-icr inner-n-repeats] [-ocv outer-n-cv]
                            [-ocr outer-n-repeats]
                            project_root

run nested_cv of the provided wflow_ids

positional arguments:
  project_root          path to directory that holds the 'rpwfDb' folder

optional arguments:
  -h, --help            show this help message and exit
  -db db-name, --db-name db-name
                        name of the database, (e.g. 'db.SQLite')
  -s, --show-wflow      show list of current workflows
  -a, --all-id          run all wflows in the db
  -w wflow-id [wflow-id ...], --wflow-id wflow-id [wflow-id ...]
                        input list of wflows to run
  -f, --force           force runnning of wflows
  -c cores, --cores cores
                        number of cores for parallizati

In [7]:
# List the workflows stored in the project database (`-s` = show workflows);
# `$tmp_dir_posix` is expanded by IPython to the project root defined above.
%run -m rpwf.script.nested_resampling $tmp_dir_posix -db db.SQLite -s

db is at C:\/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84/rpwfDb/db.SQLite
Connecting to sqlite:///C:\/Users/hp458/AppData/Local/Temp/1/Rtmp0kP0ze/vignetteab4c80a84/rpwfDb/db.SQLite
   wflow_id      model_tag   recipe_tag  \
0         1  XGBClassifier     xgb_base   
1         2  XGBClassifier      xgb_pca   
2         3        svm_rbf  scaled_base   
3         4           enet  scaled_base   
4         5        svm_rbf   scaled_pca   
5         6           enet   scaled_pca   

                                py_base_learner_args result_path  
0  {"eval_metric":"logloss","n_estimators":50,"us...        None  
1  {"eval_metric":"logloss","n_estimators":50,"us...        None  
2                  {"kernel":"rbf","cache_size":500}        None  
3  {"solver":"saga","penalty":"elasticnet","warm_...        None  
4                  {"kernel":"rbf","cache_size":500}        None  
5  {"solver":"saga","penalty":"elasticnet","warm_...        None  


In [9]:
# Run nested CV for every workflow in the database (`-a`) on 7 cores, with
# 5 inner folds x 1 repeat and 5 outer folds x 5 repeats.
# NOTE(review): the recorded output of this cell stops at wflow 1 with
# "ValueError: Parameter grid for parameter (colsample_bytree) needs to be a
# list or numpy array" -- the stored grid holds a bare float for that
# parameter, so the run did not complete.
%run -m rpwf.script.nested_resampling $tmp_dir_posix -db db.SQLite -a -c 7 -icv 5 -icr 1 -ocv 5 -ocr 5

running [1, 2, 3, 4, 5, 6]
running wflow 1
Running <class 'xgboost.sklearn.XGBClassifier'>
Performing nested-cv using provided Rgrid


  schema = {


ValueError: Parameter grid for parameter (colsample_bytree) needs to be a list or numpy array, but got (<class 'float'>). Single values need to be wrapped in a list with one element.

  warn('Unknown failure executing module: <%s>' % mod_name)
