In [4]:
import rpwf
from rpwf import database, rpwf
from pathlib import Path
from typing import Dict
from dataclasses import dataclass
import pandas
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    RepeatedKFold,
    GridSearchCV,
    cross_val_score
)

In [5]:
# Scratch directory holding the test database and board used below.
# NOTE(review): hardcoded absolute path (looks like an R session temp dir) —
# it must be updated whenever the fixture is regenerated.
tmp_dir = Path("/tmp/RtmpgLMhdU/file45507ef85eed")
# Use as_posix() so the value matches its name and the other path strings
# built in this notebook (forward slashes on every platform).
tmp_dir_posix = tmp_dir.as_posix()
print(tmp_dir_posix)

/tmp/RtmpgLMhdU/file45507ef85eed


In [None]:
# List the fixture directory's entries. Uses pathlib because `os` is not
# imported anywhere in this notebook; sorted so the listing is stable.
sorted(entry.name for entry in tmp_dir.iterdir())

In [None]:
@dataclass
class TestArgs:
    """Plain container for the settings consumed by the cells below.

    Presumably mirrors the arguments the rpwf scripts parse from the command
    line, so the same steps can be driven interactively — confirm against the
    scripts.
    """
    # Path handed to `database.Base`.
    db_path: str
    # Path handed to `database.Board`.
    board: str
    # Number of parallel jobs; forwarded as `n_jobs` to scikit-learn.
    cores: int
    # `n_splits` of the inner (tuning) cross-validation splitter.
    inner_n_cv: int
    # `n_repeats` of the inner (tuning) cross-validation splitter.
    inner_n_repeats: int
    # `n_splits` of the outer (evaluation) cross-validation splitter.
    outer_n_cv: int
    # `n_repeats` of the outer (evaluation) cross-validation splitter.
    outer_n_repeats: int
    # When true, the export step also exports the tuned best estimator.
    joblib_model: bool

In [None]:
# Spell every setting out by keyword so the numbers are self-describing,
# then open the database and the board those paths point at.
args = TestArgs(
    db_path=(tmp_dir / 'db.SQLite').as_posix(),
    board=(tmp_dir / 'board.yml').as_posix(),
    cores=3,
    inner_n_cv=2,
    inner_n_repeats=1,
    outer_n_cv=2,
    outer_n_repeats=1,
    joblib_model=True,
)
db_obj = database.Base(args.db_path)
board_obj = database.Board(args.board)

In [None]:
# Show what `all_wflow()` returns before any run (presumably the table of
# workflows stored in the database — confirm).
db_obj.all_wflow()

In [None]:
from numpy import ravel

# Workflow under test and the number of parallel jobs to use.
wflow_id = 2
wflow_obj = rpwf.Wflow(db_obj, board_obj, wflow_id)
n_cores = args.cores

# Tuning grid for this workflow; the cells below treat `None` as
# "no grid, run with default params".
p_grid = rpwf.RGrid(db_obj, board_obj, wflow_obj).get_grid()

# Training predictors and target.
# NOTE(review): the meaning of the `True` flag is not visible here — confirm.
df_obj = rpwf.TrainDf(db_obj, board_obj, wflow_obj)
X, y = df_obj.get_df_X(True), df_obj.get_df_y(True)

# Fail loudly instead of calling sys.exit(): inside a notebook a bare exit
# reports a clean termination, and the other validation in this notebook
# raises ValueError as well.
if y is None:
    raise ValueError("No target provided, exiting...")

# Flatten the target to 1-D before handing it to scikit-learn.
y = ravel(y)

# Estimator to evaluate, and the metric name used both as scikit-learn's
# `scoring` argument and as the column label of the exported results.
model_type_obj = rpwf.Model(db_obj, board_obj, wflow_obj)
base_learner = rpwf.BaseLearner(wflow_obj, model_type_obj).base_learner
score = wflow_obj._get_par("costs")

In [None]:
# Display the model mode; the resampling cell below expects either
# 'regression' or 'classification'.
model_type_obj._get_model_mode()

## nested_resampling.py

In [None]:
# Choose the splitter class from the model mode: stratified folds for
# classification, plain folds for regression; anything else is rejected.
model_mode = model_type_obj._get_model_mode()
if model_mode == 'classification':
    vfold_cv = RepeatedStratifiedKFold
elif model_mode == 'regression':
    vfold_cv = RepeatedKFold
else:
    raise ValueError("Either `regression` or `classification` is expected")

# Inner folds drive hyper-parameter tuning.
inner_cv = vfold_cv(
    random_state=wflow_obj.random_state,
    n_repeats=args.inner_n_repeats,
    n_splits=args.inner_n_cv,
)
# Outer folds produce the reported scores.
outer_cv = vfold_cv(
    random_state=wflow_obj.random_state,
    n_repeats=args.outer_n_repeats,
    n_splits=args.outer_n_cv,
)

In [None]:
# `inner_cv` and `outer_cv` are the splitters built in the preceding cell;
# they are reused here rather than rebuilt from a copy of the same code.
if p_grid is None:
    # No grid to search: score the untuned learner on the outer folds.
    print("No tune grid specified, running with default params")
    nested_score = cross_val_score(
        base_learner, X=X, y=y, cv=outer_cv, n_jobs=n_cores, scoring=score
    )
else:
    # Nested CV: each outer training fold runs its own grid search over the
    # inner folds, and the refitted best model is scored on the held-out
    # outer fold.
    print("Performing nested-cv using provided Rgrid")
    param_tuner = GridSearchCV(
        estimator=base_learner,
        param_grid=p_grid,
        cv=inner_cv,
        n_jobs=n_cores,
        scoring=score,
    )
    # Parallelism stays inside the grid search (no `n_jobs` on the outer
    # loop). `scoring` is passed explicitly so both branches visibly report
    # the same metric.
    nested_score = cross_val_score(
        param_tuner, X=X, y=y, cv=outer_cv, scoring=score
    )

In [None]:
# Display the outer-fold scores returned by `cross_val_score`.
nested_score

In [None]:
# Prepare the export: an exporter tagged "nested_cv" for this workflow, and
# the outer-fold scores as a one-column frame named after the metric.
exporter = rpwf.Export(db_obj, board_obj, "nested_cv", wflow_obj)
nested_score_df = pandas.DataFrame({score: nested_score})

In [None]:
# Hand the scores to the exporter, then call its database export step
# (presumably writes the result to the board and records it in the
# database — confirm against rpwf.Export).
exporter.export_cv(nested_score_df)
exporter.export_db()

In [None]:
# Read an exported result back from the board.
# NOTE(review): the pin name is hardcoded and starts with `wf_5`, while this
# notebook works on wflow_id = 2 — presumably left over from an earlier run;
# replace with the name produced by the export above.
board_obj.board.pin_read('wf_5_nested_cv_8767807281529.csv')

In [None]:
# Re-display the workflows after the export (presumably to check that the
# result was recorded — confirm).
db_obj.all_wflow()

## cross_validation.py

In [None]:
# Use the splitter class selected from the model mode earlier in the
# notebook (`vfold_cv`) instead of always stratifying: stratified folds
# need class labels and fail on a regression target.
cv = vfold_cv(
    n_splits=5,
    n_repeats=1,
    random_state=wflow_obj.random_state,
)

if p_grid is None:
    print("No tune grid specified, running with default params")
    # Nothing is tuned on this path; reset the name so a tuner left over
    # from an earlier cell is never mistaken for this run's model.
    param_tuner = None
    cv_results = cross_val_score(
        base_learner, X=X, y=y, cv=cv, n_jobs=n_cores, scoring=score
    )
else:
    # Plain (non-nested) grid search over the provided grid.
    print("Performing grid search cv using provided Rgrid")
    param_tuner = GridSearchCV(
        estimator=base_learner,
        param_grid=p_grid,
        cv=cv,
        n_jobs=n_cores,
        scoring=score
    )
    param_tuner.fit(X=X, y=y)
    # Keep only the best-ranked parameter combination(s); ties all have
    # rank 1, so more than one row may be returned.
    tuning_results = pandas.DataFrame(param_tuner.cv_results_)
    cv_results = tuning_results.loc[tuning_results['rank_test_score'] == 1]

In [None]:
# Display the cross-validation result: fold scores when no grid was given,
# otherwise the best-ranked row(s) of the grid search.
cv_results

In [None]:
# if args.export:
    # Export the results
# Export the cross-validation results under the "cv" tag.
exporter = rpwf.Export(db_obj, board_obj, "cv", wflow_obj)
exporter.export_cv(pandas.DataFrame(cv_results))
# A fitted tuner exists only when a grid was searched. Guard on the grid
# itself: checking `param_tuner` raises NameError when it was never created,
# or picks up an unfitted tuner left behind by the nested-cv cell.
if args.joblib_model and p_grid is not None:
    exporter.export_model(param_tuner.best_estimator_)
exporter.export_db()

In [None]:
# Read an exported result back from the board.
# NOTE(review): the pin name is hardcoded and starts with `wf_5`, while this
# notebook works on wflow_id = 2 — presumably left over from an earlier run;
# replace with the name produced by the export above.
board_obj.board.pin_read('wf_5_cv_8772869485692.csv')

In [None]:
# Re-display the workflows after the export (presumably to check that the
# result was recorded — confirm).
db_obj.all_wflow()

## testing the scripts

In [6]:
# Paths interpolated into the `%run` commands below as `$test_db_path` and
# `$test_board_path`.
test_db_path = (tmp_dir / 'db.SQLite').as_posix()
test_board_path = (tmp_dir / 'board.yml').as_posix()

### nested_resampling

In [None]:
# Presumably prints the script's usage text (-h) — confirm.
%run -m rpwf.script.nested_resampling $test_db_path -b $test_board_path -h

In [7]:
# With -s the script connected to the database and printed the workflow
# table (see the recorded output of this cell).
%run -m rpwf.script.nested_resampling $test_db_path -b $test_board_path -s

Connecting to sqlite:////tmp/RtmpgLMhdU/file45507ef85eed/db.SQLite
   wflow_id model_tag recipe_tag result_pin_name model_pin_name
0         1      enet       None            None           None
1         2   svm_rbf       None            None           None


In [8]:
# NOTE(review): this passes `-w 5`, but the workflow table printed by the
# `-s` run lists only wflow_id 1 and 2, and the recorded output reports an
# invalid/already-finished workflow. Presumably `-w` selects the workflow id
# and should name an existing one — confirm.
%run -m rpwf.script.nested_resampling $test_db_path -b $test_board_path -w 5 -f -c 4 -icv 5 -icr 2 -ocv 5 -ocr 2

Either invalid wflow or all requested wflow already have results


In [None]:
# Same settings as the previous run but with `-af` instead of `-w 5 -f`
# (presumably "all workflows, forced" — confirm against the script's help).
%run -m rpwf.script.nested_resampling $test_db_path -b $test_board_path -af -c 4 -icv 5 -icr 2 -ocv 5 -ocr 2

### cross_validation

In [None]:
# Presumably prints the script's usage text (-h) — confirm.
%run -m rpwf.script.cross_validation $test_db_path -b $test_board_path -h

In [None]:
# Presumably lists the workflows, like the nested_resampling `-s` run
# above — confirm.
%run -m rpwf.script.cross_validation $test_db_path -b $test_board_path -s

In [None]:
# NOTE(review): this passes `-w 5`, but the workflow table printed earlier
# lists only wflow_id 1 and 2; presumably `-w` selects the workflow id and
# should name an existing one — confirm.
%run -m rpwf.script.cross_validation $test_db_path -b $test_board_path -w 5 -f -c 4 -ns 5 -nr 1