In [1]:
import pandas as pd
# Opt in to pandas' "future.no_silent_downcasting" option for the whole
# notebook session, so the setting is in place before any data is loaded.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
import numpy as np
from sklearn.model_selection import KFold

In [3]:
from skrub import TableVectorizer
import time 

In [10]:
from benchy.kaggle import METADATA, fetch_playground_series
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier

def get_datasets(task="classification", max_entries=5):
    """Yield ``(name, data)`` for playground-series datasets matching ``task``.

    Only the first ``max_entries`` entries of ``METADATA`` are inspected,
    whether or not they match ``task``, so fewer than ``max_entries`` datasets
    may be yielded. The default of 5 reproduces the limit that used to be
    hard-coded in the loop; pass ``None`` to scan every entry.

    Parameters
    ----------
    task : str, default="classification"
        Value compared against each metadata entry's ``"task"`` field.
    max_entries : int or None, default=5
        Number of metadata entries to look at before stopping.

    Yields
    ------
    tuple
        ``(name, data)`` where ``data`` is whatever
        ``fetch_playground_series(season, episode, return_X_y=True)`` returns.
    """
    for position, (name, descr) in enumerate(METADATA.items()):
        if max_entries is not None and position >= max_entries:
            break
        if descr["task"] != task:
            continue
        # The name is parsed as "s<season>e<episode>": the text before the
        # first "e" (minus any "s") is the season, the text after it the episode.
        parts = name.split("e")
        season = int(parts[0].replace("s", ""))
        episode = int(parts[1])
        yield name, fetch_playground_series(season, episode, return_X_y=True)

def get_estimators(task="classification", elaborate=False):
    """Yield ``(label, pipeline)`` pairs to benchmark for ``task``.

    For ``task == "classification"`` two pipelines are produced, each a
    ``TableVectorizer`` followed by a classifier with default settings:
    ``"tv-lr"`` (logistic regression) and ``"tv-gbt"`` (histogram gradient
    boosting). Any other ``task`` yields nothing.

    ``elaborate`` is accepted but currently has no effect.

    NOTE(review): the captured notebook output shows an lbfgs
    "iterations reached limit" warning, presumably from the "tv-lr"
    pipeline; consider scaling the features or raising ``max_iter``.
    """
    if task != "classification":
        return
    classifiers = (
        ("tv-lr", LogisticRegression),
        ("tv-gbt", HistGradientBoostingClassifier),
    )
    for label, classifier_cls in classifiers:
        # A fresh, unfitted pipeline is built each time one is requested.
        yield label, make_pipeline(TableVectorizer(), classifier_cls())

In [27]:
def _take_rows(data, indices):
    """Select rows of ``data`` by integer position.

    Objects exposing ``.iloc`` (pandas DataFrame/Series) are indexed through
    it: plain ``data[indices]`` on a DataFrame selects *columns* by label, and
    on a Series it selects by index label rather than position. Anything else
    (e.g. a numpy array) is indexed directly.
    """
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    return data[indices]


def task_generator(task, n_seeds=1, n_splits=5):
    """Yield one keyword-argument dict per (dataset, seed, fold, estimator).

    Each dict can be splatted into ``train(**kwargs)``.

    Parameters
    ----------
    task : str
        Forwarded to ``get_datasets`` and ``get_estimators``.
    n_seeds : int, default=1
        Number of shuffling seeds; seeds ``0 .. n_seeds - 1`` are used as the
        ``random_state`` of the K-fold splitter.
    n_splits : int, default=5
        Number of folds per seed.

    Yields
    ------
    dict
        Keys ``"dataname"``, ``"random_seed"``, ``"cv_id"``, ``"est_name"``,
        ``"estimator"`` and ``"datasets"``; the latter is the tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    for dataname, (X, y) in get_datasets(task):
        for random_seed in range(n_seeds):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
            for cv_id, (train_idx, test_idx) in enumerate(kf.split(X)):
                # The fold's data does not depend on the estimator, so slice
                # once per fold instead of once per estimator.
                datasets = (
                    _take_rows(X, train_idx),
                    _take_rows(X, test_idx),
                    _take_rows(y, train_idx),
                    _take_rows(y, test_idx),
                )
                # get_estimators is called per fold so every task receives its
                # own unfitted estimator instance.
                for est_name, est in get_estimators(task, elaborate=False):
                    yield {
                        "dataname": dataname,
                        "random_seed": random_seed,
                        "cv_id": cv_id,
                        "est_name": est_name,
                        "estimator": est,
                        "datasets": datasets,
                    }

def train(dataname, random_seed, cv_id, est_name, estimator, datasets):
    """Fit ``estimator`` on one fold and report how long fit and predict took.

    Parameters
    ----------
    dataname, random_seed, cv_id, est_name
        Identifiers copied verbatim into the result record.
    estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    datasets : tuple
        ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    dict
        Keys ``"timestamp"`` (integer wall-clock seconds when the record was
        built), ``"dataname"``, ``"cv_id"``, ``"random_seed"``,
        ``"estimator_name"``, ``"train_time"`` and ``"infer_time"`` (seconds).
        ``"infer_time"`` covers predicting on both the train and the test
        split. Predictions are only timed; no score is computed from them.
    """
    X_train, X_test, y_train, y_test = datasets

    # perf_counter is monotonic, so durations cannot be skewed (or go
    # negative) if the system clock is adjusted during a run.
    tic = time.perf_counter()
    estimator.fit(X_train, y_train)
    train_time = time.perf_counter() - tic

    tic = time.perf_counter()
    estimator.predict(X_train)
    estimator.predict(X_test)
    infer_time = time.perf_counter() - tic

    return {
        "timestamp": int(time.time()),  # wall clock is right for a timestamp
        "dataname": dataname,
        "cv_id": cv_id,
        "random_seed": random_seed,
        "estimator_name": est_name,
        "train_time": train_time,
        "infer_time": infer_time
    }

In [28]:
from math import sqrt

from joblib import Parallel, delayed

# Two workers; return_as="generator" hands results back as a generator
# instead of a fully materialised list.
parallel = Parallel(n_jobs=2, return_as="generator")

# One delayed train(...) call per task dict produced by task_generator.
output_generator = parallel(
    delayed(train)(**task_kwargs)
    for task_kwargs in task_generator(task="classification")
)

In [29]:
import srsly 

In [30]:
# Append each result record to output.jsonl as soon as it is available.
# The file is opened in append mode, so records from earlier runs are kept.
for record in output_generator:
    srsly.write_jsonl(
        "output.jsonl",
        [record],
        append=True,
        append_new_line=False,
    )

  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = colum

In [18]:
import polars as pl 

In [21]:
# Count how many result records were written per estimator.
pl.read_ndjson("output.jsonl").get_column("estimator_name").value_counts()

estimator_name,count
str,u32
"""tv-gbt""",39
"""tv-lr""",1
