In [1]:
import pandas as pd
# Opt in to pandas' future behaviour in which operations such as `replace`
# and `fillna` no longer silently downcast object-dtype results.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
import numpy as np
from sklearn.model_selection import KFold

In [3]:
from skrub import TableVectorizer
import time 

In [44]:
from benchy.kaggle import METADATA, fetch_playground_series
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score 

def calc_scores(task, y_true, y_pred):
    """Compute evaluation metrics for one set of predictions.

    Parameters
    ----------
    task : str
        Either ``"classification"`` or ``"regression"``.
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Predicted targets, aligned with ``y_true``.

    Returns
    -------
    dict
        For classification, the dictionary produced by
        ``classification_report(..., output_dict=True)``. For regression, a
        dictionary with the keys ``"mae"``, ``"mse"`` and ``"r2"``.

    Raises
    ------
    ValueError
        If ``task`` is not a supported task name. Failing loudly avoids
        writing ``None`` scores into the benchmark log.
    """
    if task == "classification":
        return classification_report(y_true, y_pred, output_dict=True)
    if task == "regression":
        return {
            "mae": mean_absolute_error(y_true, y_pred),
            "mse": mean_squared_error(y_true, y_pred),
            "r2": r2_score(y_true, y_pred)
        }
    raise ValueError(
        f"Unsupported task {task!r}; expected 'classification' or 'regression'."
    )

def get_datasets(task="classification"):
    """Yield ``(name, data)`` for every entry in ``METADATA`` matching ``task``.

    Names are parsed as ``s<season>e<episode>`` (for example ``"s3e26"``) and
    the two integers are passed to ``fetch_playground_series`` together with
    ``return_X_y=True``; whatever that call returns is yielded as ``data``.
    """
    for name, descr in METADATA.items():
        if descr['task'] != task:
            continue
        # "s3e26" -> ["s3", "26"]
        parts = name.split("e")
        season = int(parts[0].replace("s", ""))
        episode = int(parts[1])
        yield name, fetch_playground_series(season, episode, return_X_y=True)

def get_featurizers(task="classification", elaborate=False):
    """Yield ``(name, featurizer)`` pairs to benchmark.

    ``task`` and ``elaborate`` are accepted but currently unused: a single
    default-configured ``TableVectorizer`` named ``"tablevec"`` is produced.
    """
    candidates = (("tablevec", TableVectorizer()),)
    yield from candidates
    
def get_estimators(task="classification", elaborate=False):
    """Yield ``(name, estimator)`` pairs to benchmark for ``task``.

    Every estimator is a pipeline of a default ``TableVectorizer`` followed by
    a default-configured scikit-learn model.

    Parameters
    ----------
    task : str
        ``"classification"`` or ``"regression"``.
    elaborate : bool
        Accepted for interface symmetry with ``get_featurizers``; currently
        unused.

    Yields
    ------
    tuple of (str, estimator)
        Short model name and a freshly constructed, unfitted pipeline.

    Raises
    ------
    ValueError
        If ``task`` is not supported (raised when iteration starts).

    Notes
    -----
    NOTE(review): ``train`` already applies a separate featurizer before
    calling the model, so the ``TableVectorizer`` step of these pipelines runs
    on features that were vectorized once already. Confirm this is intended.
    """
    if task == "classification":
        candidates = {
            "lr": LogisticRegression,
            "hgbt": HistGradientBoostingClassifier,
            "rf": RandomForestClassifier,
            "gbt": GradientBoostingClassifier,
        }
    elif task == "regression":
        # Imported locally so the notebook's import cell does not need to
        # change; these mirror the classifier line-up above.
        from sklearn.ensemble import (
            GradientBoostingRegressor,
            HistGradientBoostingRegressor,
            RandomForestRegressor,
        )
        from sklearn.linear_model import Ridge

        candidates = {
            "ridge": Ridge,
            "hgbt": HistGradientBoostingRegressor,
            "rf": RandomForestRegressor,
            "gbt": GradientBoostingRegressor,
        }
    else:
        raise ValueError(
            f"Unsupported task {task!r}; expected 'classification' or 'regression'."
        )

    for name, estimator_cls in candidates.items():
        # Build each pipeline lazily so every consumer gets fresh instances.
        yield name, make_pipeline(TableVectorizer(), estimator_cls())

In [49]:
def task_generator(task, n_seeds=1, n_splits=5, dry_run=False):
    """Generate one benchmark job per dataset, seed, fold, featurizer and model.

    Parameters
    ----------
    task : str
        Task name forwarded to ``get_datasets``, ``get_featurizers`` and
        ``get_estimators``.
    n_seeds : int
        Number of shuffling seeds; seeds ``0 .. n_seeds - 1`` are used as the
        ``KFold`` ``random_state``.
    n_splits : int
        Number of cross-validation folds per seed.
    dry_run : bool
        If true, yield the integer ``1`` for every job instead of building
        it, so that ``sum(task_generator(..., dry_run=True))`` counts jobs.

    Yields
    ------
    dict or int
        With ``dry_run=False``, a dictionary whose keys match the keyword
        arguments of ``train``. With ``dry_run=True``, the integer ``1``.
    """
    for dataname, (X, y) in get_datasets(task):
        for random_seed in range(n_seeds):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
            for cv_id, (train_idx, test_idx) in enumerate(kf.split(X)):
                # The split depends only on the fold, so slice once here
                # rather than once per featurizer/estimator combination.
                datasets = None
                if not dry_run:
                    datasets = (X[train_idx], X[test_idx], y[train_idx], y[test_idx])
                for tfm_name, tfm in get_featurizers(task, elaborate=False):
                    for mod_name, mod in get_estimators(task, elaborate=False):
                        if dry_run:
                            yield 1
                            continue
                        yield {
                            "task": task,
                            "dataname": dataname,
                            "random_seed": random_seed,
                            "cv_id": cv_id,
                            "tfm_name": tfm_name,
                            "mod_name": mod_name,
                            "featurizer": tfm,
                            "model": mod,
                            "datasets": datasets
                        }

def train(task, dataname, random_seed, cv_id, tfm_name, mod_name, featurizer, model, datasets):
    """Fit one featurizer/model pair on one fold and collect timings and scores.

    Parameters
    ----------
    task : str
        Task name, forwarded to ``calc_scores``.
    dataname : str
        Dataset identifier, copied into the result.
    random_seed : int
        Seed of the cross-validation split, copied into the result.
    cv_id : int
        Fold index, copied into the result.
    tfm_name, mod_name : str
        Short names of the featurizer and the model; combined into
        ``estimator_name`` as ``"<mod_name>-<tfm_name>"``.
    featurizer : object
        Transformer exposing ``fit(X, y)`` and ``transform(X)``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    datasets : tuple
        ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    dict
        ``timestamp`` (integer wall-clock seconds), identification fields,
        three durations in seconds and the train/test score dictionaries.
        ``feat_time`` covers fitting the featurizer; ``train_time`` covers
        transforming the training data and fitting the model; ``infer_time``
        covers transforming the test data and predicting on it.
    """
    X_train, X_test, y_train, y_test = datasets

    # Durations use perf_counter: it is monotonic, so measured intervals
    # cannot be skewed (or turn negative) when the system clock is adjusted.

    # Sometimes featurization can take embarrassingly long, so time it separately.
    tic = time.perf_counter()
    featurizer.fit(X_train, y_train)
    feat_time = time.perf_counter() - tic

    # Train the model
    tic = time.perf_counter()
    X_feat = featurizer.transform(X_train)
    model.fit(X_feat, y_train)
    train_time = time.perf_counter() - tic

    # Keep predictions around for reporting.
    pred_train = model.predict(X_feat)

    # Gather predictions for test set
    tic = time.perf_counter()
    X_feat_test = featurizer.transform(X_test)
    pred_test = model.predict(X_feat_test)
    infer_time = time.perf_counter() - tic

    return {
        # Wall-clock time is the right clock for "when did this run finish".
        "timestamp": int(time.time()),
        "dataname": dataname,
        "cv_id": cv_id,
        "random_seed": random_seed,
        "estimator_name": f"{mod_name}-{tfm_name}",
        "feat_time": feat_time,
        "train_time": train_time,
        "infer_time": infer_time,
        "scores_train": calc_scores(task, y_train, pred_train),
        "scores_test": calc_scores(task, y_test, pred_test)
    }

In [50]:
# Dry run: every yielded item is the integer 1, so the sum is the number of
# (dataset, seed, fold, featurizer, estimator) jobs a real run would train.
sum(task_generator(task="classification", dry_run=True))

240

In [51]:
from math import sqrt
import srsly 
from joblib import Parallel, delayed

# Run the jobs on two workers and hand results back lazily instead of
# collecting them all in a list first.
parallel = Parallel(n_jobs=2, return_as="generator")

# Each job dictionary from task_generator is unpacked into train(**kwargs).
# Nothing is consumed here; work is driven by iterating output_generator.
output_generator = parallel(delayed(train)(**kwargs) for kwargs in task_generator(task="classification"))

In [52]:
# Append each result to the JSONL log as soon as it is produced, one record
# per write, rather than holding every result in memory until the end.
for item in output_generator:
    srsly.write_jsonl("output-elab.jsonl", [item], append=True, append_new_line=False)

  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np

In [59]:
# Inspect how often each "Product ID" value occurs in season 3, episode 17.
fetch_playground_series(3, 17)["Product ID"].value_counts()

Product ID,count
str,u32
"""L49184""",41
"""L56016""",43
"""L55469""",18
"""L53641""",8
"""L47285""",22
…,…
"""M20356""",11
"""L54450""",14
"""L52877""",18
"""M17947""",17


In [62]:
import polars as pl 

In [73]:
# Peek at the first logged result to see the columns and the layout of the
# nested score structs.
pl.read_ndjson("output.jsonl").head(1)

timestamp,dataname,cv_id,random_seed,estimator_name,train_time,infer_time,scores_train,scores_test
i64,str,i64,i64,str,f64,f64,struct[13],struct[13]
1712664054,"""s3e26""",0,0,"""tv-lr""",0.402707,0.322722,"{{0.779643,0.904168,0.837301,4007.0},{0.0,0.0,0.0,221.0},{0.717352,0.57395,0.637689,2096.0},0.763125,{0.498998,0.492706,0.491663,6324.0},{0.731752,0.763125,0.741882,6324.0},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null}}","{{0.742105,0.88309,0.806482,958.0},{0.0,0.0,0.0,54.0},{0.702948,0.544815,0.613861,569.0},0.731183,{0.481684,0.475968,0.473448,1581.0},{0.702666,0.731183,0.709612,1581.0},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null}}"


In [86]:
# Scratch expression: builds a struct-field selector without evaluating it
# against any frame.
# NOTE(review): "a" does not look like a field of the scores struct; this
# cell is probably leftover exploration. Confirm or remove.
pl.col("scores_train").struct.field("a")

In [97]:
# Per (dataset, estimator) averages over all logged runs, plus each
# estimator's value relative to the per-dataset maximum of that column.
df_summary = (
    pl.read_ndjson("output.jsonl")
    # Pull the two headline test metrics out of the nested report struct.
    .with_columns(
        pl.col("scores_test").struct.field("accuracy").alias("acc"),
        pl.col("scores_test").struct.field("macro avg").struct.field("f1-score").alias("macro_f1"),
    )
    .group_by(["dataname", "estimator_name"])
    .agg(pl.col("train_time", "infer_time", "acc", "macro_f1").mean())
    .sort(["dataname", "estimator_name"])
    # Ratios are taken against the largest value within the same dataset.
    .with_columns(
        (pl.col("macro_f1") / pl.col("macro_f1").max().over("dataname")).alias("perf_ratio"),
        (pl.col("train_time") / pl.col("train_time").max().over("dataname")).alias("train_ratio"),
        (pl.col("infer_time") / pl.col("infer_time").max().over("dataname")).alias("infer_ratio"),
    )
)

In [98]:
%pip import altair as alt

ModuleNotFoundError: No module named 'altair'