In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [2]:
import numpy as np
from sklearn.model_selection import KFold

In [3]:
from skrub import TableVectorizer
import time 

In [70]:
from benchy.kaggle import METADATA, fetch_playground_series
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score 
from sklearn.preprocessing import LabelEncoder

def calc_scores(task, y_true, y_pred):
    if task == "classification":
        return classification_report(y_true, y_pred, output_dict=True)
    if task == "regression":
        return {
            "mae": mean_absolute_error(y_true, y_pred),
            "mse": mean_squared_error(y_true, y_pred),
            "r2": r2_score(y_true, y_pred)
        }

def get_datasets(task="classification"):
    for name, descr in METADATA.items():
        if descr['task'] == task:
            season = int(name.split("e")[0].replace("s", ""))
            episode = int(name.split("e")[1])
            yield name, fetch_playground_series(season, episode, return_X_y=True)

def get_featurizers(task="classification", elaborate=False):
    yield "tablevec", TableVectorizer()
    
def get_estimators(task="classification", elaborate=False):
    if task == "classification":
        yield "lr", LogisticRegression()
        yield "hgbt", HistGradientBoostingClassifier()
        yield "rf", RandomForestClassifier()
        yield "gbt", GradientBoostingClassifier()
        yield "xgboost", XGBClassifier()
        yield "catboost", CatBoostClassifier()
        yield "lgbm", LGBMClassifier()

In [71]:
def task_generator(task, n_seeds=1, n_splits=5, dry_run=False):
    for dataname, (X, y) in get_datasets(task):
        for random_seed in range(n_seeds):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
            for cv_id, (train_idx, test_idx) in enumerate(kf.split(X)):
                for tfm_name, tfm in get_featurizers(task, elaborate=False):
                    for mod_name, mod in get_estimators(task, elaborate=False):
                        if dry_run:
                            yield 1
                        else:
                            X_train, X_test = X[train_idx], X[test_idx]
                            y_train, y_test = y[train_idx], y[test_idx]
                            yield {
                                "task": task,
                                "dataname": dataname,
                                "random_seed": random_seed,
                                "cv_id": cv_id,
                                "tfm_name": tfm_name,
                                "mod_name": mod_name,
                                "featurizer": tfm,
                                "model": mod,
                                "datasets": (X_train, X_test, y_train, y_test)
                            }

def train(task, dataname, random_seed, cv_id, tfm_name, mod_name, featurizer, model, datasets):
    X_train, X_test, y_train, y_test = datasets
    
    # XGBoost doesnt handle this nicely internally, so we gotta do this manually
    lab_enc = LabelEncoder()
    y_train = lab_enc.fit_transform(y_train)
    y_test = lab_enc.transform(y_test)
    
    # Sometimes featurization can take embarassingly long, inspect.
    tic = time.time()
    featurizer.fit(X_train, y_train)
    feat_time = time.time() - tic

    # Train the model
    tic = time.time()
    X_feat = featurizer.transform(X_train)
    model.fit(X_feat, y_train)
    train_time = time.time() - tic

    # Keep predictions around for reporting.
    pred_train = model.predict(X_feat)

    print(y_train, pred_train)
    # Gather predictions for test set
    tic = time.time()
    X_feat_test = featurizer.transform(X_test)
    pred_test = model.predict(X_feat_test)
    infer_time = time.time() - tic

    return {
        "timestamp": int(time.time()),
        "dataname": dataname,
        "cv_id": cv_id,
        "random_seed": random_seed,
        "estimator_name": f"{mod_name}-{tfm_name}",
        "feat_time": feat_time,
        "train_time": train_time,
        "infer_time": infer_time,
        "scores_train": calc_scores(task, y_train, pred_train),
        "scores_test": calc_scores(task, y_test, pred_test)
    }

In [72]:
sum(task_generator(task="classification", dry_run=True))

420

In [73]:
from math import sqrt
import srsly 
from joblib import Parallel, delayed

parallel = Parallel(n_jobs=2, return_as="generator")

output_generator = parallel(delayed(train)(**kwargs) for kwargs in task_generator(task="classification"))

In [None]:
for item in output_generator:
    srsly.write_jsonl("output-elaborate.jsonl", [item], append=True, append_new_line=False)

  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np

[2 0 2 ... 0 2 2] [2 0 0 ... 0 0 0]
[2 0 2 ... 0 2 2] [2 0 2 ... 0 2 2]
[2 0 2 ... 0 2 2] [2 0 2 ... 0 2 2]
Learning rate set to 0.086931
0:	learn: 1.0088458	total: 59ms	remaining: 59s
1:	learn: 0.9393293	total: 75.3ms	remaining: 37.6s
2:	learn: 0.8817560	total: 86.7ms	remaining: 28.8s
3:	learn: 0.8325919	total: 103ms	remaining: 25.6s
4:	learn: 0.7920052	total: 116ms	remaining: 23.1s
5:	learn: 0.7555721	total: 132ms	remaining: 21.8s
6:	learn: 0.7236734	total: 144ms	remaining: 20.4s
7:	learn: 0.6952497	total: 160ms	remaining: 19.8s
8:	learn: 0.6710740	total: 172ms	remaining: 19s
9:	learn: 0.6494727	total: 185ms	remaining: 18.3s
10:	learn: 0.6308615	total: 198ms	remaining: 17.8s
11:	learn: 0.6138240	total: 204ms	remaining: 16.8s
12:	learn: 0.5988273	total: 213ms	remaining: 16.1s
13:	learn: 0.5851263	total: 219ms	remaining: 15.4s
14:	learn: 0.5743742	total: 227ms	remaining: 14.9s
15:	learn: 0.5633631	total: 235ms	remaining: 14.4s
16:	learn: 0.5539018	total: 241ms	remaining: 13.9s
17:	lear

  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)


157:	learn: 0.3685803	total: 2.06s	remaining: 11s
158:	learn: 0.3680439	total: 2.08s	remaining: 11s
159:	learn: 0.3675154	total: 2.09s	remaining: 11s
160:	learn: 0.3667699	total: 2.11s	remaining: 11s
161:	learn: 0.3664774	total: 2.13s	remaining: 11s
162:	learn: 0.3660124	total: 2.14s	remaining: 11s
163:	learn: 0.3654985	total: 2.15s	remaining: 11s
164:	learn: 0.3650151	total: 2.16s	remaining: 10.9s
165:	learn: 0.3643115	total: 2.17s	remaining: 10.9s
166:	learn: 0.3637881	total: 2.19s	remaining: 10.9s
167:	learn: 0.3635521	total: 2.2s	remaining: 10.9s
168:	learn: 0.3631072	total: 2.22s	remaining: 10.9s
169:	learn: 0.3624824	total: 2.23s	remaining: 10.9s
170:	learn: 0.3619477	total: 2.24s	remaining: 10.9s
171:	learn: 0.3613563	total: 2.26s	remaining: 10.9s
172:	learn: 0.3608713	total: 2.27s	remaining: 10.9s
173:	learn: 0.3605098	total: 2.28s	remaining: 10.8s
174:	learn: 0.3598266	total: 2.3s	remaining: 10.8s
175:	learn: 0.3594930	total: 2.31s	remaining: 10.8s
176:	learn: 0.3591972	total:

  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)


In [38]:
fetch_playground_series(3, 17)["Product ID"].value_counts()

Product ID,count
str,u32
"""L51068""",17
"""M24414""",4
"""M20407""",9
"""L47296""",20
"""M20090""",7
…,…
"""L50454""",23
"""L55220""",7
"""L54992""",7
"""M18412""",5


In [39]:
import polars as pl 

In [40]:
pl.read_ndjson("output.jsonl").head(1)

timestamp,dataname,cv_id,random_seed,estimator_name,train_time,infer_time,scores_train,scores_test
i64,str,i64,i64,str,f64,f64,struct[13],struct[13]
1712664054,"""s3e26""",0,0,"""tv-lr""",0.402707,0.322722,"{{0.779643,0.904168,0.837301,4007.0},{0.0,0.0,0.0,221.0},{0.717352,0.57395,0.637689,2096.0},0.763125,{0.498998,0.492706,0.491663,6324.0},{0.731752,0.763125,0.741882,6324.0},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null}}","{{0.742105,0.88309,0.806482,958.0},{0.0,0.0,0.0,54.0},{0.702948,0.544815,0.613861,569.0},0.731183,{0.481684,0.475968,0.473448,1581.0},{0.702666,0.731183,0.709612,1581.0},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null},{null,null,null,null}}"


In [16]:
pl.col("scores_train").struct.field("a")

In [41]:
df_summary = (
    pl.read_ndjson("output-elaborate.jsonl")
        .with_columns(
            acc=pl.col("scores_test").struct.field("accuracy"),
            macro_f1=pl.col("scores_test").struct.field("macro avg").struct.field("f1-score")
        )
        .group_by("dataname", "estimator_name")
        .agg(pl.mean("train_time", "infer_time", "acc", "macro_f1"))
        .sort("dataname", "estimator_name")
        .with_columns(
            perf_ratio=pl.col("macro_f1") / pl.max("macro_f1").over("dataname"),
            train_ratio=pl.col("train_time") / pl.max("train_time").over("dataname"),
            infer_ratio=pl.col("infer_time") / pl.max("infer_time").over("dataname"),
        )
)

FileNotFoundError: No such file or directory (os error 2): output-elaborate.jsonl

In [18]:
import altair as alt

In [27]:
(alt.Chart(df_summary)
  .mark_line()
  .encode(x=alt.X('estimator_name', sort=["tv-lr", "tv-hgbt", "tv-rf", "tv-gbt"]), y=alt.Y('macro_f1'), color=alt.Color('dataname'))
  .properties(width=600, height=250)
  .interactive())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = column.replace(STR_NA_VALUES, np.nan).replace(r"^\s+$", np.nan, regex=True)
  column = colum