# Model performance

In [1]:
import numpy as np
import shap
import pickle
from datasets import load_dataset
from src.dataset_info import get_dataset_info
from transformers import pipeline, AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset
import os
from tqdm import tqdm
from src.utils import token_segments, text_ft_index_ends

# from src.models import Model
import lightgbm as lgb
from src.models import WeightedEnsemble, StackModel, AllAsTextModel
from src.joint_masker import JointMasker
import argparse
import scipy as sp
from sklearn.metrics import roc_auc_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def run_model(model_type, ds_type, test_set_size=100, tab_scale_factor=2):
    di = get_dataset_info(ds_type, model_type)
    # Data
    train_df = load_dataset(
        di.ds_name, split="train", download_mode="force_redownload"
    ).to_pandas()
    y_train = train_df[di.label_col]

    test_df = load_dataset(
        di.ds_name, split="test", download_mode="force_redownload"
    ).to_pandas()
    test_df_sample = test_df.sample(test_set_size, random_state=55)

    # Models
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    if model_type == "all_text":
        text_pipeline = pipeline(
            "text-classification",
            # model=di.text_model_name,
            model="../models/wine/glowing-morning-9/checkpoint-6705",
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert all columns to a single string

        def cols_to_str_fn(array):
            return " | ".join(
                [f"{col}: {val}" for col, val in zip(di.tab_cols + di.text_cols, array)]
            )

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
            # cols=di.tab_cols + di.text_cols
        )
    else:
        text_pipeline = pipeline(
            "text-classification",
            model=di.text_model_name,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert the text columns to a single string
        if len(di.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    [f"{col}: {val}" for col, val in zip(di.text_cols, array)]
                )

        # LightGBM requires explicitly marking categorical features
        train_df[di.categorical_cols] = train_df[di.categorical_cols].astype("category")
        test_df_sample[di.categorical_cols] = test_df_sample[
            di.categorical_cols
        ].astype("category")

        tab_model = lgb.LGBMClassifier(random_state=42)
        tab_model.fit(train_df[di.tab_cols], y_train)

        if model_type == "ensemble_50":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.5,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_75":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.75,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_25":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.25,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "stack":
            """
            For the stack model, we make predictions on the validation set. These predictions
            are then used as features for the stack model (another LightGBM model) along with
            the other tabular features. In doing so the stack model learns, depending on the
            tabular features, when to trust the tabular model and when to trust the text model.
            """
            val_df = load_dataset(
                di.ds_name, split="validation", download_mode="force_redownload"
            ).to_pandas()
            val_df[di.categorical_cols] = val_df[di.categorical_cols].astype("category")
            y_val = val_df[di.label_col]
            val_text = list(map(cols_to_str_fn, val_df[di.text_cols].values))

            # Training set is the preditions from the tabular and text models on the validation set
            # plus the tabular features from the validation set
            text_val_preds = text_pipeline(val_text)
            # text_val_preds = np.array(
            #     [format_text_pred(pred) for pred in text_val_preds]
            # )
            text_val_preds = np.array(
                [[lab["score"] for lab in pred] for pred in text_val_preds]
            )

            # add text and tabular predictions to the val_df
            stack_val_df = val_df[di.tab_cols]
            tab_val_preds = tab_model.predict_proba(stack_val_df)
            for i in range(text_val_preds.shape[1]):
                stack_val_df[f"text_pred_{i}"] = text_val_preds[:, i]
            for i in range(tab_val_preds.shape[1]):
                stack_val_df[f"tab_pred_{i}"] = tab_val_preds[:, i]

            stack_model = lgb.LGBMClassifier(random_state=42)
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
            )
        else:
            raise ValueError(f"Invalid model type of {model_type}")

    np.random.seed(1)
    test_sample_vals = test_df_sample[di.tab_cols + di.text_cols].values
    test_vals = test_df[di.tab_cols + di.text_cols].values

    return (
        model.predict(test_sample_vals),
        test_df_sample[di.label_col].values,
        model.predict(test_vals),
        test_df[di.label_col].values,
    )

## Fake Job Postings (ROC-AUC)

In [3]:
ds_type = "fake"

for model_type in ["ensemble_25", "ensemble_50", "ensemble_75", "all_text", "stack"]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset fake, ordinal version


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 1.07MB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 40.6MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 63.5MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 28.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.41it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2944.06it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 751kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 40.7MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 65.3MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 28.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.40it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2437.60it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 73752.49it/s]
3182it [00:00, 54308.79it/s]



ensemble_25 on fake
Test accuaracy: 0.9387177875549969
Test accuaracy (sample): 0.93
Test AUC: 0.8744072444723762
Test AUC (sample): 0.808421052631579
Test % by label: [0.9566310496543055, 0.04336895034569453]
Test % by label (sample): [0.95, 0.05]
Using dataset fake, ordinal version


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 174kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 39.9MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 69.6MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 24.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2444.23it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 275kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 42.4MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 69.6MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 26.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.58it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2691.53it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 97769.32it/s]
3182it [00:00, 53804.13it/s]



ensemble_50 on fake
Test accuaracy: 0.9619736015084852
Test accuaracy (sample): 0.96
Test AUC: 0.9189305642842179
Test AUC (sample): 0.9136842105263159
Test % by label: [0.9566310496543055, 0.04336895034569453]
Test % by label (sample): [0.95, 0.05]
Using dataset fake, ordinal version


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 295kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 40.1MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 67.2MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2946.13it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 276kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 42.3MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 62.5MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.50it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2782.60it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 79845.88it/s]
3182it [00:00, 55415.76it/s]



ensemble_75 on fake
Test accuaracy: 0.9783155248271528
Test accuaracy (sample): 0.99
Test AUC: 0.9337994439048545
Test AUC (sample): 0.9515789473684211
Test % by label: [0.9566310496543055, 0.04336895034569453]
Test % by label (sample): [0.95, 0.05]
Using dataset fake, all as text version


Downloading readme: 100%|██████████| 709/709 [00:00<00:00, 535kB/s]


Downloading and preparing dataset None/None (download: 11.55 MiB, generated: 20.58 MiB, post-processed: Unknown size, total: 32.13 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 4.97MB/s]
Downloading data: 100%|██████████| 8.24M/8.24M [00:00<00:00, 13.0MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:01<00:00, 801kB/s] 
Downloading data files: 100%|██████████| 3/3 [00:06<00:00,  2.29s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2188.71it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 709/709 [00:00<00:00, 306kB/s]


Downloading and preparing dataset None/None (download: 11.55 MiB, generated: 20.58 MiB, post-processed: Unknown size, total: 32.13 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 35.7MB/s]
Downloading data: 100%|██████████| 8.24M/8.24M [00:00<00:00, 66.2MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2583.76it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on fake
Test accuaracy: 0.12507856693903205
Test accuaracy (sample): 0.1
Test AUC: 0.5576329772039079
Test AUC (sample): 0.5157894736842106
Test % by label: [0.9566310496543055, 0.04336895034569453]
Test % by label (sample): [0.95, 0.05]
Using dataset fake, ordinal version


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 249kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 37.6MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 67.2MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 26.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:33<00:00, 11.14s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2765.48it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 235kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 39.7MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 65.8MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2757.60it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 325kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 36.9MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 65.7MB/s]
Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 28.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2585.35it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 81127.74it/s]
3182it [00:00, 54071.38it/s]



stack on fake
Test accuaracy: 0.943117536140792
Test accuaracy (sample): 0.93
Test AUC: 0.9087501666380999
Test AUC (sample): 0.9557894736842105
Test % by label: [0.9566310496543055, 0.04336895034569453]
Test % by label (sample): [0.95, 0.05]


## IMDB  (ROC-AUC)

In [4]:
ds_type = "imdb_genre"

for model_type in ["ensemble_25", "ensemble_50", "ensemble_75", "all_text", "stack"]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 501kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 6.86MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.30MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 35.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2741.38it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 324kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.21MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.76MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 40.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.62it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3126.19it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 785450.19it/s]
200it [00:00, 502011.25it/s]



ensemble_25 on imdb_genre
Test accuaracy: 0.78
Test accuaracy (sample): 0.79
Test AUC: 0.8530677609848865
Test AUC (sample): 0.8262626262626263
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 337kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 5.32MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.29MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 42.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3185.55it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 374kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.88MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.06MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 40.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.70it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2476.46it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 788403.01it/s]
200it [00:00, 976555.06it/s]



ensemble_50 on imdb_genre
Test accuaracy: 0.785
Test accuaracy (sample): 0.79
Test AUC: 0.8646782103893504
Test AUC (sample): 0.8513131313131314
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 279kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.25MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.16MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 38.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3136.32it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 1.42MB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.87MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.18MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 39.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.78it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2790.00it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 733269.93it/s]
200it [00:00, 995089.92it/s]



ensemble_75 on imdb_genre
Test accuaracy: 0.775
Test accuaracy (sample): 0.75
Test AUC: 0.8520668601741568
Test AUC (sample): 0.8387878787878787
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, all as text version


Downloading readme: 100%|██████████| 906/906 [00:00<00:00, 297kB/s]


Downloading and preparing dataset None/None (download: 225.28 KiB, generated: 320.45 KiB, post-processed: Unknown size, total: 545.73 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.0k/51.0k [00:00<00:00, 7.45MB/s]
Downloading data: 100%|██████████| 145k/145k [00:00<00:00, 7.04MB/s]]
Downloading data: 100%|██████████| 34.4k/34.4k [00:00<00:00, 40.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3201.76it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 906/906 [00:00<00:00, 387kB/s]


Downloading and preparing dataset None/None (download: 225.28 KiB, generated: 320.45 KiB, post-processed: Unknown size, total: 545.73 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.0k/51.0k [00:00<00:00, 7.01MB/s]
Downloading data: 100%|██████████| 145k/145k [00:00<00:00, 7.75MB/s]]
Downloading data: 100%|██████████| 34.4k/34.4k [00:00<00:00, 39.5MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2916.76it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on imdb_genre
Test accuaracy: 0.03
Test accuaracy (sample): 0.01
Test AUC: 0.5008507656891202
Test AUC (sample): 0.46626262626262627
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 323kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.01MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.91MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 40.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2901.96it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 325kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 6.89MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.82MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 36.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.55it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3337.64it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 436kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 5.45MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.17MB/s]]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 38.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.70it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2961.38it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/


stack on imdb_genre
Test accuaracy: 0.74
Test accuaracy (sample): 0.73
Test AUC: 0.8106295666099489
Test AUC (sample): 0.7846464646464647
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]


## Kickstarter  (ROC-AUC)

In [5]:
ds_type = "kick"

for model_type in ["ensemble_25", "ensemble_50", "ensemble_75", "all_text", "stack"]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset kick, ordinal version


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 341kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 43.9MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 74.6MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 37.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.40it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2825.08it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 253kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 46.9MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 76.1MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 36.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2407.29it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 379575.02it/s]
21626it [00:00, 334115.53it/s]



ensemble_25 on kick
Test accuaracy: 0.6999445112364746
Test accuaracy (sample): 0.66
Test AUC: 0.7410626481173135
Test AUC (sample): 0.7507507507507508
Test % by label: [0.680384722093776, 0.319615277906224]
Test % by label (sample): [0.63, 0.37]
Using dataset kick, ordinal version


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 389kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 44.5MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 77.6MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 37.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2418.86it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 280kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 42.6MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 74.7MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 36.5MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2697.30it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 384798.53it/s]
21626it [00:00, 340815.57it/s]



ensemble_50 on kick
Test accuaracy: 0.7136779802090076
Test accuaracy (sample): 0.67
Test AUC: 0.7735201424600657
Test AUC (sample): 0.7691977691977692
Test % by label: [0.680384722093776, 0.319615277906224]
Test % by label (sample): [0.63, 0.37]
Using dataset kick, ordinal version


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 310kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 42.3MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 75.8MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 35.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2402.69it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 377kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 43.2MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 79.6MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 36.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.09it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2652.94it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 442904.33it/s]
21626it [00:00, 346552.73it/s]



ensemble_75 on kick
Test accuaracy: 0.7177471562008694
Test accuaracy (sample): 0.69
Test AUC: 0.7679046143380706
Test AUC (sample): 0.7301587301587301
Test % by label: [0.680384722093776, 0.319615277906224]
Test % by label (sample): [0.63, 0.37]
Using dataset kick, all as text version


Downloading readme: 100%|██████████| 840/840 [00:00<00:00, 376kB/s]


Downloading and preparing dataset None/None (download: 18.14 MiB, generated: 30.71 MiB, post-processed: Unknown size, total: 48.85 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.81M/3.81M [00:00<00:00, 44.8MB/s]
Downloading data: 100%|██████████| 12.9M/12.9M [00:00<00:00, 77.5MB/s]
Downloading data: 100%|██████████| 2.29M/2.29M [00:00<00:00, 37.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2503.56it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 840/840 [00:00<00:00, 270kB/s]


Downloading and preparing dataset None/None (download: 18.14 MiB, generated: 30.71 MiB, post-processed: Unknown size, total: 48.85 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.81M/3.81M [00:00<00:00, 45.3MB/s]
Downloading data: 100%|██████████| 12.9M/12.9M [00:00<00:00, 79.2MB/s]
Downloading data: 100%|██████████| 2.29M/2.29M [00:00<00:00, 38.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.50it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2643.47it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on kick
Test accuaracy: 0.0676500508646999
Test accuaracy (sample): 0.05
Test AUC: 0.5144450465889125
Test AUC (sample): 0.5066495066495067
Test % by label: [0.680384722093776, 0.319615277906224]
Test % by label (sample): [0.63, 0.37]
Using dataset kick, ordinal version


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 1.06MB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 43.7MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 73.5MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 39.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.14it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2955.12it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 266kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 43.2MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 76.5MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 38.9MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.36it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2554.39it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 1.40MB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 43.6MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 80.4MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 36.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.30it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2755.18it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 356658.50it/s]
21626it [00:00, 337129.42it/s]



stack on kick
Test accuaracy: 0.7072967724035882
Test accuaracy (sample): 0.65
Test AUC: 0.7491010899483486
Test AUC (sample): 0.7211497211497212
Test % by label: [0.680384722093776, 0.319615277906224]
Test % by label (sample): [0.63, 0.37]


## Jigsaw (ROC-AUC)

In [6]:
ds_type = "jigsaw"

for model_type in ["ensemble_25", "ensemble_50", "ensemble_75", "all_text", "stack"]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 803kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 61.6MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 90.5MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2733.63it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 573kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 58.5MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 86.8MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 44.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2793.72it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 440578.15it/s]
25000it [00:00, 181158.47it/s]



ensemble_25 on jigsaw
Test accuaracy: 0.94256
Test accuaracy (sample): 0.95
Test AUC: 0.9321638495642937
Test AUC (sample): 0.8378947368421054
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 767kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 63.4MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 84.6MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.24it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2777.07it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 768kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 57.3MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 87.4MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.16it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2936.50it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 440115.84it/s]
25000it [00:00, 469621.69it/s]



ensemble_50 on jigsaw
Test accuaracy: 0.9448
Test accuaracy (sample): 0.95
Test AUC: 0.9456286008527085
Test AUC (sample): 0.8631578947368422
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 737kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 58.7MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 87.3MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.5MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2961.38it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 723kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 56.8MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 85.9MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.24it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2653.50it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 447631.16it/s]
25000it [00:00, 181678.88it/s]



ensemble_75 on jigsaw
Test accuaracy: 0.9608
Test accuaracy (sample): 0.96
Test AUC: 0.9511750950554884
Test AUC (sample): 0.9515789473684211
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, all as text version


Downloading readme: 100%|██████████| 1.76k/1.76k [00:00<00:00, 762kB/s]


Downloading and preparing dataset None/None (download: 27.75 MiB, generated: 61.00 MiB, post-processed: Unknown size, total: 88.75 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.83M/5.83M [00:00<00:00, 61.7MB/s]
Downloading data: 100%|██████████| 19.8M/19.8M [00:00<00:00, 85.3MB/s]
Downloading data: 100%|██████████| 3.49M/3.49M [00:00<00:00, 38.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2887.31it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.76k/1.76k [00:00<00:00, 598kB/s]


Downloading and preparing dataset None/None (download: 27.75 MiB, generated: 61.00 MiB, post-processed: Unknown size, total: 88.75 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.83M/5.83M [00:00<00:00, 62.4MB/s]
Downloading data: 100%|██████████| 19.8M/19.8M [00:00<00:00, 85.4MB/s]
Downloading data: 100%|██████████| 3.49M/3.49M [00:00<00:00, 41.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.27it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3007.39it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on jigsaw
Test accuaracy: 0.00184
Test accuaracy (sample): 0.0
Test AUC: 0.5108651509539557
Test AUC (sample): 0.6252631578947369
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 4.12MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 65.0MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 89.9MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3063.77it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 634kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 63.6MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 92.3MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.09it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3044.50it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 831kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 64.8MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 85.7MB/s]
Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 44.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2938.56it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 451485.90it/s]
25000it [00:00, 184091.18it/s]



stack on jigsaw
Test accuaracy: 0.93208
Test accuaracy (sample): 0.95
Test AUC: 0.9158202495728693
Test AUC (sample): 0.9515789473684211
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]


## Product Sentiment

In [7]:
ds_type = "prod_sent"

for model_type in ["ensemble_25", "ensemble_50", "ensemble_75", "all_text", "stack"]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    # print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    # print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 268kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.44MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.63MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.37MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.10it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2489.69it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 266kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.47MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.59MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.61MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3010.99it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 884874.26it/s]
1273it [00:00, 1164271.48it/s]



ensemble_25 on prod_sent
Test accuaracy: 0.8908091123330715
Test accuaracy (sample): 0.95
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 848kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.57MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 10.8MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.40MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2511.06it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 269kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.35MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 10.3MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.61MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3006.67it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 953250.91it/s]
1273it [00:00, 1418530.55it/s]



ensemble_50 on prod_sent
Test accuaracy: 0.8695993715632364
Test accuaracy (sample): 0.9
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 263kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.47MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.79MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.81MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.37it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2860.40it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 247kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 7.30MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.59MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.39MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.49it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3334.99it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 873813.33it/s]
1273it [00:00, 994847.96it/s]



ensemble_75 on prod_sent
Test accuaracy: 0.6614296936370778
Test accuaracy (sample): 0.64
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, all as text version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 271kB/s]


Downloading and preparing dataset None/None (download: 430.99 KiB, generated: 758.28 KiB, post-processed: Unknown size, total: 1.16 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.4k/90.4k [00:00<00:00, 5.87MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.67MB/s]]
Downloading data: 100%|██████████| 54.1k/54.1k [00:00<00:00, 5.67MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.28it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2648.48it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 249kB/s]


Downloading and preparing dataset None/None (download: 430.99 KiB, generated: 758.28 KiB, post-processed: Unknown size, total: 1.16 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.4k/90.4k [00:00<00:00, 5.93MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.58MB/s]]
Downloading data: 100%|██████████| 54.1k/54.1k [00:00<00:00, 5.58MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2466.75it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on prod_sent
Test accuaracy: 0.002356637863315004
Test accuaracy (sample): 0.0
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 236kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.58MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.67MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.82MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2697.88it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 262kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 6.05MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.70MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.71MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2605.70it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 287kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 6.04MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.62MB/s]]
Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.85MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.37it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2957.21it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 940426.91it/s]
1273it [00:00, 1408427.59it/s]



stack on prod_sent
Test accuaracy: 0.6496465043205027
Test accuaracy (sample): 0.68
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]


## Wine

def run_model(model_type, ds_type, test_set_size=100, tab_scale_factor=2):
    di = get_dataset_info(ds_type, model_type)
    # Data
    train_df = load_dataset(
        di.ds_name,
        split="train",  # download_mode="force_redownload"
    ).to_pandas()
    y_train = train_df[di.label_col]

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df_sample = test_df.sample(test_set_size, random_state=55)

    # Models
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    if model_type == "all_text":
        text_pipeline = pipeline(
            "text-classification",
            # model=di.text_model_name,
            model="../models/wine/glowing-morning-9/checkpoint-6705",
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert all columns to a single string

        def cols_to_str_fn(array):
            return " | ".join(
                [f"{col}: {val}" for col, val in zip(di.tab_cols + di.text_cols, array)]
            )

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
            # cols=di.tab_cols + di.text_cols
        )
    else:
        text_pipeline = pipeline(
            "text-classification",
            model=di.text_model_name,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert the text columns to a single string
        if len(di.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    [f"{col}: {val}" for col, val in zip(di.text_cols, array)]
                )

        # LightGBM requires explicitly marking categorical features
        train_df[di.categorical_cols] = train_df[di.categorical_cols].astype("category")
        test_df_sample[di.categorical_cols] = test_df_sample[
            di.categorical_cols
        ].astype("category")

        tab_model = lgb.LGBMClassifier(random_state=42)
        tab_model.fit(train_df[di.tab_cols], y_train)

        if model_type == "ensemble_50":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.5,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_75":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.75,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_25":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.25,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "stack":
            """
            For the stack model, we make predictions on the validation set. These predictions
            are then used as features for the stack model (another LightGBM model) along with
            the other tabular features. In doing so the stack model learns, depending on the
            tabular features, when to trust the tabular model and when to trust the text model.
            """
            val_df = load_dataset(
                di.ds_name, split="validation", download_mode="force_redownload"
            ).to_pandas()
            val_df[di.categorical_cols] = val_df[di.categorical_cols].astype("category")
            y_val = val_df[di.label_col]
            val_text = list(map(cols_to_str_fn, val_df[di.text_cols].values))

            # Training set is the preditions from the tabular and text models on the validation set
            # plus the tabular features from the validation set
            text_val_preds = text_pipeline(val_text)
            # text_val_preds = np.array(
            #     [format_text_pred(pred) for pred in text_val_preds]
            # )
            text_val_preds = np.array(
                [[lab["score"] for lab in pred] for pred in text_val_preds]
            )

            # add text and tabular predictions to the val_df
            stack_val_df = val_df[di.tab_cols]
            tab_val_preds = tab_model.predict_proba(stack_val_df)
            for i in range(text_val_preds.shape[1]):
                stack_val_df[f"text_pred_{i}"] = text_val_preds[:, i]
            for i in range(tab_val_preds.shape[1]):
                stack_val_df[f"tab_pred_{i}"] = tab_val_preds[:, i]

            stack_model = lgb.LGBMClassifier(random_state=42)
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
            )
        else:
            raise ValueError(f"Invalid model type of {model_type}")

    np.random.seed(1)
    test_sample_vals = test_df_sample[di.tab_cols + di.text_cols].values
    test_vals = test_df[di.tab_cols + di.text_cols].iloc[:10].values
    # train_sample = train_df.sample(1000, random_state=55)
    # train_sample_y = train_sample[di.label_col]
    # train_sample_vals = train_sample[di.tab_cols + di.text_cols].values
    # return model.predict(train_sample_vals), train_sample_y
    return model.predict(test_vals), test_df[di.label_col].iloc[:10].values
    return (
        model.predict(test_sample_vals),
        test_df_sample[di.label_col].values,
        model.predict(test_vals),
        test_df[di.label_col].values,
    )

def run_model(model_type, ds_type, test_set_size=100, tab_scale_factor=2):
    di = get_dataset_info(ds_type, model_type)
    # Data
    train_df = load_dataset(
        di.ds_name, split="train", download_mode="force_redownload"
    ).to_pandas()
    y_train = train_df[di.label_col]

    test_df = load_dataset(
        di.ds_name, split="test", download_mode="force_redownload"
    ).to_pandas()
    test_df_sample = test_df.sample(test_set_size, random_state=55)

    # Models
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    if model_type == "all_text":
        text_pipeline = pipeline(
            "text-classification",
            # model=di.text_model_name,
            model="../models/wine/glowing-morning-9/checkpoint-6705",
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert all columns to a single string

        def cols_to_str_fn(array):
            return " | ".join(
                [f"{col}: {val}" for col, val in zip(
                    di.tab_cols + di.text_cols, array)]
            )

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
            # cols=di.tab_cols + di.text_cols
        )
    else:
        text_pipeline = pipeline(
            "text-classification",
            model=di.text_model_name,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert the text columns to a single string
        if len(di.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    [f"{col}: {val}" for col, val in zip(di.text_cols, array)]
                )

        # LightGBM requires explicitly marking categorical features
        train_df[di.categorical_cols] = train_df[di.categorical_cols].astype(
            "category")
        test_df_sample[di.categorical_cols] = test_df_sample[
            di.categorical_cols
        ].astype("category")

        tab_model = lgb.LGBMClassifier(random_state=42)
        tab_model.fit(train_df[di.tab_cols], y_train)

        if model_type == "ensemble_50":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.5,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_75":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.75,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_25":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.25,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "stack":
            """
            For the stack model, we make predictions on the validation set. These predictions
            are then used as features for the stack model (another LightGBM model) along with
            the other tabular features. In doing so the stack model learns, depending on the
            tabular features, when to trust the tabular model and when to trust the text model.
            """
            val_df = load_dataset(
                di.ds_name, split="validation", download_mode="force_redownload"
            ).to_pandas()
            val_df[di.categorical_cols] = val_df[di.categorical_cols].astype(
                "category")
            y_val = val_df[di.label_col]
            val_text = list(map(cols_to_str_fn, val_df[di.text_cols].values))

            # Training set is the preditions from the tabular and text models on the validation set
            # plus the tabular features from the validation set
            text_val_preds = text_pipeline(val_text)
            # text_val_preds = np.array(
            #     [format_text_pred(pred) for pred in text_val_preds]
            # )
            text_val_preds = np.array(
                [[lab["score"] for lab in pred] for pred in text_val_preds]
            )

            # add text and tabular predictions to the val_df
            stack_val_df = val_df[di.tab_cols]
            tab_val_preds = tab_model.predict_proba(stack_val_df)
            for i in range(text_val_preds.shape[1]):
                stack_val_df[f"text_pred_{i}"] = text_val_preds[:, i]
            for i in range(tab_val_preds.shape[1]):
                stack_val_df[f"tab_pred_{i}"] = tab_val_preds[:, i]

            stack_model = lgb.LGBMClassifier(random_state=42)
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
            )
        else:
            raise ValueError(f"Invalid model type of {model_type}")

    np.random.seed(1)
    test_sample_vals = test_df_sample[di.tab_cols + di.text_cols].values
    test_vals = test_df[di.tab_cols + di.text_cols].values

    return (
        model.predict(test_sample_vals),
        test_df_sample[di.label_col].values,
        model.predict(test_vals),
        test_df[di.label_col].values,
    )

In [8]:
ds_type = "wine"

for model_type in [
    # "all_text",
    "ensemble_25",
    #    "ensemble_50",
    #    "ensemble_75",
    #    "stack"
]:
    # sample_preds, sample_y, preds, y = run_model(
    #     model_type, ds_type=ds_type, tab_scale_factor=1
    # )
    preds, y = run_model(model_type, ds_type=ds_type, tab_scale_factor=1)

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    # print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    # print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset wine, ordinal version


Downloading readme: 100%|██████████| 673/673 [00:00<00:00, 261kB/s]


Downloading and preparing dataset None/None (download: 15.47 MiB, generated: 29.51 MiB, post-processed: Unknown size, total: 44.99 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.24M/3.24M [00:00<00:00, 47.9MB/s]
Downloading data: 100%|██████████| 11.0M/11.0M [00:00<00:00, 75.8MB/s]
Downloading data: 100%|██████████| 1.95M/1.95M [00:00<00:00, 34.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.06it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2845.53it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 673/673 [00:00<00:00, 299kB/s]


Downloading and preparing dataset None/None (download: 15.47 MiB, generated: 29.51 MiB, post-processed: Unknown size, total: 44.99 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.24M/3.24M [00:00<00:00, 38.6MB/s]
Downloading data: 100%|██████████| 11.0M/11.0M [00:00<00:00, 72.8MB/s]
Downloading data: 100%|██████████| 1.95M/1.95M [00:00<00:00, 37.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.18it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2864.31it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 648269.55it/s]
21031it [00:00, 541480.41it/s]


ValueError: too many values to unpack (expected 2)

In [None]:
y

array([14, 29,  5, ...,  5,  3, 26])

In [None]:
np.argmax(preds, axis=1)

array([ 9,  9, 23, ..., 23,  2, 15])