# Model performance

In [1]:
import numpy as np
import shap
import pickle
from datasets import load_dataset
from src.dataset_info import get_dataset_info
from transformers import pipeline, AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset
import os
from tqdm import tqdm
from src.utils import token_segments, text_ft_index_ends

# from src.models import Model
import lightgbm as lgb
from src.models import WeightedEnsemble, StackModel, AllAsTextModel
from src.joint_masker import JointMasker
import argparse
import scipy as sp
from sklearn.metrics import roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def run_model(model_type, ds_type, test_set_size=100, tab_scale_factor=2):
    di = get_dataset_info(ds_type, model_type)
    # Data
    train_df = load_dataset(
        di.ds_name, split="train", download_mode="force_redownload"
    ).to_pandas()
    y_train = train_df[di.label_col]

    test_df = load_dataset(
        di.ds_name, split="test", download_mode="force_redownload"
    ).to_pandas()
    test_df_sample = test_df.sample(test_set_size, random_state=55)

    # Models
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    if model_type in [
        "all_text",
        "all_as_text_tnt_reorder",
        "all_as_text_base_reorder",
    ]:
        text_pipeline = pipeline(
            "text-classification",
            model=di.text_model_name,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert all columns to a single string
        if model_type == "all_text":

            def cols_to_str_fn(array):
                return " | ".join(
                    [
                        f"{col}: {val}"
                        for col, val in zip(di.tab_cols + di.text_cols, array)
                    ]
                )

        elif model_type == "all_as_text_tnt_reorder":
            # Reorder based on the new index order in di
            def cols_to_str_fn(array):
                original = di.tab_cols + di.text_cols
                new = di.tnt_reorder_cols
                new_array = np.array([array[original.index(x)] for x in new])
                return " | ".join([f"{col}: {val}" for col, val in zip(new, new_array)])

        else:
            # Reorder based on the new index order in di
            def cols_to_str_fn(array):
                original = di.tab_cols + di.text_cols
                new = di.base_reorder_cols
                new_array = np.array([array[original.index(x)] for x in new])
                return " | ".join([f"{col}: {val}" for col, val in zip(new, new_array)])

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
            # cols=di.tab_cols + di.text_cols
        )
    else:
        text_pipeline = pipeline(
            "text-classification",
            model=di.text_model_name,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert the text columns to a single string
        if len(di.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    [f"{col}: {val}" for col, val in zip(di.text_cols, array)]
                )

        # LightGBM requires explicitly marking categorical features
        train_df[di.categorical_cols] = train_df[di.categorical_cols].astype("category")
        test_df_sample[di.categorical_cols] = test_df_sample[
            di.categorical_cols
        ].astype("category")

        tab_model = lgb.LGBMClassifier(random_state=42)
        tab_model.fit(train_df[di.tab_cols], y_train)

        if model_type == "ensemble_50":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.5,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_75":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.75,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "ensemble_25":
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=0.25,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif model_type == "stack":
            """
            For the stack model, we make predictions on the validation set. These predictions
            are then used as features for the stack model (another LightGBM model) along with
            the other tabular features. In doing so the stack model learns, depending on the
            tabular features, when to trust the tabular model and when to trust the text model.
            """
            val_df = load_dataset(
                di.ds_name, split="validation", download_mode="force_redownload"
            ).to_pandas()
            val_df[di.categorical_cols] = val_df[di.categorical_cols].astype("category")
            y_val = val_df[di.label_col]
            val_text = list(map(cols_to_str_fn, val_df[di.text_cols].values))

            # Training set is the preditions from the tabular and text models on the validation set
            # plus the tabular features from the validation set
            text_val_preds = text_pipeline(val_text)
            # text_val_preds = np.array(
            #     [format_text_pred(pred) for pred in text_val_preds]
            # )
            text_val_preds = np.array(
                [[lab["score"] for lab in pred] for pred in text_val_preds]
            )

            # add text and tabular predictions to the val_df
            stack_val_df = val_df[di.tab_cols]
            tab_val_preds = tab_model.predict_proba(stack_val_df)
            for i in range(text_val_preds.shape[1]):
                stack_val_df[f"text_pred_{i}"] = text_val_preds[:, i]
            for i in range(tab_val_preds.shape[1]):
                stack_val_df[f"tab_pred_{i}"] = tab_val_preds[:, i]

            stack_model = lgb.LGBMClassifier(random_state=42)
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
            )
        else:
            raise ValueError(f"Invalid model type of {model_type}")

    np.random.seed(1)
    test_sample_vals = test_df_sample[di.tab_cols + di.text_cols].values
    test_vals = test_df[di.tab_cols + di.text_cols].values

    return (
        model.predict(test_sample_vals),
        test_df_sample[di.label_col].values,
        model.predict(test_vals),
        test_df[di.label_col].values,
    )

## Fake Job Postings (ROC-AUC)

In [3]:
ds_type = "fake"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_as_text_tnt_reorder",
    # "all_as_text_base_reorder",
]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset fake, ordinal version


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 219kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.7MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 72.5MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 41.5MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.49it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2876.75it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 717/717 [00:00<00:00, 250kB/s]


Downloading and preparing dataset None/None (download: 11.54 MiB, generated: 20.39 MiB, post-processed: Unknown size, total: 31.94 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 1.41M/1.41M [00:00<00:00, 27.7MB/s]
Downloading data: 100%|██████████| 8.23M/8.23M [00:00<00:00, 65.8MB/s]
Downloading data: 100%|██████████| 2.46M/2.46M [00:00<00:00, 41.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.16it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2955.82it/s]
                                                                            

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_ordinal-d873cc356e36f3d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 69626.56it/s]
3182it [00:00, 54455.03it/s]


KeyboardInterrupt: 

## IMDB  (ROC-AUC)

In [None]:
ds_type = "imdb_genre"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_as_text_tnt_reorder",
    "all_as_text_base_reorder",
]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 320kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 5.31MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 27.2MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.23MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.62it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2772.18it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 830kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.32MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 42.6MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.78MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.59it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2697.30it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 611414.58it/s]
200it [00:00, 750994.45it/s]



ensemble_25 on imdb_genre
Test accuaracy: 0.78
Test accuaracy (sample): 0.79
Test AUC: 0.8530677609848865
Test AUC (sample): 0.8262626262626263
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 976kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.50MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 27.1MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 8.04MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.84it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2730.07it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 261kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.50MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 34.3MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.14MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.86it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2673.23it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 747647.77it/s]
200it [00:00, 821607.05it/s]



ensemble_50 on imdb_genre
Test accuaracy: 0.785
Test accuaracy (sample): 0.79
Test AUC: 0.8646782103893504
Test AUC (sample): 0.8513131313131314
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 471kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.82MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 38.4MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.86MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2734.82it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 871kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.72MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 36.7MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 8.06MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.87it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2499.09it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 618628.91it/s]
200it [00:00, 757778.50it/s]



ensemble_75 on imdb_genre
Test accuaracy: 0.775
Test accuaracy (sample): 0.75
Test AUC: 0.8520668601741568
Test AUC (sample): 0.8387878787878787
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 510kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.22MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 34.3MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 8.07MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.91it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2655.18it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 333kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.22MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 27.9MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 6.97MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.87it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2577.94it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 1.36MB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 8.34MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 28.9MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.63MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.68it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2450.42it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/


stack on imdb_genre
Test accuaracy: 0.74
Test accuaracy (sample): 0.73
Test AUC: 0.8106295666099489
Test AUC (sample): 0.7846464646464647
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, all as text version


Downloading readme: 100%|██████████| 906/906 [00:00<00:00, 315kB/s]


Downloading and preparing dataset None/None (download: 225.28 KiB, generated: 320.45 KiB, post-processed: Unknown size, total: 545.73 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.0k/51.0k [00:00<00:00, 1.02MB/s]
Downloading data: 100%|██████████| 34.4k/34.4k [00:00<00:00, 37.5MB/s]
Downloading data: 100%|██████████| 145k/145k [00:00<00:00, 6.95MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.80it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2971.87it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 906/906 [00:00<00:00, 1.09MB/s]


Downloading and preparing dataset None/None (download: 225.28 KiB, generated: 320.45 KiB, post-processed: Unknown size, total: 545.73 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.0k/51.0k [00:00<00:00, 5.41MB/s]
Downloading data: 100%|██████████| 34.4k/34.4k [00:00<00:00, 32.4MB/s]
Downloading data: 100%|██████████| 145k/145k [00:00<00:00, 8.24MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2818.12it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_text on imdb_genre
Test accuaracy: 0.72
Test accuaracy (sample): 0.69
Test AUC: 0.8146331698528675
Test AUC (sample): 0.8004040404040405
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 344kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 6.85MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 38.4MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 8.03MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2789.38it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 282kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 5.37MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 38.1MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.92MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2451.85it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_as_text_tnt_reorder on imdb_genre
Test accuaracy: 0.735
Test accuaracy (sample): 0.73
Test AUC: 0.816634971474327
Test AUC (sample): 0.8311111111111111
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]
Using dataset imdb_genre, ordinal version


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 359kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 5.36MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 33.9MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 7.32MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.84it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2922.18it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 904/904 [00:00<00:00, 346kB/s]


Downloading and preparing dataset None/None (download: 224.75 KiB, generated: 322.65 KiB, post-processed: Unknown size, total: 547.40 KiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.1k/51.1k [00:00<00:00, 7.95MB/s]
Downloading data: 100%|██████████| 34.6k/34.6k [00:00<00:00, 39.6MB/s]
Downloading data: 100%|██████████| 144k/144k [00:00<00:00, 8.02MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.85it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2762.44it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_ordinal-95c476e18d2d7064/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.

all_as_text_base_reorder on imdb_genre
Test accuaracy: 0.735
Test accuaracy (sample): 0.73
Test AUC: 0.8170353317986188
Test AUC (sample): 0.8335353535353536
Test % by label: [0.485, 0.515]
Test % by label (sample): [0.55, 0.45]


## Kickstarter  (ROC-AUC)

In [None]:
ds_type = "kick"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_as_text_tnt_reorder",
    "all_as_text_base_reorder",
]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset kick, ordinal version


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 864kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 29.9MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 26.8MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 35.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.34it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1915.50it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 841/841 [00:00<00:00, 582kB/s]


Downloading and preparing dataset None/None (download: 17.58 MiB, generated: 29.54 MiB, post-processed: Unknown size, total: 47.11 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.69M/3.69M [00:00<00:00, 29.9MB/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:00<00:00, 24.1MB/s]
Downloading data: 100%|██████████| 12.5M/12.5M [00:00<00:00, 30.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.14it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1787.85it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_ordinal-bc45dae77b1c676d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


TypeError: sequence item 0: expected str instance, NoneType found

## Jigsaw (ROC-AUC)

In [None]:
ds_type = "jigsaw"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_as_text_tnt_reorder",
    "all_as_text_base_reorder",
]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 482kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 6.56MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 22.9MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 60.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1978.45it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 2.13MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 42.0MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 86.1MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 58.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.30it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2778.91it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 358487.52it/s]
25000it [00:00, 169892.97it/s]



ensemble_25 on jigsaw
Test accuaracy: 0.94256
Test accuaracy (sample): 0.95
Test AUC: 0.9321638495642937
Test AUC (sample): 0.8378947368421054
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 1.81MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.7MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 81.9MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 57.3MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2931.03it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 555kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.7MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 92.4MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 61.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2933.08it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 440115.84it/s]
25000it [00:00, 466658.06it/s]



ensemble_50 on jigsaw
Test accuaracy: 0.9448
Test accuaracy (sample): 0.95
Test AUC: 0.9456286008527085
Test AUC (sample): 0.8631578947368422
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 1.96MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 42.4MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 86.3MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 61.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2820.02it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 2.34MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.6MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 87.4MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 62.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2864.31it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 405638.68it/s]
25000it [00:00, 347226.69it/s]



ensemble_75 on jigsaw
Test accuaracy: 0.9608
Test accuaracy (sample): 0.96
Test AUC: 0.9511750950554884
Test AUC (sample): 0.9515789473684211
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, all as text version


Downloading readme: 100%|██████████| 1.76k/1.76k [00:00<00:00, 593kB/s]


Downloading and preparing dataset None/None (download: 27.75 MiB, generated: 61.00 MiB, post-processed: Unknown size, total: 88.75 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.49M/3.49M [00:00<00:00, 41.4MB/s]
Downloading data: 100%|██████████| 19.8M/19.8M [00:00<00:00, 87.8MB/s]
Downloading data: 100%|██████████| 5.83M/5.83M [00:00<00:00, 63.0MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.34it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2605.70it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.76k/1.76k [00:00<00:00, 2.03MB/s]


Downloading and preparing dataset None/None (download: 27.75 MiB, generated: 61.00 MiB, post-processed: Unknown size, total: 88.75 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.49M/3.49M [00:00<00:00, 43.8MB/s]
Downloading data: 100%|██████████| 19.8M/19.8M [00:00<00:00, 86.2MB/s]
Downloading data: 100%|██████████| 5.83M/5.83M [00:00<00:00, 61.8MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.23it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2644.02it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading (…)"pytorch_model.bin";: 100%|██████████| 268M/268M [00:06<00:00, 42.3MB/s] 



all_text on jigsaw
Test accuaracy: 0.9612
Test accuaracy (sample): 0.95
Test AUC: 0.9624062142594692
Test AUC (sample): 0.9494736842105264
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]
Using dataset jigsaw, ordinal version


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 2.36MB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 41.9MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 84.2MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 56.4MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2563.23it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 650kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 46.7MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 85.2MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 64.1MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.22it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2744.36it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 1.77k/1.77k [00:00<00:00, 496kB/s]


Downloading and preparing dataset None/None (download: 27.70 MiB, generated: 66.00 MiB, post-processed: Unknown size, total: 93.70 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 3.48M/3.48M [00:00<00:00, 40.9MB/s]
Downloading data: 100%|██████████| 19.7M/19.7M [00:00<00:00, 84.2MB/s]
Downloading data: 100%|██████████| 5.82M/5.82M [00:00<00:00, 58.2MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3000.22it/s]
                                                                                        

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_ordinal-8e97391c4f489562/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 430185.03it/s]
25000it [00:00, 446485.64it/s]



stack on jigsaw
Test accuaracy: 0.93208
Test accuaracy (sample): 0.95
Test AUC: 0.9158202495728693
Test AUC (sample): 0.9515789473684211
Test % by label: [0.9426, 0.0574]
Test % by label (sample): [0.95, 0.05]


## Product Sentiment

In [None]:
ds_type = "prod_sent"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    # "all_as_text_tnt_reorder",
    # "all_as_text_base_reorder",
]:
    sample_preds, sample_y, preds, y = run_model(
        model_type, ds_type=ds_type, tab_scale_factor=1
    )

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    # print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    # print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 489kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 6.48MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 1.27MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 595kB/s]
Downloading data files: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2763.05it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 211kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.41MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 11.6MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 4.83MB/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.50it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2990.24it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 921825.05it/s]
1273it [00:00, 1303233.83it/s]



ensemble_25 on prod_sent
Test accuaracy: 0.8908091123330715
Test accuaracy (sample): 0.95
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 666kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.56MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.70MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.73MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.84it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2791.24it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 677kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.58MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.66MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.53MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.56it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2907.33it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 731990.23it/s]
1273it [00:00, 1375411.90it/s]



ensemble_50 on prod_sent
Test accuaracy: 0.8695993715632364
Test accuaracy (sample): 0.9
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 761kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.63MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.73MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.67MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.82it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2988.82it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 704kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.59MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 10.1MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.72MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.71it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2956.51it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 530924.56it/s]
1273it [00:00, 1430310.47it/s]



ensemble_75 on prod_sent
Test accuaracy: 0.6614296936370778
Test accuaracy (sample): 0.64
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, all as text version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 736kB/s]


Downloading and preparing dataset None/None (download: 430.99 KiB, generated: 758.28 KiB, post-processed: Unknown size, total: 1.16 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.1k/54.1k [00:00<00:00, 8.07MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 10.0MB/s]]
Downloading data: 100%|██████████| 90.4k/90.4k [00:00<00:00, 5.61MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2615.99it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 197kB/s]


Downloading and preparing dataset None/None (download: 430.99 KiB, generated: 758.28 KiB, post-processed: Unknown size, total: 1.16 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.1k/54.1k [00:00<00:00, 5.69MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.85MB/s]]
Downloading data: 100%|██████████| 90.4k/90.4k [00:00<00:00, 5.66MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2977.50it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading (…)"pytorch_model.bin";: 100%|██████████| 268M/268M [00:02<00:00, 108MB/s]  



all_text on prod_sent
Test accuaracy: 0.8476040848389631
Test accuaracy (sample): 0.88
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]
Using dataset prod_sent, ordinal version


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 448kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 7.92MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.80MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.66MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.81it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2966.97it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 632kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.55MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.79MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 6.21MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.81it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2580.58it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 590/590 [00:00<00:00, 216kB/s]


Downloading and preparing dataset None/None (download: 431.83 KiB, generated: 777.70 KiB, post-processed: Unknown size, total: 1.18 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 54.2k/54.2k [00:00<00:00, 5.76MB/s]
Downloading data: 100%|██████████| 297k/297k [00:00<00:00, 9.82MB/s]]
Downloading data: 100%|██████████| 90.6k/90.6k [00:00<00:00, 5.73MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.77it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2945.44it/s]
                                                                           

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_ordinal-bbe910d41184c5e7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100it [00:00, 907858.01it/s]
1273it [00:00, 1387204.21it/s]



stack on prod_sent
Test accuaracy: 0.6496465043205027
Test accuaracy (sample): 0.68
Test % by label: [0.010997643362136685, 0.0589159465828751, 0.5875883739198743, 0.3424980361351139]
Test % by label (sample): [0.01, 0.02, 0.62, 0.35]


## Wine

In [None]:
ds_type = "wine"

for model_type in [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_as_text_tnt_reorder",
    # "all_as_text_base_reorder",
]:
    # sample_preds, sample_y, preds, y = run_model(
    #     model_type, ds_type=ds_type, tab_scale_factor=1
    # )
    preds, y = run_model(model_type, ds_type=ds_type, tab_scale_factor=1)

    print(f"\n{model_type} on {ds_type}")
    print(f"Test accuaracy: {np.mean(np.argmax(preds, axis=1) == y)}")
    print(
        f"Test accuaracy (sample): {np.mean(np.argmax(sample_preds, axis=1) == sample_y)}"
    )
    # print(f"Test AUC: {roc_auc_score(y, preds[:, 1])}")
    # print(f"Test AUC (sample): {roc_auc_score(sample_y, sample_preds[:, 1])}")
    print(
        f"Test % by label: {[np.mean(y == label) for label in np.unique(y)]}")
    print(
        f"Test % by label (sample): {[np.mean(sample_y == label) for label in np.unique(y)]}"
    )

Using dataset wine, ordinal version


Downloading readme: 100%|██████████| 673/673 [00:00<00:00, 721kB/s]


Downloading and preparing dataset None/None (download: 15.47 MiB, generated: 29.51 MiB, post-processed: Unknown size, total: 44.99 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 1.95M/1.95M [00:00<00:00, 4.27MB/s]
Downloading data: 100%|██████████| 11.0M/11.0M [00:00<00:00, 15.9MB/s]
Downloading data: 100%|██████████| 3.24M/3.24M [00:00<00:00, 6.04MB/s]
Downloading data files: 100%|██████████| 3/3 [00:04<00:00,  1.67s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1963.63it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Downloading readme: 100%|██████████| 673/673 [00:00<00:00, 410kB/s]


Downloading and preparing dataset None/None (download: 15.47 MiB, generated: 29.51 MiB, post-processed: Unknown size, total: 44.99 MiB) to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 1.95M/1.95M [00:00<00:00, 36.0MB/s]
Downloading data: 100%|██████████| 11.0M/11.0M [00:00<00:00, 72.1MB/s]
Downloading data: 100%|██████████| 3.24M/3.24M [00:00<00:00, 43.6MB/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2816.23it/s]
                                                                             

Dataset parquet downloaded and prepared to /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_ordinal-c036c1f97c7da848/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100it [00:00, 645277.54it/s]
21031it [00:00, 470854.79it/s]


ValueError: too many values to unpack (expected 2)

In [None]:
y


array([14, 29,  5, ...,  5,  3, 26])

In [None]:
np.argmax(preds, axis=1)


array([ 9,  9, 23, ..., 23,  2, 15])