In [1]:
import numpy as np
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer

import lightgbm as lgb
from src.models import WeightedEnsemble, StackModel, AllAsTextModel
from src.utils import ConfigLoader, compute_metrics
import xgboost as xgb
from transformers import AutoModelForSequenceClassification
from src.utils import format_text_pred


In [2]:
def get_prediction(config_type):
    # Shap args
    args = ConfigLoader(
        config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
    )
    # Data
    all_text_versions = [
        "all_text",
        "all_as_text",
        "all_as_text_base_reorder",
        "all_as_text_tnt_reorder",
    ]
    ds_name = args.ds_name
    train_df = load_dataset(
        ds_name,
        split="train",  # download_mode="force_redownload"
    ).to_pandas()
    y_train = train_df[args.label_col]

    test_df = load_dataset(
        ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()

    # Models
    tokenizer = AutoTokenizer.from_pretrained(
        args.text_model_base, model_max_length=512
    )
    if args.model_type in all_text_versions:
        text_pipeline = pipeline(
            "text-classification",
            model=args.my_text_model,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert all columns to a single string
        if args.model_type in ["all_as_text", "all_text"]:
            def cols_to_str_fn(array): return " | ".join(
                [
                    f"{col}: {val}"
                    for col, val in zip(
                        args.categorical_cols + args.numerical_cols + args.text_cols,
                        array,
                    )
                ]
            )
        else:
            # # Reorder based on the new index order in di
            # cols_to_str_fn = lambda array: " | ".join(
            #     [
            #         f"{col}: {val}"
            #         for _, col, val in sorted(
            #             zip(args.new_idx_order, args.tab_cols + args.text_cols, array)
            #         )
            #     ]
            # )
            raise NotImplementedError(
                "Shouldn't need much as the column ordering is in dataset info,\
                just need to update the cols_to_str_fn"
            )

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
        )
    else:
        text_pipeline = pipeline(
            "text-classification",
            model=args.my_text_model,
            tokenizer=tokenizer,
            device="cuda:0",
            truncation=True,
            padding=True,
            top_k=None,
        )
        # Define how to convert the text columns to a single string
        if len(args.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    [f"{col}: {val}" for col, val in zip(
                        args.text_cols, array)]
                )

        # LightGBM requires explicitly marking categorical features
        train_df[args.categorical_cols] = train_df[args.categorical_cols].astype(
            "category"
        )
        test_df[args.categorical_cols] = test_df[args.categorical_cols].astype(
            "category"
        )

        tab_model = lgb.LGBMClassifier(random_state=42, max_depth=3)
        tab_model.fit(train_df[args.categorical_cols +
                      args.numerical_cols], y_train)

        if args.model_type in ["ensemble_25", "ensemble_50", "ensemble_75"]:
            text_weight = float(args.model_type.split("_")[-1]) / 100
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=text_weight,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif args.model_type == "stack":
            """
            For the stack model, we make predictions on the validation set. These predictions
            are then used as features for the stack model (another LightGBM model) along with
            the other tabular features. In doing so the stack model learns, depending on the
            tabular features, when to trust the tabular model and when to trust the text model.
            """
            val_df = load_dataset(
                ds_name,
                split="validation",  # download_mode="force_redownload"
            ).to_pandas()
            val_df[args.categorical_cols] = val_df[args.categorical_cols].astype(
                "category"
            )
            y_val = val_df[args.label_col]
            val_text = list(map(cols_to_str_fn, val_df[args.text_cols].values))

            # Training set is the preditions from the tabular and text models on the validation set
            # plus the tabular features from the validation set
            text_val_preds = text_pipeline(val_text)
            text_val_preds = np.array(
                [format_text_pred(pred) for pred in text_val_preds]
            )
            # text_val_preds = np.array(
            #     [[lab["score"] for lab in pred] for pred in text_val_preds]
            # )

            # add text and tabular predictions to the val_df
            stack_val_df = val_df[args.categorical_cols + args.numerical_cols]
            tab_val_preds = tab_model.predict_proba(
                val_df[args.categorical_cols + args.numerical_cols]
            )
            stack_val_df[f"text_pred"] = text_val_preds[:, 1]
            stack_val_df[f"tab_pred"] = tab_val_preds[:, 1]

            stack_model = lgb.LGBMClassifier(
                random_state=42, max_depth=2, learning_rate=0.01
            )
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
                all_labels=False,
            )
        else:
            raise ValueError(f"Invalid model type of {args.model_type}")

    np.random.seed(1)
    x = test_df[args.categorical_cols +
                args.numerical_cols + args.text_cols].values
    preds = model.predict(x)
    labels = test_df[args.label_col].values
    return preds, labels

In [3]:
for config_type in [
    # "vet_59c_stack",
    # "vet_59c_ensemble_25",
    # "vet_59c_ensemble_50",
    # "vet_59c_ensemble_75",
    # "vet_50c_all_text",
    # "vet_19c_stack",
    # "vet_19c_ensemble_25",
    # "vet_19c_ensemble_50",
    # "vet_19c_ensemble_75",
    "vet_10c_all_text",
]:
    print(config_type)
    preds, labels = get_prediction(config_type)
    try:
        results = compute_metrics((preds, labels), argmax=True)
    except:
        results = compute_metrics((preds, labels), argmax=False)
    print(results)
# preds, labels = get_prediction("vet_59b_ensemble_50")
# results = compute_metrics((preds, labels))


vet_10c_all_text
Updating with:
{'config': 'vet_10c_all_text', 'my_text_model': 'james-burton/vet_10c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'bert-base-uncased', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions origi

## Stack model

In [14]:
config_type = "vet_19c_stack"

di = ConfigLoader(
    config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
)
# Data
train_df = load_dataset(
    di.ds_name,
    split="train",  # download_mode="force_redownload"
).to_pandas()
y_train = train_df[di.label_col]

test_df = load_dataset(
    di.ds_name,
    split="test",  # download_mode="force_redownload"
).to_pandas()
# test_df = test_df.sample(test_set_size, random_state=55)

# Models
tokenizer = AutoTokenizer.from_pretrained(
    di.text_model_base, model_max_length=512)
text_pipeline = pipeline(
    "text-classification",
    model=di.my_text_model,
    tokenizer=tokenizer,
    device="cuda:0",
    truncation=True,
    padding=True,
    top_k=None,
)
# Define how to convert the text columns to a single string
if len(di.text_cols) == 1:

    def cols_to_str_fn(array):
        return array[0]

else:

    def cols_to_str_fn(array):
        return " | ".join([f"{col}: {val}" for col, val in zip(di.text_cols, array)])


# LightGBM requires explicitly marking categorical features
train_df[di.categorical_cols] = train_df[di.categorical_cols].astype(
    "category")
test_df[di.categorical_cols] = test_df[di.categorical_cols].astype("category")

tab_model = lgb.LGBMClassifier(random_state=42, max_depth=3)
tab_model.fit(train_df[di.categorical_cols + di.numerical_cols], y_train)

"""
For the stack model, we make predictions on the validation set. These
predictions are then used as features for the stack model (another LightGBM
model) along with the other tabular features. In doing so the stack model
learns, depending on the tabular features, when to trust the tabular model
and when to trust the text model.
"""
val_df = load_dataset(
    di.ds_name,
    split="validation",  # download_mode="force_redownload"
).to_pandas()
val_df[di.categorical_cols] = val_df[di.categorical_cols].astype("category")
y_val = val_df[di.label_col]
val_text = list(map(cols_to_str_fn, val_df[di.text_cols].values))

# Training set is the preditions from the tabular and text models on the
# validation set plus the tabular features from the validation set
text_val_preds = text_pipeline(val_text)
text_val_preds = np.array([format_text_pred(pred) for pred in text_val_preds])

# add text and tabular predictions to the val_df
stack_val_df = val_df[di.categorical_cols + di.numerical_cols]
# stack_val_df = val_df[di.numerical_cols].copy()
tab_val_preds = tab_model.predict_proba(
    val_df[di.categorical_cols + di.numerical_cols])
stack_val_df[f"text_pred"] = text_val_preds[:, 1]
stack_val_df[f"tab_pred"] = tab_val_preds[:, 1]

Updating with:
{'config': 'vet_19c_stack', 'my_text_model': 'james-burton/vet_19c', 'ds_name': 'james-burton/vet_month_1c_ordinal', 'text_model_base': 'bert-base-uncased', 'model_type': 'stack', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating in the perinatal 

Downloading (…)lve/main/config.json: 100%|██████████| 727/727 [00:00<00:00, 1.10MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:10<00:00, 43.8MB/s] 


[LightGBM] [Info] Number of positive: 3676, number of negative: 3530
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the train set: 7206, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.510130 -> initscore=0.040527
[LightGBM] [Info] Start training from score 0.040527



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
test_df[di.categorical_cols] = test_df[di.categorical_cols].astype("category")
test_text = list(map(cols_to_str_fn, test_df[di.text_cols].values))

text_test_preds = text_pipeline(test_text)
text_test_preds = np.array([format_text_pred(pred)
                           for pred in text_test_preds])

# add text and tabular predictions to the test_df
stack_test_df = test_df[di.categorical_cols + di.numerical_cols]
# stack_test_df = test_df[di.numerical_cols].copy()
tab_test_preds = tab_model.predict_proba(
    test_df[di.categorical_cols + di.numerical_cols]
)
stack_test_df[f"text_pred"] = text_test_preds[:, 1]
stack_test_df[f"tab_pred"] = tab_test_preds[:, 1]
y_test = test_df[di.label_col]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [28]:
# Validation set
stack_model = lgb.LGBMClassifier(
    random_state=42,
    max_depth=2,
    learning_rate=0.01,
)
stack_model.fit(stack_val_df, y_val)

preds = stack_model.predict(stack_val_df)
labels = y_val.values
print(compute_metrics((preds, labels), argmax=False))

# test set
preds = stack_model.predict(stack_test_df)
labels = y_test.values
print(compute_metrics((preds, labels), argmax=False))


[LightGBM] [Info] Number of positive: 649, number of negative: 623
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 1272, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.510220 -> initscore=0.040886
[LightGBM] [Info] Start training from score 0.040886
{'accuracy': 0.7099056603773585, 'precision': 0.6983002832861189, 'recall': 0.7596302003081664, 'f1': 0.7276752767527676}
{'accuracy': 0.6762820512820513, 'precision': 0.6674718196457327, 'recall': 0.7382012466607302, 'f1': 0.7010570824524314}


## Sean simple tree combiner

In [18]:
from sklearn.tree import DecisionTreeClassifier


In [19]:
tree_model = DecisionTreeClassifier(random_state=42, max_depth=3)
tree_val_df = np.vstack(
    [text_val_preds[:, 1], tab_val_preds[:, 1]]).reshape(-1, 2)
tree_model.fit(tree_val_df, y_val)

preds = tree_model.predict(tree_val_df)
labels = y_val.values
print(compute_metrics((preds, labels), argmax=False))

tree_test_df = np.vstack(
    [text_test_preds[:, 1], tab_test_preds[:, 1]]).reshape(-1, 2)
preds = tree_model.predict(tree_test_df)
labels = y_test.values
print(compute_metrics((preds, labels), argmax=False))

# Text only
print(compute_metrics((text_test_preds, y_test.values), argmax=True))

# Tabular only
print(compute_metrics((tab_test_preds, y_test.values), argmax=True))

{'accuracy': 0.5408805031446541, 'precision': 0.5276595744680851, 'recall': 0.9553158705701078, 'f1': 0.6798245614035087}
{'accuracy': 0.5018315018315018, 'precision': 0.5086934923000497, 'recall': 0.9118432769367765, 'f1': 0.653061224489796}
{'accuracy': 0.6515567765567766, 'precision': 0.6348733233979136, 'recall': 0.7586821015138023, 'f1': 0.6912778904665314}
{'accuracy': 0.6364468864468864, 'precision': 0.6362883181441591, 'recall': 0.6838824577025824, 'f1': 0.6592274678111587}


## Training set accuracy

In [20]:
tab_train_preds = tab_model.predict(
    train_df[di.categorical_cols + di.numerical_cols])
print(compute_metrics((tab_train_preds, y_train.values), argmax=False))
text_train_preds = text_pipeline(
    list(map(cols_to_str_fn, train_df[di.text_cols].values))
)
text_train_preds = np.array([format_text_pred(pred)
                            for pred in text_train_preds])
print(compute_metrics((text_train_preds, y_train.values), argmax=True))

{'accuracy': 0.6770746600055509, 'precision': 0.669685534591195, 'recall': 0.7241566920565833, 'f1': 0.6958567507515357}
{'accuracy': 0.7212045517624202, 'precision': 0.6906886296042095, 'recall': 0.8212731229597389, 'f1': 0.7503417422641978}
