In [4]:
import pickle
from transformers import AutoTokenizer
import numpy as np
import shap
from src.plot_text import text, get_grouped_vals

# from src.utils import format_fts_for_plotting, text_ft_index_ends, token_segments
# from src.utils import legacy_get_dataset_info
from datasets import load_dataset

# import matplotlib.pyplot as plt
from src.run_shap import load_shap_vals
from tqdm import tqdm

# import re
# import seaborn as sns
# from scipy import stats
from src.utils import (
    ConfigLoader,
    text_ft_index_ends,
    token_segments,
    format_fts_for_plotting,
    format_text_fts_too,
)

In [12]:
# Build the run configuration for `config_name` from the two YAML files below,
# then load the SHAP values for the same configuration name.
# NOTE(review): `config_name` is not assigned in any cell visible here — it must
# be defined earlier in the notebook for this cell to run on a fresh kernel.
# NOTE(review): `add_parent_dir=True` presumably makes the loader resolve paths
# relative to the parent directory (the configs are also read from `../`) — confirm.
args = ConfigLoader(
    config_name, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
)
shap_vals = load_shap_vals(config_name, add_parent_dir=True)

Updating with:
{'config': 'vet_50c_all_text', 'my_text_model': 'james-burton/vet_50c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'SAVSNET/PetBERT_pretrained', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating i

array([['female', 'yes', 'cat', ..., 'missing', 'Rotherham',
        'owner rep not quite hersel.  either really hungry or eating very little.  occ v+ chronic hx-last time tuesday, not after food, no blood. time before that last week.  been on bland white fish for 1 week. no d+last few weeks, occ has had.  drinking lots last 3 days.  no changes in urination.  no cough, very occ sneezing.  or after used meds AK recc for teeth been sick - so stopped using, last time she used it was 2 weeks ago.  clinical exam. very limited as +++agressive. body condition score 7/9.  skin eyes ears nothing abnormal detected. abdomen palp tense. temperature not possible, some gingivitis g2 on bottom L carnassial.  mmp+m, ln within normal limits.  differential diagnosis - HT, ckd, DM.  ideally check bloods , owner hapy to do this. bloods not possibel today, too agressive. owner understands. rebook w nurse 2hrs after gaba given, 12 hrs starved.  any issues call before.'],
       ['female', 'yes', 'cat', ...,

In [18]:
# Fraction of rows whose last column has fewer than 20 characters
# (presumably the free-text record, i.e. the near-empty clinical notes).
last_col_values = shap_vals.data[:, -1]
sum(1 for value in last_col_values if len(value) < 20) / len(shap_vals.data)

0.021

All records with an empty text field have a label of 1 (dead), as confirmed by inspecting the CSV file.

In [20]:
import numpy as np
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer

import lightgbm as lgb
from src.models import WeightedEnsemble, StackModel, AllAsTextModel
from src.utils import ConfigLoader, compute_metrics
import xgboost as xgb
from transformers import AutoModelForSequenceClassification
from src.utils import format_text_pred

In [25]:
def _labelled_joiner(col_names):
    """Build a row formatter producing ``"col: val | col: val | ..."``.

    Args:
        col_names: Column names, in the same order as the values of each row.

    Returns:
        A function mapping one row (a sequence of values) to a single string.
    """
    col_names = list(col_names)

    def cols_to_str_fn(array):
        return " | ".join(f"{col}: {val}" for col, val in zip(col_names, array))

    return cols_to_str_fn


def _fit_stack_model(args, tab_model, text_pipeline, cols_to_str_fn):
    """Fit the LightGBM stacking model on validation-set predictions.

    The tabular and text models each predict on the validation split; those two
    positive-class scores are appended to the validation tabular features and a
    second LightGBM classifier is fitted on the result. In doing so the stack
    model learns, depending on the tabular features, when to trust the tabular
    model and when to trust the text model.

    Args:
        args: Loaded configuration (uses ``ds_name``, ``categorical_cols``,
            ``numerical_cols``, ``text_cols`` and ``label_col``).
        tab_model: Already-fitted tabular classifier exposing ``predict_proba``.
        text_pipeline: Text-classification pipeline applied to the joined text.
        cols_to_str_fn: Function turning one row of text columns into a string.

    Returns:
        The fitted ``lgb.LGBMClassifier`` stack model.
    """
    tab_cols = args.categorical_cols + args.numerical_cols
    val_df = load_dataset(
        args.ds_name,
        split="validation",  # download_mode="force_redownload"
    ).to_pandas()
    val_df[args.categorical_cols] = val_df[args.categorical_cols].astype("category")
    y_val = val_df[args.label_col]
    val_text = list(map(cols_to_str_fn, val_df[args.text_cols].values))

    # Training set is the predictions from the tabular and text models on the
    # validation set plus the tabular features from the validation set.
    text_val_preds = np.array(
        [format_text_pred(pred) for pred in text_pipeline(val_text)]
    )
    tab_val_preds = tab_model.predict_proba(val_df[tab_cols])

    # Explicit copy: adding columns to a bare column-slice of val_df triggers
    # pandas' chained-assignment warning and is ambiguous about what is mutated.
    stack_val_df = val_df[tab_cols].copy()
    stack_val_df["text_pred"] = text_val_preds[:, 1]
    stack_val_df["tab_pred"] = tab_val_preds[:, 1]

    stack_model = lgb.LGBMClassifier(random_state=42, max_depth=2, learning_rate=0.01)
    stack_model.fit(stack_val_df, y_val)
    return stack_model


def get_prediction(config_type, device="cuda:0"):
    """Predict on the test rows whose ``record`` text is empty.

    Loads the configuration named ``config_type``, builds the model selected by
    ``args.model_type`` (all-as-text, weighted ensemble or stack) and runs it on
    the test-split rows whose ``record`` column equals the literal string ``""``
    (two double-quote characters).

    Args:
        config_type: Configuration name looked up in the SHAP config file.
        device: Device passed to the transformers pipeline. Defaults to
            ``"cuda:0"`` (the previously hard-coded value); pass e.g. ``"cpu"``
            on machines without a GPU.

    Returns:
        Tuple ``(preds, labels)``: the output of ``model.predict`` on the
        filtered test rows and the matching values of ``args.label_col``.

    Raises:
        NotImplementedError: For the reordered all-as-text model types.
        ValueError: If ``args.model_type`` is not a recognised model type.
    """
    # Shap args
    args = ConfigLoader(
        config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
    )
    all_text_versions = [
        "all_text",
        "all_as_text",
        "all_as_text_base_reorder",
        "all_as_text_tnt_reorder",
    ]
    ds_name = args.ds_name
    tab_cols = args.categorical_cols + args.numerical_cols

    # Data: keep only the rows with an empty text record. The copy makes the
    # later category cast operate on an independent frame, not a flagged slice.
    test_df = load_dataset(
        ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df[test_df["record"] == '""'].copy()

    # The text pipeline is identical for every model type, so build it once.
    tokenizer = AutoTokenizer.from_pretrained(
        args.text_model_base, model_max_length=512
    )
    text_pipeline = pipeline(
        "text-classification",
        model=args.my_text_model,
        tokenizer=tokenizer,
        device=device,
        truncation=True,
        padding=True,
        top_k=None,
    )

    if args.model_type in all_text_versions:
        # Every column (tabular and text) is rendered into one string.
        if args.model_type in ["all_as_text", "all_text"]:
            cols_to_str_fn = _labelled_joiner(tab_cols + args.text_cols)
        else:
            raise NotImplementedError(
                "Shouldn't need much as the column ordering is in dataset info, "
                "just need to update the cols_to_str_fn"
            )

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
        )
    else:
        # Only the text columns are rendered to a string; a single text column
        # is passed through as-is, without a "col: " prefix.
        if len(args.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:
            cols_to_str_fn = _labelled_joiner(args.text_cols)

        # The train split is only needed to fit the tabular model, so it is
        # loaded here rather than unconditionally.
        train_df = load_dataset(
            ds_name,
            split="train",  # download_mode="force_redownload"
        ).to_pandas()
        y_train = train_df[args.label_col]

        # LightGBM requires explicitly marking categorical features
        train_df[args.categorical_cols] = train_df[args.categorical_cols].astype(
            "category"
        )
        test_df[args.categorical_cols] = test_df[args.categorical_cols].astype(
            "category"
        )

        tab_model = lgb.LGBMClassifier(random_state=42, max_depth=3)
        tab_model.fit(train_df[tab_cols], y_train)

        if args.model_type in ["ensemble_25", "ensemble_50", "ensemble_75"]:
            # The numeric suffix of the model type is the text weight in percent.
            text_weight = float(args.model_type.split("_")[-1]) / 100
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=text_weight,
                cols_to_str_fn=cols_to_str_fn,
            )
        elif args.model_type == "stack":
            stack_model = _fit_stack_model(
                args, tab_model, text_pipeline, cols_to_str_fn
            )
            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
                all_labels=False,
            )
        else:
            raise ValueError(f"Invalid model type of {args.model_type}")

    np.random.seed(1)
    x = test_df[tab_cols + args.text_cols].values
    preds = model.predict(x)
    labels = test_df[args.label_col].values
    return preds, labels

In [26]:
# Evaluate each selected configuration and print its metrics.
# Commented-out entries are other configurations that can be re-enabled.
for config_type in [
    # "vet_59c_stack",
    # "vet_59c_ensemble_25",
    # "vet_59c_ensemble_50",
    # "vet_59c_ensemble_75",
    "vet_50c_all_text",
    # "vet_19c_stack",
    # "vet_19c_ensemble_25",
    # "vet_19c_ensemble_50",
    # "vet_19c_ensemble_75",
    "vet_10c_all_text",
]:
    print(config_type)
    preds, labels = get_prediction(config_type)
    try:
        results = compute_metrics((preds, labels), argmax=True)
    except Exception:
        # Fall back to metrics without argmax when the first attempt fails
        # (presumably because the predictions are already class labels rather
        # than per-class scores — TODO confirm and narrow the exception type).
        # `Exception` instead of a bare `except` so that KeyboardInterrupt and
        # SystemExit still stop the loop.
        results = compute_metrics((preds, labels), argmax=False)
    print(results)

vet_50c_all_text
Updating with:
{'config': 'vet_50c_all_text', 'my_text_model': 'james-burton/vet_50c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'SAVSNET/PetBERT_pretrained', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditi