In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_from_disk, load_dataset
import wandb
import pandas as pd
import os
import yaml
import argparse
from transformers.trainer_callback import EarlyStoppingCallback
import numpy as np
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from datasets import Dataset, DatasetDict
from src.utils import ConfigLoader

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Import yaml file
with open("../configs/train_default.yaml") as f:
    args = yaml.safe_load(f)

config_type = "all_text"
# Update default args with chosen config
if config_type != "default":
    with open("../configs/train_configs.yaml") as f:
        yaml_configs = yaml.safe_load_all(f)
        yaml_args = next(conf for conf in yaml_configs if conf["config"] == config_type)
    args.update(yaml_args)
    print(f"Updating with:\n{yaml_args}\n")
print(f"\n{args}\n")

# Dataset
di = Config("../configs/dataset_info.yaml")
dataset = load_dataset(
    args["ds_name"],
    download_mode="force_redownload",
)
dataset = prepare_text(
    dataset=dataset,
    di=di,
    version=args["version"],
)

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    args["model_base"],
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(args["model_base"])


# Tokenize the dataset
def encode(examples):
    return {
        "label": np.array([examples["labels"]]),
        **tokenizer(
            examples["text"],
            # truncation=True,
            # padding="max_length",
            # max_length=args["max_length"],
        ),
    }


dataset = dataset.map(encode)  # , load_from_cache_file=True)

Updating with:
{'config': 'all_text', 'fast_dev_run': False, 'batch_size': 32, 'ds_name': 'james-burton/vet_month_1_all_text', 'version': 'all_text'}


{'config': 'all_text', 'fast_dev_run': False, 'do_train': True, 'do_predict': True, 'tags': ['bert', '1 month'], 'batch_size': 32, 'model_base': 'bert-base-uncased', 'output_root': 'models/', 'num_epochs': 50, 'early_stopping_patience': 3, 'grad_accumulation_steps': 1, 'seed': 42, 'logging_steps': 10, 'lr_scheduler': 'linear', 'warmup_ratio': 0, 'device': 'cuda', 'num_workers': 1, 'resume_from_checkpoint': False, 'predict_batch_size': 16, 'save_total_limit': 1, 'pytorch2.0': True, 'max_length': 512, 'ds_name': 'james-burton/vet_month_1_all_text', 'version': 'all_text'}



Downloading readme: 100%|██████████| 1.95k/1.95k [00:00<00:00, 3.46MB/s]
Downloading data: 100%|██████████| 739k/739k [00:00<00:00, 2.45MB/s]
Downloading data: 100%|██████████| 2.42M/2.42M [00:00<00:00, 14.6MB/s]
Downloading data: 100%|██████████| 433k/433k [00:00<00:00, 2.87MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  4.71it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3245.53it/s]
Generating test split: 100%|██████████| 2184/2184 [00:00<00:00, 274681.70 examples/s]
Generating train split: 100%|██████████| 7206/7206 [00:00<00:00, 487274.16 examples/s]
Generating validation split: 100%|██████████| 1272/1272 [00:00<00:00, 323029.47 examples/s]
Map: 100%|██████████| 2184/2184 [00:00<00:00, 3351.31 examples/s]
Map: 100%|██████████| 7206/7206 [00:02<00:00, 3284.74 examples/s]
Map: 100%|██████████| 1272/1272 [00:00<00:00, 3386.87 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased

In [8]:
sum([len(ids) > 512 for ids in dataset["train"]["input_ids"]]) / len(
    dataset["train"]["input_ids"]
)

0.055925617540938104

In [12]:
config_type = "vet_10b_baseline"

di = ConfigLoader(
    config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
)
# Data
test_df = load_dataset(
    di.ds_name, split="test", download_mode="force_redownload"
).to_pandas()
test_df = test_df.sample(1000, random_state=55)
tokenizer = AutoTokenizer.from_pretrained(
    di.text_model_base, model_max_length=512, truncation=True
)


# Define how to convert all columns to a single string
def cols_to_str_fn(array):
    return " | ".join(
        [
            f"{col}: {val}"
            for col, val in zip(
                di.categorical_cols + di.numerical_cols + di.text_cols, array
            )
        ]
    )


np.random.seed(1)
x = list(
    map(
        cols_to_str_fn,
        test_df[di.categorical_cols + di.numerical_cols + di.text_cols].values,
    )
)


def encode(examples):
    return {
        "label": np.array([examples["labels"]]),
        **tokenizer(
            examples["text"],
            # truncation=True,
            # padding="max_length",
            # max_length=args["max_length"],
        ),
    }

Updating with:
{'config': 'vet_10b_baseline', 'my_text_model': 'james-burton/vet_50b', 'ds_name': 'james-burton/vet_month_1b_all_text', 'text_model_base': 'bert-base-uncased', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1b_ordinal'}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating in the perinatal period', 'Pregnancy, childbirth or th

Downloading readme: 100%|██████████| 1.94k/1.94k [00:00<00:00, 16.5MB/s]
Downloading data: 100%|██████████| 754k/754k [00:00<00:00, 4.36MB/s]
Downloading data: 100%|██████████| 2.44M/2.44M [00:00<00:00, 16.8MB/s]
Downloading data: 100%|██████████| 453k/453k [00:00<00:00, 3.54MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  6.47it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3086.32it/s]
Generating test split: 100%|██████████| 2184/2184 [00:00<00:00, 337224.27 examples/s]
Generating train split: 100%|██████████| 7206/7206 [00:00<00:00, 473443.42 examples/s]
Generating validation split: 100%|██████████| 1272/1272 [00:00<00:00, 279181.30 examples/s]


In [9]:
sum([len(ids) > 512 for ids in x["input_ids"]]) / len(x["input_ids"])

TypeError: list indices must be integers or slices, not str

In [20]:
x1 = [len(tokenizer.tokenize(sentence)) for sentence in x]

In [22]:
sum([sent > 512 for sent in x1]) / len(x1)

0.039