In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from src.transformer_funcs import CustomDataset, new_input_to_prediction
from src.utils import injury_codes
import torch
import pandas as pd
import numpy as np
import evaluate
import random

random.seed(35418)

MODEL = "answerdotai/ModernBERT-base"
TRAIN_DATA = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 128
DATA_SIZE = 1500
TRAIN_SIZE = .90

# init some values
tokenizer = AutoTokenizer.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
label_encoder = LabelEncoder()

# load data
neis_data = pd.read_csv(TRAIN_DATA).head(DATA_SIZE)

# get top 5 diagnoses
values = neis_data.groupby('Diagnosis').size().sort_values(ascending=False)[:5]
train_data = neis_data[neis_data['Diagnosis'].isin(values.index.values)]


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# some local funcs
def prep_data(dataframe):
    X = dataframe['Narrative_1'].to_list()
    y = label_encoder.fit_transform(dataframe['Diagnosis'].map(injury_codes))

    # return dict of encoded labels
    keys = label_encoder.classes_
    values = label_encoder.transform(label_encoder.classes_)
    value_dict = dict(zip(keys, map(int, values)))


    return X, y, value_dict

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # get preds using just the max predicted value
    acc = accuracy.compute(predictions=predictions, references=labels)

    return acc


In [13]:
# set up data
# prepare the text and labels, train-test split, and init torch datasets

X, y, value_dict = prep_data(train_data)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, random_state=42
)


train_dataset = CustomDataset(X_train, y_train, tokenizer, MAX_TOKEN_LENGTH)
test_dataset = CustomDataset(X_test, y_test, tokenizer, MAX_TOKEN_LENGTH)

In [14]:
# set up model

# set labels for inputs
id2label = dict((v,k) for k,v in value_dict.items())
label2id = value_dict

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=5, id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 66/132 [24:29<24:29, 22.27s/it]


[A[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

AttributeError: 'float' object has no attribute 'size'

In [6]:
# load model
model = AutoModelForSequenceClassification.from_pretrained("models/checkpoint-132", num_labels=5)
tokenizer = AutoTokenizer.from_pretrained("models/checkpoint-132")

In [7]:
newdata = pd.read_csv(TRAIN_DATA)
newdata = newdata.iloc[1000:1350]
newdata = newdata[newdata['Diagnosis'].isin(values.index)]

new_text_input = newdata['Narrative_1'].tolist()

In [42]:
# to get new preds we pass the input through the tokenizer
# and get the tokenized input and attention mask
outputs = new_input_to_prediction(model, new_text_input, tokenizer, MAX_TOKEN_LENGTH)

# convert predictions to probabilities, then get max probability as label
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
df_preds = pd.DataFrame(
    {
        "text": new_text_input,
        "label": newdata["Diagnosis"],
        "pred": predictions.argmax(1),
    }
)

In [41]:
df_preds['pred'] = df_preds['pred'].map(id2label)
df_preds

Unnamed: 0,text,label,pred
1003,"46 YOF INJURED TOE, STRUCK TOE AGAINST BED POS...",57,Laceration
1004,"54 YOM INJURED TOE, STUBBED TOE ON DRESSER, 2 ...",71,"Contusions, Abrasions"
1005,"11 YOM INJURED HAND, ACCIDENTALLY CUT HAND WIT...",59,Internal organ injury
1006,48YOM PRESENTS TO ED FOR GROIN PAIN. PT STARTE...,71,"Contusions, Abrasions"
1007,84YOM PRESENTS TOE D W/ CRAMPING PAIN ON LEFT ...,71,"Contusions, Abrasions"
...,...,...,...
1340,5YOM AT HOME PLAYING WITH IS NINE YEAR OLD BRO...,71,"Contusions, Abrasions"
1342,28 YOM WITH LACERATION TO LEFT 4TH FINGER AFTE...,59,Internal organ injury
1343,57 YOF WITH LEFT SHOULDER PAIN S/P SLIPPING ON...,57,Laceration
1345,72 YOF WITH COUGH X 1 WEEK. LAST NIGHT SHE HAD...,71,"Contusions, Abrasions"
