In [8]:
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import BertModel
# from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers import BertModel, BertConfig
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,EarlyStoppingCallback, AutoModelForTokenClassification
from transformers import T5ForConditionalGeneration, T5Tokenizer
# from torchcrf import CRF
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import ast
from transformers import DataCollatorForTokenClassification
import matplotlib.pyplot as plt
import json

In [2]:
import os

# Run from the project root so the relative `data/`, `analysis/` and `models/`
# paths used by the cells below resolve. The original location remains the
# default; set the MED_HIGHLIGHT_ROOT environment variable to run the notebook
# on another machine without editing this cell.
PROJECT_ROOT = os.environ.get('MED_HIGHLIGHT_ROOT', '/home/ozan/lara/Med-Highlight')
os.chdir(PROJECT_ROOT)

In [3]:
# Mapping from tag string to integer class id.
label2id = {
    'O': 0,
    'I-Treatment': 1,
    'I-Test': 2,
    'I-Problem': 3,
    'I-Background': 4,
    'I-Other': 5,
}
# Reverse lookup (class id -> tag string), derived from label2id so the two
# mappings always agree.
id2label = {class_id: tag for tag, class_id in label2id.items()}

In [6]:
def transform(example_batch):
    """Parse the stringified 'sentence' and 'tag' fields of one example.

    Both fields are parsed with ``ast.literal_eval``. The parsed tags are then
    converted to integer ids through ``label2id``. The example is modified in
    place and also returned.
    """
    example_batch['sentence'] = ast.literal_eval(example_batch['sentence'])
    parsed_tags = ast.literal_eval(example_batch['tag'])
    example_batch['tag'] = [label2id[tag_name] for tag_name in parsed_tags]
    return example_batch

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: sequence of labels, indexed by word id.
        word_ids: one entry per token; ``None`` marks a token that belongs to
            no word.

    Returns:
        A list as long as ``word_ids``: ``-100`` where the word id is ``None``,
        otherwise the label of the word the token belongs to (every sub-token
        of a word gets that word's label).
    """
    return [
        -100 if token_word is None else labels[token_word]
        for token_word in word_ids
    ]

def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    ``examples["sentence"]`` is passed to ``tokenizer`` with
    ``is_split_into_words=True``; ``examples["tag"]`` holds the matching
    word-level labels. The returned encoding gains a ``"labels"`` entry with
    one aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["sentence"], truncation=True, is_split_into_words=True
    )

    # Row index -> word ids of that row's tokens -> labels expanded per token.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["tag"])
    ]
    return encoded

def tokenize_and_align_labels_wrapper(examples, tokenizer):
    """Forward ``examples`` and ``tokenizer`` to ``tokenize_and_align_labels``."""
    return tokenize_and_align_labels(examples=examples, tokenizer=tokenizer)

def untokenize_labels_predictions(word_ids, true_labels, predictions):
    """Collapse token-level labels and predictions to one entry per word.

    Args:
        word_ids: per sentence, the word id of every token, with ``None`` for
            tokens that belong to no word (special tokens).
        true_labels: per sentence, the token-level gold labels with the
            ``None``-word-id positions already removed.
        predictions: per sentence, the token-level predictions, filtered the
            same way as ``true_labels``.

    Returns:
        ``(untokenized_true_labels, untokenized_predictions)``: two lists of
        lists holding, for every word, the label / prediction of its first
        token.
    """
    untokenized_true_labels = []
    untokenized_predictions = []

    for sentence_word_ids, sentence_labels, sentence_predictions in zip(word_ids, true_labels, predictions):
        # Drop special tokens by value rather than by position: slicing
        # [1:-1] is only right for tokenizers that add exactly one special
        # token at each end. With a tokenizer that only appends a trailing
        # token, the slice discards the first real token and pairs every
        # label with the following token's word id.
        real_word_ids = [word_id for word_id in sentence_word_ids if word_id is not None]

        word_labels = []
        word_predictions = []
        previous_word_id = None

        for word_id, label, prediction in zip(real_word_ids, sentence_labels, sentence_predictions):
            # A repeated word id is a later sub-token of the same word; only
            # the first sub-token represents the word.
            if word_id == previous_word_id:
                continue

            word_labels.append(label)
            word_predictions.append(prediction)
            previous_word_id = word_id

        untokenized_true_labels.append(word_labels)
        untokenized_predictions.append(word_predictions)

    return untokenized_true_labels, untokenized_predictions

class TrainingMonitor:
    """Factory for the ``compute_metrics`` callback passed to ``Trainer``."""

    def __init__(self):
        # Not read or updated by any code in this class.
        self.best_confusion_matrix = None

    def compute_metrics_factory(self, word_ids, fold_no, dataset_name, model_name):
        """Build a ``compute_metrics`` function bound to one evaluation set.

        Args:
            word_ids: per evaluation sentence, the tokenizer word ids of its
                tokens; forwarded to ``untokenize_labels_predictions``.
            fold_no: fold index, used only in the output paths.
            dataset_name: dataset name, used only in the output paths.
            model_name: model name, used only in the output paths.

        Returns:
            A function taking ``(logits, labels)`` and returning a dict with
            ``precision``, ``recall`` and ``f1`` (macro average over every
            class except 'O') plus ``accuracy``. Each call also overwrites a
            confusion-matrix PNG and two JSON reports under
            ``analysis/{model_name}/{dataset_name}/``.
        """
        # Fixed class order shared by the report, the confusion matrix and
        # its axis labels. Passing it explicitly keeps all three consistent
        # even when a class is missing from the evaluation data.
        class_names = sorted(id2label.values())
        entity_names = [name for name in class_names if name != 'O']

        graphs_dir = f'analysis/{model_name}/{dataset_name}/graphs/fold{fold_no}'
        reports_dir = f'analysis/{model_name}/{dataset_name}/reports/fold{fold_no}'

        def compute_metrics(eval_preds):
            logits, labels = eval_preds
            # NOTE(review): if the model returns several outputs, this assumes
            # the token logits are the first one - confirm for the model used.
            if isinstance(logits, tuple):
                logits = logits[0]
            predictions = np.argmax(logits, axis=-1)

            # Remove ignored index (-100) and convert ids to label strings
            true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
            true_predictions = [
                [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]

            word_true, word_pred = untokenize_labels_predictions(word_ids, true_labels, true_predictions)
            flat_true = [label for sentence in word_true for label in sentence]
            flat_pred = [label for sentence in word_pred for label in sentence]

            # Multi-class report over the full label set, so every class key
            # exists even when a class has no support in this evaluation.
            report = classification_report(
                y_true=flat_true,
                y_pred=flat_pred,
                labels=class_names,
                output_dict=True,
                zero_division=0,
            )
            # NOTE(review): with an explicit label list some scikit-learn
            # versions may report 'micro avg' instead of 'accuracy' - compute
            # it directly in that case so the return value is always defined.
            if 'accuracy' not in report:
                report['accuracy'] = float(np.mean(np.asarray(flat_true) == np.asarray(flat_pred)))

            # Macro average over every class except 'O'; support is summed.
            macro_wo_o = {
                metric: sum(report[name][metric] for name in entity_names) / len(entity_names)
                for metric in ('precision', 'recall', 'f1-score')
            }
            macro_wo_o['support'] = sum(report[name]['support'] for name in entity_names)
            report['macro_wo_O'] = macro_wo_o

            report_df = pd.DataFrame(report).round(3).T

            # Binary view: 'O' versus any other class.
            binary_predictions = ['0' if label == 'O' else '1' for label in flat_pred]
            binary_labels = ['0' if label == 'O' else '1' for label in flat_true]
            binary_classification_report = classification_report(
                y_true=binary_labels,
                y_pred=binary_predictions,
                labels=['0', '1'],
                target_names=['O', 'I'],
                digits=3,
                output_dict=True,
                zero_division=0,
            )

            # The output folders may not exist on a fresh checkout.
            os.makedirs(graphs_dir, exist_ok=True)
            os.makedirs(reports_dir, exist_ok=True)

            cm = confusion_matrix(y_true=flat_true, y_pred=flat_pred, labels=class_names)
            disp = ConfusionMatrixDisplay(cm, display_labels=np.array(class_names))
            fig, ax = plt.subplots(figsize=(8, 8))
            disp.plot(ax=ax)
            # Save the figure to an image file, then release it: this runs
            # once per evaluation, so open figures would otherwise pile up.
            fig.savefig(f'{graphs_dir}/confusion_matrix.png')
            plt.close(fig)

            with open(f"{reports_dir}/multiclass_classification_report.json", "w") as f:
                json.dump(report_df.to_dict(), f, indent=4)
            with open(f"{reports_dir}/binary_classification_report.json", "w") as f:
                json.dump(binary_classification_report, f, indent=4)

            return {
                "precision": report['macro_wo_O']['precision'],
                "recall": report['macro_wo_O']['recall'],
                "f1": report['macro_wo_O']['f1-score'],
                "accuracy": report['accuracy'],
            }
        return compute_metrics


In [9]:
dataset_name = 'phee'
model_name = 'sci5'
# Hub checkpoint shared by the tokenizer and the model.
checkpoint = "razent/SciFive-base-Pubmed"
# Fraction of each split kept after shuffling (0.1 = 10%).
subsample_fraction = 0.1

for i in range(1):
    data = load_dataset('csv', data_files={'train': f'data/processed/{dataset_name}/fold{i}/train.csv'})

    dataset = data['train'].train_test_split(test_size=0.15, seed=42)  # 85% training, 15% validation

    for file_type in ['train', 'test']:
        split = dataset[file_type].shuffle(seed=42)
        split = split.select(range(int(subsample_fraction * len(split))))
        dataset[file_type] = split.map(transform)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    tokenized_datasets = dataset.map(
        lambda examples: tokenize_and_align_labels_wrapper(examples, tokenizer),
        batched=True,
        remove_columns=dataset['train'].column_names,
    )

    # Word ids of the tokenized validation split, used by compute_metrics to
    # map token-level predictions back to the original words. The tokenizer
    # arguments must stay identical to those in tokenize_and_align_labels.
    test_word_ids = [
        tokenizer(sentence, truncation=True, is_split_into_words=True).word_ids()
        for sentence in dataset['test']['sentence']
    ]

    # Size the classification head from the label maps: without num_labels
    # the head defaults to two classes while the label ids here run 0-5.
    # NOTE(review): the ValueError recorded under this cell shows that the
    # installed transformers release has no token-classification head
    # registered for T5 configs; this call needs a release that provides one
    # (T5ForTokenClassification) - confirm the installed version.
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'./models/{model_name}',
        num_train_epochs=50,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        overwrite_output_dir=True,
        learning_rate=3e-5,
        metric_for_best_model="f1",
        load_best_model_at_end=True,
        report_to="none"
    )

    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.01)
    compute_metrics = TrainingMonitor().compute_metrics_factory(
        test_word_ids, fold_no=i, dataset_name=dataset_name, model_name=model_name
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    # Train the model, then score the restored best checkpoint on the
    # validation split.
    trainer.train()
    eval_result = trainer.evaluate()
    metric = eval_result["eval_f1"]



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForTokenClassification.
Model type should be one of AlbertConfig, BertConfig, BigBirdConfig, BioGptConfig, BloomConfig, BrosConfig, CamembertConfig, CanineConfig, ConvBertConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, ErnieConfig, ErnieMConfig, EsmConfig, FalconConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, IBertConfig, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LiltConfig, LongformerConfig, LukeConfig, MarkupLMConfig, MegaConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, MptConfig, MraConfig, NezhaConfig, NystromformerConfig, QDQBertConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, SqueezeBertConfig, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig.

## Evaluation

In [None]:
# TODO: train on the whole training data with the best hyperparameters,
# then evaluate on the test set.

