In [None]:
!pip install -q transformers
!pip install -q torchinfo
!pip install -U -q datasets fsspec huggingface_hub # Hugging Face's dataset library
!pip install -q evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title Imports

import numpy as np

import transformers
import evaluate

from datasets import load_dataset
from datasets import Dataset
from torchinfo import summary
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import itertools
from sklearn.utils.class_weight import compute_class_weight

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from itertools import combinations
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
import os
import xml.etree.ElementTree as ET

def parse_drugbank_corpus(drugbank_dir):
    """
    Parses all XML files in the drugbank_dir.
    Returns a list of dicts with sentences, entities, and DDIs.
    """
    data = []

    for filename in os.listdir(drugbank_dir):
        #print(filename)
        if filename.endswith(".xml"):
            file_path = os.path.join(drugbank_dir, filename)
            tree = ET.parse(file_path)
            root = tree.getroot()

            for sentence in root.iter("sentence"):
                sent_text = sentence.attrib.get("text")
                sent_id = sentence.attrib.get("id")

                entities = {}
                for entity in sentence.iter("entity"):
                    ent_id = entity.attrib["id"]
                    ent_text = entity.attrib["text"]
                    ent_char_offset = entity.attrib.get("charOffset", "")
                    entities[ent_id] = {
                        "id": ent_id,
                        "text": ent_text,
                        "char_offset": ent_char_offset
                    }

                ddilist = []
                for pair in sentence.iter("pair"):
                    ddi_label = pair.attrib["ddi"]
                    if ddi_label == "true":
                        e1 = pair.attrib["e1"]
                        e2 = pair.attrib["e2"]
                        ddi_type = pair.attrib.get("type", "")
                        ddilist.append({
                            "drug1": entities[e1]["text"],
                            "drug2": entities[e2]["text"],
                            "interaction_type": ddi_type
                        })

                data.append({
                    "sentence_id": sent_id,
                    "sentence_text": sent_text,
                    "entities": list(entities.values()),
                    "ddis": ddilist
                })

    return data

# Parse your specific folder
drugbank_dir_train = "/content/drive/MyDrive/w266 Final Project/Train/DrugBank"
drugbank_dir_test = "/content/drive/MyDrive/w266 Final Project/Test/Test for DDI Extraction task/DrugBank"
unfiltered_drugbank_sentences_train = parse_drugbank_corpus(drugbank_dir_train)
unfiltered_drugbank_sentences_test = parse_drugbank_corpus(drugbank_dir_test)

print(f"Parsed {len(unfiltered_drugbank_sentences_train)} sentences from DrugBank train.")
print(f"Parsed {len(unfiltered_drugbank_sentences_test)} sentences from DrugBank test.")

Parsed 5675 sentences from DrugBank train.
Parsed 973 sentences from DrugBank test.


In [None]:
# Filter sentences with 2 or more entities in the sentence
drugbank_sentences_train = [s for s in unfiltered_drugbank_sentences_train if len(s['entities']) >= 2]
drugbank_sentences_test = [s for s in unfiltered_drugbank_sentences_test if len(s['entities']) >= 2]

print(f"{len(drugbank_sentences_train)} filtered sentences from DrugBank Train.")
print(f"{len(drugbank_sentences_test)} filtered sentences from DrugBank Test.")

3256 filtered sentences from DrugBank Train.
620 filtered sentences from DrugBank Test.


In [None]:
count_with_ddis_train = sum(1 for s in drugbank_sentences_train if len(s['ddis']) >= 1)
count_with_ddis_test = sum(1 for s in drugbank_sentences_test if len(s['ddis']) >= 1)

print("--------------TRAIN--------------------------")
print("Number of entries with at least one DDI:", count_with_ddis_train)
print("Total number of entries:", len(drugbank_sentences_train))
print("Percentage of entries with at least one DDI:", count_with_ddis_train / len(drugbank_sentences_train) * 100, "%")
print("Percentage of entries with at least one DDI - unfiltered:", count_with_ddis_train / len(unfiltered_drugbank_sentences_train) * 100, "%")


print("--------------TEST---------------------------")
print("Number of entries with at least one DDI:", count_with_ddis_test)
print("Total number of entries:", len(drugbank_sentences_test))
print("Percentage of entries with at least one DDI:", count_with_ddis_test / len(drugbank_sentences_test) * 100, "%")
print("Percentage of entries with at least one DDI - unfiltered:", count_with_ddis_test / len(unfiltered_drugbank_sentences_test) * 100, "%")

--------------TRAIN--------------------------
Number of entries with at least one DDI: 1937
Total number of entries: 3256
Percentage of entries with at least one DDI: 59.490171990171994 %
Percentage of entries with at least one DDI - unfiltered: 34.13215859030837 %
--------------TEST---------------------------
Number of entries with at least one DDI: 415
Total number of entries: 620
Percentage of entries with at least one DDI: 66.93548387096774 %
Percentage of entries with at least one DDI - unfiltered: 42.65159301130524 %


In [None]:
#account for sentences that have have multiple DDIs
for i, sentence in enumerate(drugbank_sentences_train[0:30]):
  if len(sentence['ddis']) > 1:
    print("SENTENCE",i,  sentence['sentence_text'])
    print("ENTITIES", len(sentence['entities']))
    print("DDIS", sentence['ddis'])

SENTENCE 16 Bosentan is also expected to reduce plasma concentrations of other statins that have significant metabolism by CYP3A4, such as lovastatin and atorvastatin.
ENTITIES 4
DDIS [{'drug1': 'Bosentan', 'drug2': 'statins', 'interaction_type': 'mechanism'}, {'drug1': 'Bosentan', 'drug2': 'lovastatin', 'interaction_type': 'mechanism'}, {'drug1': 'Bosentan', 'drug2': 'atorvastatin', 'interaction_type': 'mechanism'}]
SENTENCE 18 Warfarin: Co-administration of bosentan 500 mg b.i.d. for 6 days decreased the plasma concentrations of both S-warfarin (a CYP2C9 substrate) and R-warfarin (a CYP3A4 substrate) by 29 and 38%, respectively.
ENTITIES 4
DDIS [{'drug1': 'bosentan', 'drug2': 'S-warfarin', 'interaction_type': 'mechanism'}, {'drug1': 'bosentan', 'drug2': 'R-warfarin', 'interaction_type': 'mechanism'}]
SENTENCE 23 Corticosteroids and Corticotropin (ACTH): may potentiate amphotericin B- induced hypokalemia which may predispose the patient to cardiac dysfunction.
ENTITIES 4
DDIS [{'drug1

In [None]:
def generate_drug_pairs(entities):
  sentence_entities = []
  for entity in entities:
    sentence_entities.append(entity["text"])

  drug_pairs = list(combinations(sentence_entities, 2))
  #print(drug_pairs)
  return drug_pairs

In [None]:
# Helper to parse 'start-end' string into a tuple of integers
def parse_offset(offset_str):
    start, end = offset_str.split('-')
    return int(start), int(end)

In [None]:
train_formatted_data = []

for i,s in enumerate(drugbank_sentences_train):
  try:
    #print(s['sentence_text'])
    #print(s['entities'])
    # print(i, s['ddis'])

    #sentence_text = s['sentence_text']
    #entities = s['entities']
    ddis = s.get('ddis', [])  # interactions
    #print(ddis)

    drug_pairs = generate_drug_pairs(s['entities'])
    #print(drug_pairs)

    for drug_pair in drug_pairs:
      formatted_sentence = s['sentence_text']
      entity_names = []
      start_list = []
      end_list = []
      #print ("DRUG PAIR", drug_pair)

      for entity in s['entities']:
        #print(entity)
        if len(entity_names) > 1:
          #print("done")
          break

        if entity['text'] in drug_pair:
          start, end = parse_offset(entity['char_offset'])
          start_list.append(start)
          end_list.append(end)
          entity_names.append(entity['text'])

        #print(start_list)
        #print(end_list)

      formatted_sentence = formatted_sentence[:end_list[-1]+1] + f"[/E2]" + formatted_sentence[end_list[-1]+1:]
      formatted_sentence = formatted_sentence[:start_list[-1]] + f"[E2]" + formatted_sentence[start_list[-1]:]
      formatted_sentence = formatted_sentence[:end_list[0]+1] + f"[/E1]" + formatted_sentence[end_list[0]+1:]
      formatted_sentence = formatted_sentence[:start_list[0]] + f"[E1]" + formatted_sentence[start_list[0]:]
      #print(formatted_sentence)

      label = "false"
      for ddi in ddis:
        d1 = ddi['drug1'].lower()
        d2 = ddi['drug2'].lower()
        pair_lower = [name.lower() for name in entity_names]
        if (d1 in pair_lower and d2 in pair_lower):
          label = ddi['interaction_type']
          break

      # print("Formatted:", formatted_sentence)
      # print("Label:", label)
      # print("---")

      train_formatted_data.append({
              "sentence": formatted_sentence,
              "labels": label
          })

  except ValueError as e:
    # print(f"Skipping due to ValueError: {e}")
    continue

In [None]:
train_formatted_data

[{'sentence': '[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable [E2]Contraceptives[/E2]: An interaction study demonstrated that co-administration of bosentan and the oral hormonal contraceptive Ortho-Novum produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.',
  'labels': 'false'},
 {'sentence': '[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of [E2]bosentan[/E2] and the oral hormonal contraceptive Ortho-Novum produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.',
  'labels': 'false'},
 {'sentence': '[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of bosentan and the oral [E2]hormonal contraceptive[/E2] Ortho-Novum pr

In [None]:
test_formatted_data = []

for i,s in enumerate(drugbank_sentences_test):
  try:
    #print(s['sentence_text'])
    #print(s['entities'])
    # print(i, s['ddis'])

    #sentence_text = s['sentence_text']
    #entities = s['entities']
    ddis = s.get('ddis', [])  # interactions
    #print(ddis)

    drug_pairs = generate_drug_pairs(s['entities'])
    #print(drug_pairs)

    for drug_pair in drug_pairs:
      formatted_sentence = s['sentence_text']
      entity_names = []
      start_list = []
      end_list = []
      #print ("DRUG PAIR", drug_pair)

      for entity in s['entities']:
        #print(entity)
        if len(entity_names) > 1:
          #print("done")
          break

        if entity['text'] in drug_pair:
          start, end = parse_offset(entity['char_offset'])
          start_list.append(start)
          end_list.append(end)
          entity_names.append(entity['text'])

        #print(start_list)
        #print(end_list)

      formatted_sentence = formatted_sentence[:end_list[-1]+1] + f"[/E2]" + formatted_sentence[end_list[-1]+1:]
      formatted_sentence = formatted_sentence[:start_list[-1]] + f"[E2]" + formatted_sentence[start_list[-1]:]
      formatted_sentence = formatted_sentence[:end_list[0]+1] + f"[/E1]" + formatted_sentence[end_list[0]+1:]
      formatted_sentence = formatted_sentence[:start_list[0]] + f"[E1]" + formatted_sentence[start_list[0]:]
      #print(formatted_sentence)

      label = "false"
      for ddi in ddis:
        d1 = ddi['drug1'].lower()
        d2 = ddi['drug2'].lower()
        pair_lower = [name.lower() for name in entity_names]
        if (d1 in pair_lower and d2 in pair_lower):
          label = ddi['interaction_type']
          break

      # print("Formatted:", formatted_sentence)
      # print("Label:", label)
      # print("---")

      test_formatted_data.append({
              "sentence": formatted_sentence,
              "labels": label
          })

  except ValueError as e:
    # print(f"Skipping due to ValueError: {e}")
    continue

In [None]:
#classes are imbalanced

df_train = pd.DataFrame(train_formatted_data)
df_test = pd.DataFrame(test_formatted_data)


In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None) # Display full content of columns

df_train

Unnamed: 0,sentence,labels
0,"[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable [E2]Contraceptives[/E2]: An interaction study demonstrated that co-administration of bosentan and the oral hormonal contraceptive Ortho-Novum produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.",false
1,"[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of [E2]bosentan[/E2] and the oral hormonal contraceptive Ortho-Novum produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.",false
2,"[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of bosentan and the oral [E2]hormonal contraceptive[/E2] Ortho-Novum produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.",false
3,"[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of bosentan and the oral hormonal contraceptive [E2]Ortho-Novum[/E2] produced average decreases of norethindrone and ethinyl estradiol levels of 14% and 31%, respectively.",false
4,"[E1]Hormonal Contraceptives[/E1], Including Oral, Injectable, Transdermal, and Implantable Contraceptives: An interaction study demonstrated that co-administration of bosentan and the oral hormonal contraceptive Ortho-Novum produced average decreases of [E2]norethindrone[/E2] and ethinyl estradiol levels of 14% and 31%, respectively.",false
...,...,...
25725,Nephrotoxicity has been reported following concomitant administration of cephalosporins with aminoglycoside antibiotics or potent [E1]diuretics[/E1] such as [E2]furosemide[/E2].,false
25726,"Renal function should be carefully monitored, especially if higher dosages of the [E1]aminoglycosides[/E1] are to be administered or if therapy is prolonged, because of the potential nephrotoxicity and ototoxicity of [E2]aminoglycosidic antibiotics[/E2].",false
25727,"[E1]Chloramphenicol[/E1] has been shown to be antagonistic to [E2]beta-lactam antibiotics[/E2], including ceftazidime, based on in vitro studies and time kill curves with enteric gram-negative bacilli.",effect
25728,"[E1]Chloramphenicol[/E1] has been shown to be antagonistic to beta-lactam antibiotics, including [E2]ceftazidime[/E2], based on in vitro studies and time kill curves with enteric gram-negative bacilli.",effect


In [None]:
df_test

Unnamed: 0,sentence,labels
0,"Usage with [E1]Alcohol[/E1]: Due to the potential for increased CNS depressants effects, [E2]alcohol[/E2] should be used with caution in patients who are currently receiving pentazocine.",false
1,"Usage with [E1]Alcohol[/E1]: Due to the potential for increased CNS depressants effects, alcohol should be used with caution in patients who are currently receiving [E2]pentazocine[/E2].",advise
2,"Usage with Alcohol: Due to the potential for increased CNS depressants effects, [E1]alcohol[/E1] should be used with caution in patients who are currently receiving [E2]pentazocine[/E2].",advise
3,[E1]Aminoglutethimide[/E1] administered concomitantly with [E2]depo-subQ provera 104[/E2] may significantly decrease the serum concentrations of MPA.,mechanism
4,[E1]Aminoglutethimide[/E1] administered concomitantly with depo-subQ provera 104 may significantly decrease the serum concentrations of [E2]MPA[/E2].,false
...,...,...
5171,[E1]Monoamine oxidase inhibitors[/E1] or tricyclic antidepressants may potentiate the action of [E2]sympathomimetic amines[/E2].,effect
5172,Monoamine oxidase inhibitors or [E1]tricyclic antidepressants[/E1] may potentiate the action of [E2]sympathomimetic amines[/E2].,effect
5173,Concurrent use of [E1]alcohol[/E1] and other [E2]CNS depression-producing drugs[/E2] may increase the CNS depressant effects of methyprylon or these other medications.,false
5174,Concurrent use of [E1]alcohol[/E1] and other CNS depression-producing drugs may increase the CNS depressant effects of [E2]methyprylon[/E2] or these other medications.,effect


In [None]:
df_train = df_train[df_train['labels'].astype(bool)]  # filters out empty strings and NaNs
df_test = df_test[df_test['labels'].astype(bool)]

df_train = df_train.dropna(subset=["labels"])
df_test = df_test.dropna(subset=["labels"])

df_train = df_train[df_train['labels'].notna() & (df_train['labels'] != '')]
df_test = df_test[df_test['labels'].notna() & (df_test['labels'] != '')]

In [None]:
print(df_train['labels'].value_counts())

labels
false        21061
mechanism     1821
effect        1735
advise         928
int            184
Name: count, dtype: int64


In [None]:
print(df_test['labels'].value_counts())

labels
false        4108
mechanism     384
effect        340
advise        228
int           116
Name: count, dtype: int64


In [None]:
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return metric.compute(predictions=predictions, references=labels, average="weighted")

In [None]:
#shuffle the results of my training dataframe

df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
metric = evaluate.load("f1")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return metric.compute(predictions=predictions, references=labels, average="weighted")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def preprocess_tokenization(data, tokenizer):
    review_text = data['sentence']
    labels = data['labels']

    encoded = tokenizer.batch_encode_plus(
        review_text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )

    # Convert each label in the batch to int
    encoded["labels"] = [int(label) for label in labels]

    return encoded

### Attempt 1 - Baseline Bert, BioBERT, SpanBert - no class weights###

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
df_test_copy

Unnamed: 0,sentence,labels
0,"Methscopolamine may interact with antidepressants (tricyclic type), MAO inhibitors (e.g., phenelzine, [E1]linezolid[/E1], tranylcypromine, isocarboxazid, selegiline, furazolidone), quinidine, amantadine, antihistamines (e.g., diphenhydramine), other anticholinergics, potassium chloride supplements, antacids, absorbent-type anti-diarrhea medicines (e.g., kaolin-pectin), phenothiazines (e.g., chlorpromazine, [E2]promethazine[/E2]).",false
1,"Because there is a theoretical basis that these effects may be additive, use of ergotamine-containing or ergot-type medications (like [E1]dihydroergotamine[/E1] or methysergide) and [E2]sumatriptan[/E2] within 24 hours of each other should be avoided.",advise
2,"Oral Contraceptives: In 10 healthy women, the pharmacokinetic profiles of norethindrone and [E1]ethinyl estradiol[/E1] following administration of a single dose containing 1.0 mg of [E2]norethindrone acetate[/E2] and 75 g of ethinyl estradiol were studied.",false
3,"Before taking this medication, tell your doctor if you are taking a tricyclic antidepressant such as amitriptyline (Elavil), amoxapine (Asendin), doxepin (Sinequan), [E1]nortriptyline[/E1] (Pamelor), imipramine (Tofranil), clomipramine ([E2]Anafranil[/E2]), protriptyline (Vivactil), or desipramine (Norpramin).",false
4,"[E1]Mephenytoin[/E1] may also affect the effects of other drugs, which include some steroid medications, [E2]warfarin[/E2], certain heart medicines, birth control pills, anti-infective medicines, furosemide and theophylline Please note that Mephenytoin may interact with other drugs that are not listed here.",effect
...,...,...
5171,"Interactions for [E1]Vitamin B1[/E1] ([E2]Thiamine[/E2]): Loop Diuretics, Oral Contraceptives, Stavudine, Tricyclic Antidepressants",false
5172,"No significant adverse interactions with common premedications (such as [E1]atropine[/E1], scopolamine, glycopyrrolate, diazepam, [E2]hydroxyzine[/E2], and other muscle relaxants) or local anesthetics have been observed.",false
5173,"Concomitant administration of [E1]terfenadine[/E1] with [E2]clarithromycin[/E2], erythromycin, or troleandomycin is contraindicated: Pending full characterization of potential interactions, concomitant administration of terfenadine with other macrolide antibiotics, including azithromycin, is not recommended.",advise
5174,Patients receiving [E1]sirolimus[/E1] or nifedipine in combination with [E2]MYCAMINE[/E2] should be monitored for sirolimus or nifedipine toxicity and sirolimus or nifedipine dosage should be reduced if necessary.,advise


In [None]:
#original attempt 1

def fine_tune_classification_model(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size = 16,
                                   num_epochs = 2):
    """
    Preprocess the data using the given tokenizer (we've give you the code for that part).
    Create the training arguments and trainer for the given model and data (write your code for that).
    Then train it.
    """
    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    df_train["labels"] = df_train["labels"].map(label_map)
    df_test["labels"] = df_test["labels"].map(label_map)


    train_data = Dataset.from_pandas(df_train)
    dev_data = Dataset.from_pandas(df_test)

    preprocessed_train_data = train_data.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_data.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})

    ### YOUR CODE HERE

    save_path = "/content/drive/MyDrive/w266_Final_Project_Output"

    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/w266 Final Project",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none'
    )

    trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=preprocessed_train_data,
    eval_dataset=preprocessed_dev_data,
    compute_metrics=compute_metrics
    )

    ### END YOUR CODE

    trainer.train()

    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)


    print("\nRunning detailed evaluation on dev set...\n")
    predictions_output = trainer.predict(preprocessed_dev_data)


    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    # Print classification report
    report = classification_report(labels, preds, digits=4, target_names=label_map.keys())
    print("Classification Report:\n", report)

    # Print confusion matrix
    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)


In [None]:
#Attempt 1 - SpanBERT
num_labels = 5


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/215M [00:00<?, ?B/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,0.2072,0.321486,0.918448
2,0.1118,0.373338,0.914369
3,0.0665,0.379978,0.924341
4,0.0364,0.41419,0.924251



Running detailed evaluation on dev set...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8657    0.7891    0.8256       384
      effect     0.6836    0.8324    0.7507       340
      advise     0.8636    0.8333    0.8482       228
         int     0.8140    0.3017    0.4403       116
       false     0.9610    0.9705    0.9657      4108

    accuracy                         0.9270      5176
   macro avg     0.8376    0.7454    0.7661      5176
weighted avg     0.9281    0.9270    0.9243      5176

Confusion Matrix:
 [[ 303   15    9    4   53]
 [  10  283    2    0   45]
 [   0   11  190    3   24]
 [   2   39    0   35   40]
 [  35   66   19    1 3987]]


In [None]:
#Attempt 1 - BioBERT
num_labels = 5  # your number of label classes


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model(biobert_classification_model, biobert_tokenizer, df_train, df_test)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,0.1351,0.310629,0.93127
2,0.0606,0.329888,0.938408



Running detailed evaluation on dev set...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8769    0.8906    0.8837       384
      effect     0.7542    0.7853    0.7695       340
      advise     0.8705    0.8553    0.8628       228
         int     0.7778    0.4224    0.5475       116
       false     0.9684    0.9771    0.9727      4108

    accuracy                         0.9403      5176
   macro avg     0.8496    0.7861    0.8072      5176
weighted avg     0.9390    0.9403    0.9384      5176

Confusion Matrix:
 [[ 342    7    7    4   24]
 [  10  267    7    0   56]
 [   3    2  195    3   25]
 [   4   37    0   49   26]
 [  31   41   15    7 4014]]


In [None]:
#Attempt 1 - BASELINE BERT
num_labels = 5  # your number of label classes


model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model(bert_classification_model, bert_tokenizer, df_train, df_test)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,0.4102,0.528574,0.778373
2,0.3437,0.506541,0.78978



Running detailed evaluation on dev set...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.5967    0.2812    0.3823       384
      effect     0.6132    0.3824    0.4710       340
      advise     0.6639    0.3465    0.4553       228
         int     0.7500    0.0259    0.0500       116
       false     0.8408    0.9537    0.8937      4108

    accuracy                         0.8188      5176
   macro avg     0.6929    0.3979    0.4505      5176
weighted avg     0.7979    0.8188    0.7898      5176

Confusion Matrix:
 [[ 108    1    0    0  275]
 [   1  130    4    0  205]
 [   0    0   79    0  149]
 [   0    0    0    3  113]
 [  72   81   36    1 3918]]


### Attempt 2 - Bert with Class weights ###

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)

    return {
        "eval_f1": f1_score(labels, preds, average="weighted"),
        "eval_macro_f1": f1_score(labels, preds, average="macro")
    }

In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    report = classification_report(labels, preds, output_dict=True)
    return {
        "eval_macro_f1": report["macro avg"]["f1-score"],
        "eval_weighted_f1": report["weighted avg"]["f1-score"],
        "eval_int_f1": report.get('3', {}).get('f1-score')
    }

In [None]:
from torch import nn
from transformers import Trainer
from transformers import EarlyStoppingCallback

class CustomTrainer(Trainer):

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
def fine_tune_classification_model2(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size=8,
                                   num_epochs=3):

    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    train_data["labels"] = train_data["labels"].map(label_map)
    dev_data["labels"] = dev_data["labels"].map(label_map)

    train_dataset = Dataset.from_pandas(train_data)
    dev_dataset = Dataset.from_pandas(dev_data)
    print(np.unique(train_dataset['labels']))

    preprocessed_train_data = train_dataset.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_dataset.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})

    unique_labels = np.array(sorted(label_map.values()))  # [0, 1, 2, 3, 4]
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_labels,
        y=train_data['labels']
    )
    class_weights = np.minimum(class_weights, 5)
    #class_weights[3] *= 2.0
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")


    for label, idx in label_map.items():
      print(f"Label: {label:10} → ID: {idx} → Weight: {class_weights_tensor[idx]:.4f}")


    ### Training args
    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/w266 Final Project",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        seed=42,
        learning_rate=3e-5,
        warmup_steps=200,
        lr_scheduler_type='linear',
        metric_for_best_model='eval_macro_f1'
    )

    trainer = CustomTrainer(
    model=classification_model,
    args=training_args,
    train_dataset=preprocessed_train_data,
    eval_dataset=preprocessed_dev_data,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor
    )

    # print("Final check: Class weights linked to label IDs:")
    # for label, idx in sorted(label_map.items(), key=lambda x: x[1]):
    #   print(f"  ID {idx}: {label:<10} → Weight: {class_weights_tensor[idx].item():.4f}")

    trainer.train()

    save_path = "/content/drive/MyDrive/w266 Final Project"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    predictions_output = trainer.predict(preprocessed_dev_data)
    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    report = classification_report(labels, preds, digits=4, target_names=label_map.keys())
    print("Classification Report:\n", report)

    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)

In [None]:
#Attempt 1 - SpanBERT #hyperparameter 8 batch size, 2 epochs
num_labels = 5


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 3.0000
Label: int        → ID: 3 → Weight: 3.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3691,0.965028,0.756477,0.916712,0.471204
2,0.1827,0.872766,0.77838,0.925234,0.502618


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8658    0.8229    0.8438       384
      effect     0.6707    0.8206    0.7381       340
      advise     0.7893    0.9035    0.8425       228
         int     0.6400    0.4138    0.5026       116
       false     0.9707    0.9591    0.9649      4108

    accuracy                         0.9252      5176
   macro avg     0.7873    0.7840    0.7784      5176
weighted avg     0.9278    0.9252    0.9252      5176

Confusion Matrix:
 [[ 316   17    5   10   36]
 [   1  279   17    0   43]
 [   0    7  206    3   12]
 [   2   38    0   48   28]
 [  46   75   33   14 3940]]


In [None]:
#Attempt 2 - SpanBERT #hyperparameter 8 batch size, 3 epochs
num_labels = 5  # your number of label classes


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3594,0.682553,0.791588,0.925594,0.505263
2,0.2012,0.824074,0.795161,0.931006,0.516854
3,0.0958,0.912983,0.796269,0.931629,0.511364


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7930    0.8880    0.8378       384
      effect     0.7268    0.8294    0.7747       340
      advise     0.8766    0.9035    0.8898       228
         int     0.7500    0.3879    0.5114       116
       false     0.9729    0.9623    0.9676      4108

    accuracy                         0.9326      5176
   macro avg     0.8239    0.7942    0.7963      5176
weighted avg     0.9342    0.9326    0.9316      5176

Confusion Matrix:
 [[ 341    3    5    0   35]
 [  10  282    4    0   44]
 [   1    2  206    3   16]
 [  19   37    0   45   15]
 [  59   64   20   12 3953]]


In [None]:
#Attempt 3 - SpanBERT BEST MODEL #hyperparameter 8 batch size, 3 epochs, 3e-5 learning rate
num_labels = 5  # your number of label classes


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3567,0.858904,0.797243,0.929382,0.52809
2,0.1969,0.940759,0.770529,0.924116,0.449704
3,0.1049,0.967768,0.807336,0.933,0.544379


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8094    0.8516    0.8299       384
      effect     0.7399    0.8618    0.7962       340
      advise     0.8787    0.9211    0.8994       228
         int     0.8679    0.3966    0.5444       116
       false     0.9696    0.9640    0.9668      4108

    accuracy                         0.9343      5176
   macro avg     0.8531    0.7990    0.8073      5176
weighted avg     0.9364    0.9343    0.9330      5176

Confusion Matrix:
 [[ 327    3    5    0   49]
 [   9  293    2    0   36]
 [   1    0  210    3   14]
 [   5   40    0   46   25]
 [  62   60   22    4 3960]]


In [None]:
#Attempt 4 - SpanBERT #hyperparameter 4 batch size, 4 epochs, 3e-5 learning rate
num_labels = 5  # your number of label classes


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.5694,0.97492,0.675585,0.854406,0.433498
2,0.39,1.099869,0.726345,0.893824,0.46729


KeyboardInterrupt: 

In [None]:
#Attempt 5 - SpanBERT #hyperparameter 8 batch size, 4 epochs, 3.5e-5 learning rate, linear scheduler, warm up
num_labels = 5  # your number of label classes


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4399,0.827114,0.75868,0.911817,0.508287
2,0.2295,0.855767,0.804589,0.931949,0.556213
3,0.1232,0.927149,0.80653,0.935959,0.544379


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8608    0.8854    0.8729       384
      effect     0.7245    0.8353    0.7760       340
      advise     0.8235    0.9211    0.8696       228
         int     0.8679    0.3966    0.5444       116
       false     0.9730    0.9667    0.9698      4108

    accuracy                         0.9372      5176
   macro avg     0.8499    0.8010    0.8065      5176
weighted avg     0.9394    0.9372    0.9360      5176

Confusion Matrix:
 [[ 340    8    9    0   27]
 [   3  284   12    1   40]
 [   0    2  210    3   13]
 [   3   37    0   46   30]
 [  49   61   24    3 3971]]


In [None]:
#Attempt 6 - SpanBERT #hyperparameter 8 batch size, 4 epochs, 2.5e-5 learning rate, linear scheduler, warm up
num_labels = 5  # your number of label classes


model_checkpoint_name = "SpanBERT/spanbert-base-cased"
spanbert_tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")
spanbert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(spanbert_classification_model, spanbert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4223,0.828664,0.753272,0.907015,0.482051


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4223,0.828664,0.753272,0.907015,0.482051
2,0.2172,0.937377,0.775187,0.925254,0.448598
3,0.1107,0.99664,0.768063,0.926247,0.393617


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8000    0.8542    0.8262       384
      effect     0.7624    0.8588    0.8077       340
      advise     0.8285    0.8684    0.8480       228
         int     0.5139    0.3190    0.3936       116
       false     0.9691    0.9606    0.9648      4108

    accuracy                         0.9276      5176
   macro avg     0.7748    0.7722    0.7681      5176
weighted avg     0.9265    0.9276    0.9262      5176

Confusion Matrix:
 [[ 328    5    9   10   32]
 [   2  292    4    2   40]
 [   1    0  198    3   26]
 [  12   39    0   37   28]
 [  67   47   28   20 3946]]


In [None]:
#Attempt 1 - BioBERT
num_labels = 5


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 3.0000
Label: int        → ID: 3 → Weight: 3.0000
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3464,0.877134,0.78421,0.928388,0.505495
2,0.2233,0.843524,0.789914,0.928053,0.505376


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7818    0.8958    0.8350       384
      effect     0.7077    0.8118    0.7562       340
      advise     0.8729    0.9035    0.8879       228
         int     0.6714    0.4052    0.5054       116
       false     0.9733    0.9572    0.9651      4108

    accuracy                         0.9283      5176
   macro avg     0.8014    0.7947    0.7899      5176
weighted avg     0.9304    0.9283    0.9281      5176

Confusion Matrix:
 [[ 344    5    3    0   32]
 [  12  276    7   10   35]
 [   3    2  206    3   14]
 [   5   37    0   47   27]
 [  76   70   20   10 3932]]


In [None]:
#Attempt 2 - BioBERT --> BEST MODEL hyper parameters (8 batch size, 3 epochs, 2.5e-5 learning rate )
num_labels = 5  # your number of label classes


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3166,0.985255,0.769716,0.920271,0.479532
2,0.1608,1.040879,0.788187,0.935647,0.464286
3,0.0944,1.088097,0.801479,0.938377,0.52514


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8557    0.8802    0.8678       384
      effect     0.7811    0.8500    0.8141       340
      advise     0.8050    0.8509    0.8273       228
         int     0.7460    0.4052    0.5251       116
       false     0.9732    0.9730    0.9731      4108

    accuracy                         0.9399      5176
   macro avg     0.8322    0.7918    0.8015      5176
weighted avg     0.9394    0.9399    0.9384      5176

Confusion Matrix:
 [[ 338    5   14    0   27]
 [  13  289    6    2   30]
 [   0    3  194    3   28]
 [   8   36    0   47   25]
 [  36   37   27   11 3997]]


In [None]:
#Attempt 3 - BioBERT --> current hyper paramers (8 batch size, 3 epochs, 3e-5 learning rate )
num_labels = 5  # your number of label classes


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3163,0.91356,0.804798,0.936217,0.530387
2,0.1247,0.952871,0.809164,0.938149,0.559524
3,0.0745,1.052075,0.814913,0.939413,0.568047


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8560    0.8672    0.8616       384
      effect     0.7598    0.8559    0.8050       340
      advise     0.8584    0.8772    0.8677       228
         int     0.9057    0.4138    0.5680       116
       false     0.9711    0.9735    0.9723      4108

    accuracy                         0.9411      5176
   macro avg     0.8702    0.7975    0.8149      5176
weighted avg     0.9423    0.9411    0.9394      5176

Confusion Matrix:
 [[ 333    7    9    0   35]
 [   3  291   11    0   35]
 [   0    0  200    3   25]
 [   8   36    0   48   24]
 [  45   49   13    2 3999]]


In [None]:
#Attempt 4 - BioBERT
num_labels = 5  # your number of label classes


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 55.9326
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3515,1.180576,0.767009,0.917565,0.510417
2,0.1992,1.271153,0.774687,0.922348,0.507937


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7533    0.8828    0.8129       384
      effect     0.7177    0.8000    0.7566       340
      advise     0.7860    0.8860    0.8330       228
         int     0.6575    0.4138    0.5079       116
       false     0.9739    0.9523    0.9630      4108

    accuracy                         0.9221      5176
   macro avg     0.7777    0.7870    0.7747      5176
weighted avg     0.9253    0.9221    0.9223      5176

Confusion Matrix:
 [[ 339    6   14    0   25]
 [  15  272    7   10   36]
 [   0    2  202    3   21]
 [   9   36    0   48   23]
 [  87   63   34   12 3912]]


In [None]:
#BioBERT - class weights test
num_labels = 5


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 27.9663
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3654,1.274341,0.773542,0.925444,0.461538
2,0.1401,1.193497,0.794943,0.939903,0.477987


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8475    0.8828    0.8648       384
      effect     0.7302    0.8676    0.7930       340
      advise     0.8214    0.9079    0.8625       228
         int     0.8837    0.3276    0.4780       116
       false     0.9801    0.9727    0.9764      4108

    accuracy                         0.9418      5176
   macro avg     0.8526    0.7917    0.7949      5176
weighted avg     0.9447    0.9418    0.9399      5176

Confusion Matrix:
 [[ 339    4   12    0   29]
 [  11  295   17    0   17]
 [   1    6  207    3   11]
 [  18   36    0   38   24]
 [  31   63   16    2 3996]]


In [None]:
# BioBERT class weights test
num_labels = 5


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 15.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3625,1.224301,0.767762,0.918936,0.510417
2,0.1581,1.235154,0.783653,0.929169,0.488372


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8542    0.8698    0.8619       384
      effect     0.7015    0.8088    0.7514       340
      advise     0.8571    0.8421    0.8496       228
         int     0.7500    0.3621    0.4884       116
       false     0.9664    0.9676    0.9670      4108

    accuracy                         0.9308      5176
   macro avg     0.8259    0.7701    0.7837      5176
weighted avg     0.9311    0.9308    0.9292      5176

Confusion Matrix:
 [[ 334    9    7    0   34]
 [  12  275    6    0   47]
 [   0    4  192    3   29]
 [   8   38    0   42   28]
 [  37   66   19   11 3975]]


In [None]:
# BioBERT class weights test
num_labels = 5


model_checkpoint_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(biobert_classification_model, biobert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 3.0000
Label: int        → ID: 3 → Weight: 3.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3116,0.939087,0.776572,0.927345,0.422222
2,0.1528,0.854495,0.801053,0.936898,0.52809


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8626    0.8828    0.8726       384
      effect     0.7297    0.8735    0.7952       340
      advise     0.8377    0.8377    0.8377       228
         int     0.7581    0.4052    0.5281       116
       false     0.9743    0.9691    0.9717      4108

    accuracy                         0.9380      5176
   macro avg     0.8325    0.7937    0.8011      5176
weighted avg     0.9391    0.9380    0.9369      5176

Confusion Matrix:
 [[ 339    4    7    0   34]
 [  14  297    6    0   23]
 [   0    9  191    3   25]
 [   7   39    0   47   23]
 [  33   58   24   12 3981]]


In [None]:
#Attempt 2.a - weighted loss trainer BERT - weight decay = 0.01, batch size = 16, no warmup
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 27.9663
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4442,1.198576,0.69623,0.863915,0.510417
2,0.1946,1.55996,0.758366,0.913906,0.455959


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8146    0.9036    0.8568       384
      effect     0.6201    0.8353    0.7118       340
      advise     0.8476    0.7807    0.8128       228
         int     0.5714    0.3793    0.4560       116
       false     0.9668    0.9426    0.9545      4108

    accuracy                         0.9129      5176
   macro avg     0.7641    0.7683    0.7584      5176
weighted avg     0.9186    0.9129    0.9139      5176

Confusion Matrix:
 [[ 347    5    0   10   22]
 [   5  284    2    0   49]
 [   0   16  178    2   32]
 [   4   38    0   44   30]
 [  70  115   30   21 3872]]


In [None]:
#Attempt 2.b - weighted loss trainer BERT - weight decay = 0.01, batch size = 16, warmup, int class weight * 2, learning rate 2e-5
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 55.9326
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3517,1.476003,0.745316,0.908869,0.437209


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3517,1.476003,0.745316,0.908869,0.437209
2,0.1904,1.646701,0.772462,0.92348,0.444444


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8299    0.8516    0.8406       384
      effect     0.6898    0.8176    0.7483       340
      advise     0.8646    0.8684    0.8665       228
         int     0.6250    0.3448    0.4444       116
       false     0.9650    0.9598    0.9624      4108

    accuracy                         0.9247      5176
   macro avg     0.7949    0.7685    0.7725      5176
weighted avg     0.9249    0.9247    0.9235      5176

Confusion Matrix:
 [[ 327    7    0    9   41]
 [   4  278    8    0   50]
 [   0    4  198    3   23]
 [   4   43    0   40   29]
 [  59   71   23   12 3943]]


In [None]:
#Attempt 2.c - weighted loss trainer BERT- weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 55.9326
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4461,1.238813,0.760029,0.910769,0.492147
2,0.252,1.344469,0.776387,0.924365,0.452261


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8817    0.8542    0.8677       384
      effect     0.6742    0.7853    0.7255       340
      advise     0.8800    0.8684    0.8742       228
         int     0.5422    0.3879    0.4523       116
       false     0.9632    0.9613    0.9622      4108

    accuracy                         0.9248      5176
   macro avg     0.7883    0.7714    0.7764      5176
weighted avg     0.9251    0.9248    0.9244      5176

Confusion Matrix:
 [[ 328    4    0   10   42]
 [   5  267    4    0   64]
 [   0   11  198    3   16]
 [   5   37    0   45   29]
 [  34   77   23   25 3949]]


In [None]:
#Attempt 2.c reattempt - weighted loss trainer BERT - weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 55.9326
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.5288,1.280773,0.728571,0.889843,0.525714
2,0.3704,1.320964,0.746132,0.902281,0.494624


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7677    0.7917    0.7795       384
      effect     0.6058    0.8000    0.6895       340
      advise     0.8326    0.8070    0.8196       228
         int     0.6571    0.3966    0.4946       116
       false     0.9554    0.9396    0.9475      4108

    accuracy                         0.9015      5176
   macro avg     0.7637    0.7470    0.7461      5176
weighted avg     0.9064    0.9015    0.9023      5176

Confusion Matrix:
 [[ 304   22    0   10   48]
 [   3  272    3    0   62]
 [   0    1  184    3   40]
 [   3   37    0   46   30]
 [  86  117   34   11 3860]]


In [None]:
#Attempt 2.c reattempt - weighted loss trainer BERT - weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 15.0000
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4502,0.96485,0.757882,0.908176,0.467005
2,0.3303,1.083741,0.767664,0.915991,0.446602


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8304    0.8542    0.8421       384
      effect     0.6773    0.8147    0.7397       340
      advise     0.8655    0.8465    0.8559       228
         int     0.5111    0.3966    0.4466       116
       false     0.9598    0.9484    0.9541      4108

    accuracy                         0.9158      5176
   macro avg     0.7688    0.7721    0.7677      5176
weighted avg     0.9175    0.9158    0.9160      5176

Confusion Matrix:
 [[ 328    8    0    0   48]
 [   3  277    4    1   55]
 [   0    2  193    3   30]
 [   3   37    0   46   30]
 [  61   85   26   40 3896]]


In [None]:
#Attempt 2.c reattempt - weighted loss trainer BERT - weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.0000
Label: int        → ID: 3 → Weight: 5.0000
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4872,1.05672,0.739472,0.89585,0.511364
2,0.3837,1.056192,0.763921,0.903526,0.54023


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.6857    0.8125    0.7437       384
      effect     0.6707    0.8147    0.7357       340
      advise     0.8721    0.8377    0.8546       228
         int     0.8103    0.4052    0.5402       116
       false     0.9544    0.9365    0.9453      4108

    accuracy                         0.9030      5176
   macro avg     0.7987    0.7613    0.7639      5176
weighted avg     0.9089    0.9030    0.9035      5176

Confusion Matrix:
 [[ 312   10    0    0   62]
 [   4  277    2    0   57]
 [   1    0  191    0   36]
 [   3   37    0   47   29]
 [ 135   89   26   11 3847]]


In [None]:
#Attempt 2.c reattempt - weighted loss trainer BERT - weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 8.0000
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4363,1.078195,0.756899,0.903054,0.537143
2,0.2864,1.196624,0.759214,0.912947,0.427746


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7465    0.8281    0.7852       384
      effect     0.7214    0.7618    0.7411       340
      advise     0.9282    0.8509    0.8879       228
         int     0.6491    0.3190    0.4277       116
       false     0.9522    0.9562    0.9542      4108

    accuracy                         0.9150      5176
   macro avg     0.7995    0.7432    0.7592      5176
weighted avg     0.9140    0.9150    0.9129      5176

Confusion Matrix:
 [[ 318    8    0    6   52]
 [   4  259    2    0   75]
 [   0    0  194    2   32]
 [   4   37    0   37   38]
 [ 100   55   13   12 3928]]


In [None]:
#Attempt 2.c reattempt - weighted loss trainer BERT- weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 1e-5, early stopping
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 3.0000
Label: int        → ID: 3 → Weight: 3.0000
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4449,1.026309,0.742482,0.897971,0.494382
2,0.2876,0.982842,0.762741,0.914742,0.440476


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.7816    0.8203    0.8005       384
      effect     0.6954    0.8059    0.7466       340
      advise     0.9175    0.8289    0.8710       228
         int     0.7115    0.3190    0.4405       116
       false     0.9537    0.9567    0.9552      4108

    accuracy                         0.9167      5176
   macro avg     0.8119    0.7462    0.7627      5176
weighted avg     0.9169    0.9167    0.9147      5176

Confusion Matrix:
 [[ 315   10    0    5   54]
 [   1  274    3    0   62]
 [   0    2  189    2   35]
 [   2   37    0   37   40]
 [  85   71   14    8 3930]]


In [None]:
#Attempt 2.d - weighted loss trainer -BERT  weight decay = 0.05, batch size = 8 warmup, int class weight * 2, learning rate 5e-6, early stopping, dropout 0.35

bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 55.9326
Label: false      → ID: 4 → Weight: 0.2443


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4995,1.277794,0.733557,0.894139,0.5
2,0.378,1.300265,0.760741,0.903471,0.521739


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.6956    0.8151    0.7506       384
      effect     0.6691    0.8029    0.7299       340
      advise     0.8655    0.8465    0.8559       228
         int     0.7059    0.4138    0.5217       116
       false     0.9551    0.9362    0.9455      4108

    accuracy                         0.9028      5176
   macro avg     0.7782    0.7629    0.7607      5176
weighted avg     0.9075    0.9028    0.9035      5176

Confusion Matrix:
 [[ 313   10    0    2   59]
 [   4  273    2    0   61]
 [   1    1  193    0   33]
 [   3   37    0   48   28]
 [ 129   87   28   18 3846]]


In [None]:
#Attempt 2 - weighted loss trainer  BERT - hihdden drop out and attention dropout

num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"


config = AutoConfig.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)

bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model2(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 2.8258
Label: effect     → ID: 1 → Weight: 2.9659
Label: advise     → ID: 2 → Weight: 5.5450
Label: int        → ID: 3 → Weight: 10.0000
Label: false      → ID: 4 → Weight: 0.2443


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.4314,1.017866,0.741345,0.900209,0.51087
2,0.2579,1.252803,0.760883,0.908672,0.54023
3,0.1892,1.218732,0.768968,0.915871,0.505376
4,0.1014,1.481516,0.751642,0.913093,0.457447


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8431    0.8255    0.8342       384
      effect     0.6537    0.8382    0.7345       340
      advise     0.7763    0.7763    0.7763       228
         int     0.5972    0.3707    0.4574       116
       false     0.9609    0.9506    0.9557      4108

    accuracy                         0.9133      5176
   macro avg     0.7662    0.7523    0.7516      5176
weighted avg     0.9157    0.9133    0.9131      5176

Confusion Matrix:
 [[ 317    9    7   11   40]
 [   0  285    7    0   48]
 [   1    3  177    3   44]
 [   4   42    0   43   27]
 [  54   97   37   15 3905]]


## Attempt 3 - Focal Loss - ##

In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    report = classification_report(labels, preds, output_dict=True)
    return {
        "eval_macro_f1": report["macro avg"]["f1-score"],
        "eval_weighted_f1": report["weighted avg"]["f1-score"],
        "eval_int_f1": report.get('3', {}).get('f1-score')
    }

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
from torch import nn
from transformers import Trainer

class CustomLossTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
import torch
import torch.nn.functional as F


def focal_loss(logits, labels, gamma=2.0, alpha=0.25):
    # Calculate standard cross-entropy loss first.
    ce_loss = F.cross_entropy(logits, labels, reduction='none')

    # Get softmax probabilities.
    pt = torch.exp(-ce_loss)

    # Compute focal loss.
    focal_loss = alpha * (1 - pt) ** gamma * ce_loss
    return focal_loss.mean()

In [None]:
#original attempt 4

def fine_tune_classification_model4(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size = 16,
                                   num_epochs = 2):
    """
    Preprocess the data using the given tokenizer (we've give you the code for that part).
    Create the training arguments and trainer for the given model and data (write your code for that).
    Then train it.
    """
    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    df_train["labels"] = df_train["labels"].map(label_map)
    df_test["labels"] = df_test["labels"].map(label_map)


    train_data = Dataset.from_pandas(df_train)
    dev_data = Dataset.from_pandas(df_test)

    preprocessed_train_data = train_data.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_data.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})

    ### YOUR CODE HERE

    save_path = "/content/drive/MyDrive/w266_Final_Project_Output"

    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/w266 Final Project",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        seed=42
    )

    trainer = CustomLossTrainer(
    model=classification_model,
    args=training_args,
    train_dataset=preprocessed_train_data,
    eval_dataset=preprocessed_dev_data,
    compute_metrics=compute_metrics,
    loss_fn=focal_loss
    )

    ### END YOUR CODE

    trainer.train()

    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)


    print("\nRunning detailed evaluation on dev set...\n")
    predictions_output = trainer.predict(preprocessed_dev_data)


    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    # Print classification report
    report = classification_report(labels, preds, digits=4, target_names=label_map.keys())
    print("Classification Report:\n", report)

    # Print confusion matrix
    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)


In [None]:
#Attempt 3 - weighted loss trainer - weight decay = 0.001, batch_Size = 8, warmup_ratio = 0.1
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model4(bert_classification_model, bert_tokenizer, df_train_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,0.019,0.041861,0.898472
2,0.0075,0.038992,0.923882



Running detailed evaluation on dev set...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8342    0.8776    0.8553       384
      effect     0.6971    0.7853    0.7386       340
      advise     0.8475    0.8289    0.8381       228
         int     0.7692    0.3448    0.4762       116
       false     0.9623    0.9637    0.9630      4108

    accuracy                         0.9258      5176
   macro avg     0.8221    0.7601    0.7743      5176
weighted avg     0.9260    0.9258    0.9239      5176

Confusion Matrix:
 [[ 337    4    3    0   40]
 [   9  267    8    0   56]
 [   0   13  189    3   23]
 [   2   38    0   40   36]
 [  56   61   23    9 3959]]


## Attempt 4 - UnderSampling combined with attempt 2  - too little samples##

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
df_train_copy["labels"].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
false,21061
mechanism,1821
effect,1735
advise,928
int,184


In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(df_train_copy.drop(columns=["labels"]), df_train_copy["labels"])

df_train_undersampled = X_resampled.copy()
df_train_undersampled["labels"] = y_resampled

In [None]:
df_train_undersampled["labels"].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
advise,184
effect,184
false,184
int,184
mechanism,184


In [None]:
df_train_undersampled

Unnamed: 0,sentence,labels
20763,Because both of these drugs have negative inot...,2
18381,Warfarin: Anticoagulant activity should be mon...,2
811,Although the interaction between <E1>almotript...,2
10855,"Therefore, CYP3A4 substrates known to have a n...",2
4935,It is advisable to check coagulation time with...,2
...,...,...
9957,The results of a study of coadministration of ...,0
24042,"<E1>Aspirin</E1>: In normal volunteers, a smal...",0
16245,<E1>Diltiazem</E1>: In patients with mild to m...,0
22371,Lithium: <E1>Valdecoxib</E1> 40 mg BID for 7 d...,0


In [None]:
def fine_tune_classification_model3(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size=16,
                                   num_epochs=2):

    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    train_data["labels"] = train_data["labels"].map(label_map)
    dev_data["labels"] = dev_data["labels"].map(label_map)

    train_dataset = Dataset.from_pandas(train_data)
    dev_dataset = Dataset.from_pandas(dev_data)
    print(np.unique(train_dataset['labels']))

    preprocessed_train_data = train_dataset.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_dataset.map(preprocess_tokenization, batched=True, fn_kwargs={'tokenizer': tokenizer})

    unique_labels = np.array(sorted(label_map.values()))  # [0, 1, 2, 3, 4]
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_labels,
        y=train_data['labels']
    )
    class_weights = np.minimum(class_weights, 10.0)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda")


    for label, idx in label_map.items():
      print(f"Label: {label:10} → ID: {idx} → Weight: {class_weights_tensor[idx]:.4f}")


    ### Training args
    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/w266 Final Project",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        weight_decay=0.01,
        seed=42
    )

    trainer = CustomTrainer(
    model=classification_model,
    args=training_args,
    train_dataset=preprocessed_train_data,
    eval_dataset=preprocessed_dev_data,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor
    )

    # print("Final check: Class weights linked to label IDs:")
    # for label, idx in sorted(label_map.items(), key=lambda x: x[1]):
    #   print(f"  ID {idx}: {label:<10} → Weight: {class_weights_tensor[idx].item():.4f}")

    trainer.train()

    save_path = "/content/drive/MyDrive/w266 Final Project"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    predictions_output = trainer.predict(preprocessed_dev_data)
    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    report = classification_report(labels, preds, digits=4, target_names=label_map.keys())
    print("Classification Report:\n", report)

    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)

In [None]:
#Attempt 3 - weighted loss trainer
num_labels = 5  # your number of label classes

model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model3(bert_classification_model, bert_tokenizer, df_train_undersampled, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0 1 2 3 4]


Map:   0%|          | 0/920 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Label: mechanism  → ID: 0 → Weight: 1.0000
Label: effect     → ID: 1 → Weight: 1.0000
Label: advise     → ID: 2 → Weight: 1.0000
Label: int        → ID: 3 → Weight: 1.0000
Label: false      → ID: 4 → Weight: 1.0000


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,No log,1.373878,0.367113,0.46568,0.195046
2,No log,1.132389,0.452901,0.604374,0.292079


Classification Report:
               precision    recall  f1-score   support

   mechanism     0.2890    0.8203    0.4274       384
      effect     0.2612    0.8588    0.4005       340
      advise     0.3383    0.8991    0.4916       228
         int     0.2049    0.5086    0.2921       116
       false     0.9730    0.4912    0.6529      4108

    accuracy                         0.5582      5176
   macro avg     0.4133    0.7156    0.4529      5176
weighted avg     0.8303    0.5582    0.6044      5176

Confusion Matrix:
 [[ 315   27    9   10   23]
 [   9  292   15   12   12]
 [   2   14  205    3    4]
 [   3   37    0   59   17]
 [ 761  748  377  204 2018]]


##Previous Attempts ##

In [None]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
#df_train_copy["labels"].value_counts()

In [None]:
from sklearn.utils import resample

def upsample_minority_classes(df, label_col='labels'):
    max_size = df[label_col].value_counts().max()
    dfs = [df]
    for class_value, group in df.groupby(label_col):
        samples_needed = max_size - len(group)
        if samples_needed > 0:
            dfs.append(group.sample(samples_needed, replace=True, random_state=42))
    return pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
df_train_upsampled = upsample_minority_classes(df_train)

df_train_upsampled_copy = df_train_upsampled.copy()
df_test_copy = df_test.copy()

In [None]:


print(df_train_upsampled['labels'].value_counts())

labels
int          21061
effect       21061
mechanism    21061
false        21061
advise       21061
Name: count, dtype: int64


In [None]:
print(df_test_copy['labels'].value_counts())

labels
4    4108
0     384
1     340
2     228
3     116
Name: count, dtype: int64


In [None]:
from transformers import Trainer
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha  # tensor of shape [num_classes]
        self.gamma = gamma

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean()

class WeightedFocalLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, gamma=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = FocalLoss(alpha=self.class_weights, gamma=self.gamma)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
def fine_tune_classification_model_attempt3(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size=16,
                                   num_epochs=2):

    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    train_data["labels"] = train_data["labels"].map(label_map)
    dev_data["labels"] = dev_data["labels"].map(label_map)

    # Convert pandas DataFrames to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_data)
    dev_dataset = Dataset.from_pandas(dev_data)

    preprocessed_train_data = train_dataset.map(
        lambda batch: preprocess_tokenization(batch, tokenizer),
        batched=True
    )
    preprocessed_dev_data = dev_dataset.map(
        lambda batch: preprocess_tokenization(batch, tokenizer),
        batched=True
    )

    # Define the output directory for saving the model
    save_path = "/content/drive/MyDrive/w266_Final_Project_Output"


    # custom_weights = {
    # 0: 1.0,   # mechanism
    # 1: 1.0,   # effect
    # 2: 1.0,   # advise
    # 3: 3.0,   # int (underperforming, boost it)
    # 4: 0.5    # false (dominant class, reduce impact)
    # }

    # # Step 2: Convert to tensor
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # class_weights_tensor = torch.tensor(
    #     [custom_weights[i] for i in range(5)],
    #     dtype=torch.float
    # ).to(device)


    training_args = TrainingArguments(
        output_dir=save_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
    )

    # trainer = WeightedFocalLossTrainer(
    #     model=classification_model,
    #     args=training_args,
    #     train_dataset=preprocessed_train_data,
    #     eval_dataset=preprocessed_dev_data,
    #     compute_metrics=compute_metrics,
    #     tokenizer=tokenizer,
    #     class_weights=class_weights_tensor,
    #     gamma = 2.0
    # )

    trainer = Trainer(
        model=classification_model,
        args=training_args,
        train_dataset=preprocessed_train_data,
        eval_dataset=preprocessed_dev_data,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Start training

    # Save the best model and tokenizer
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    print("\nRunning detailed evaluation on dev set with the best model...\n")
    predictions_output = trainer.predict(preprocessed_dev_data)

    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    # Print classification report
    report = classification_report(labels, preds, digits=4, target_names=label_map.keys())
    print("Classification Report:\n", report)

    # Print confusion matrix
    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)

In [None]:
#Attempt 3 - can't recreate for some reason
num_labels = 5  # your number of label classes


model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model_attempt3(bert_classification_model, bert_tokenizer, df_train_upsampled_copy, df_test_copy)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/105305 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1,Macro F1
1,0.0195,0.260822,0.9141,0.770441


Epoch,Training Loss,Validation Loss,F1,Macro F1
1,0.0195,0.260822,0.9141,0.770441
2,0.0054,0.272319,0.923677,0.782211



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8754    0.8047    0.8385       384
      effect     0.7079    0.7412    0.7241       340
      advise     0.8922    0.7982    0.8426       228
         int     0.7869    0.4138    0.5424       116
       false     0.9526    0.9744    0.9634      4108

    accuracy                         0.9262      5176
   macro avg     0.8430    0.7465    0.7822      5176
weighted avg     0.9244    0.9262    0.9237      5176

Confusion Matrix:
 [[ 309   15    3    0   57]
 [   7  252    0    2   79]
 [   1    8  182    3   34]
 [   2   37    0   48   29]
 [  34   44   19    8 4003]]


In [None]:
num_labels = 5  # your number of label classes


model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=num_labels
)

fine_tune_classification_model_attempt3(bert_classification_model, bert_tokenizer, df_train_upsampled_copy, df_test_copy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/105305 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1,Macro F1
1,0.0667,0.573177,0.898479,0.717789
2,0.0185,0.642751,0.914985,0.742642


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

italicized text## Attempt 5 - Focal Loss  and downsample + balanced upsampling##


In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    report = classification_report(labels, preds, output_dict=True)
    return {
        "eval_macro_f1": report["macro avg"]["f1-score"],
        "eval_weighted_f1": report["weighted avg"]["f1-score"],
        "eval_int_f1": report.get('3', {}).get('f1-score')  # Replace '3' with index of 'int'
    }

In [None]:
from sklearn.utils import resample
import pandas as pd

def resample_balanced(df, label_col='labels', max_multiplier=3, random_state=42):
    """
    Downsamples 'false' to at most `max_multiplier` times the largest minority class,
    and upsamples other classes to match the target count.
    """
    # Count label frequencies
    class_counts = df[label_col].value_counts()

    # Exclude 'false' from minority class check
    minority_counts = class_counts.drop('false', errors='ignore')
    if minority_counts.empty:
        raise ValueError("No classes to upsample found.")

    target_count = minority_counts.max()
    false_target = min(class_counts.get('false', 0), target_count * max_multiplier)

    dfs = []
    for class_label, group in df.groupby(label_col):
        n_samples = len(group)

        if class_label == 'false':
            # Downsample false
            if n_samples > false_target:
                group = group.sample(false_target, random_state=random_state)
            dfs.append(group)

        elif n_samples < target_count:
            # Upsample minority class
            upsampled = group.sample(target_count, replace=True, random_state=random_state)
            dfs.append(upsampled)
        else:
            dfs.append(group)

    df_balanced = pd.concat(dfs).sample(frac=1, random_state=random_state).reset_index(drop=True)
    return df_balanced


In [None]:
df_train_balanced = resample_balanced(df_train, label_col="labels", max_multiplier=3)
df_train_balanced["labels"].value_counts()


Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
false,5463
int,1821
advise,1821
effect,1821
mechanism,1821


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# ---- 1. Focal Loss ----
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.weight = weight

    def forward(self, inputs, targets):
        logpt = F.log_softmax(inputs, dim=1)
        pt = torch.exp(logpt)
        logpt = logpt.gather(1, targets.unsqueeze(1))
        pt = pt.gather(1, targets.unsqueeze(1))

        if self.weight is not None:
            at = self.weight[targets].unsqueeze(1)
            loss = -1 * at * (1 - pt) ** self.gamma * logpt
        else:
            loss = -1 * (1 - pt) ** self.gamma * logpt
        return loss.mean()

# ---- 2. Custom Model with Focal Loss ----
from transformers import BertForSequenceClassification

class BertWithFocalLoss(BertForSequenceClassification):
    def __init__(self, config, gamma=2.0, weight=None):
        super().__init__(config)
        self.focal_loss = FocalLoss(gamma=gamma, weight=weight)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        # Explicitly call only accepted arguments
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=None  # we'll compute our own loss
        )

        logits = outputs.logits

        if labels is not None:
            loss = self.focal_loss(logits, labels)
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}


In [None]:
# ---- 3. Training Function ----

def fine_tune_classification_model_attempt5(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size=16,
                                   num_epochs=2):

    label_map = {'mechanism': 0, 'effect': 1, 'advise': 2, 'int': 3, 'false': 4}
    id2label = {v: k for k, v in label_map.items()}
    train_data["labels"] = train_data["labels"].map(label_map)
    dev_data["labels"] = dev_data["labels"].map(label_map)

    train_dataset = Dataset.from_pandas(train_data)
    dev_dataset = Dataset.from_pandas(dev_data)

    preprocessed_train_data = train_dataset.map(
        lambda batch: preprocess_tokenization(batch, tokenizer),
        batched=True
    )
    preprocessed_dev_data = dev_dataset.map(
        lambda batch: preprocess_tokenization(batch, tokenizer),
        batched=True
    )

    save_path = "/content/drive/MyDrive/w266_Final_Project_Output"

    training_args = TrainingArguments(
        output_dir=save_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
    )

    trainer = Trainer(
        model=classification_model,
        args=training_args,
        train_dataset=preprocessed_train_data,
        eval_dataset=preprocessed_dev_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    print("\nRunning detailed evaluation on dev set with the best model...\n")
    predictions_output = trainer.predict(preprocessed_dev_data)

    preds = np.argmax(predictions_output.predictions, axis=1)
    labels = predictions_output.label_ids

    report = classification_report(labels, preds, digits=4, target_names=[id2label[i] for i in range(5)])
    print("Classification Report:\n", report)

    cm = confusion_matrix(labels, preds)
    print("Confusion Matrix:\n", cm)


In [None]:
#Attempt 5 - gamma 1.5 - no resample

from transformers import AutoConfig

model_checkpoint_name = "bert-base-cased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train, df_test)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.0869,0.191661,0.775577,0.920901,0.505376
2,0.0359,0.188103,0.799582,0.934105,0.522727



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8601    0.8802    0.8700       384
      effect     0.7361    0.7794    0.7571       340
      advise     0.8844    0.8728    0.8786       228
         int     0.7667    0.3966    0.5227       116
       false     0.9659    0.9730    0.9694      4108

    accuracy                         0.9361      5176
   macro avg     0.8426    0.7804    0.7996      5176
weighted avg     0.9349    0.9361    0.9341      5176

Confusion Matrix:
 [[ 338    6    3    0   37]
 [   3  265    6   10   56]
 [   1    2  199    3   23]
 [   6   39    0   46   25]
 [  45   48   17    1 3997]]


In [None]:
#Attempt 5 - gamma 1.5 - resample

from transformers import AutoConfig

model_checkpoint_name = "bert-base-cased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train_balanced, df_test)




Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12747 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.3115,0.201957,0.716301,0.875727,0.497297
2,0.0599,0.234461,0.752158,0.899063,0.479592



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.6928    0.8984    0.7823       384
      effect     0.6117    0.8941    0.7264       340
      advise     0.7483    0.9386    0.8327       228
         int     0.5875    0.4052    0.4796       116
       false     0.9759    0.9063    0.9398      4108

    accuracy                         0.8951      5176
   macro avg     0.7232    0.8085    0.7522      5176
weighted avg     0.9122    0.8951    0.8991      5176

Confusion Matrix:
 [[ 345   10    3    0   26]
 [   1  304    2    4   29]
 [   0    2  214    3    9]
 [   4   37    0   47   28]
 [ 148  144   67   26 3723]]


In [None]:
#Attempt 5 - gamma 1.0 - no resample

from transformers import AutoConfig

model_checkpoint_name = "bert-base-uncased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train, df_test)

Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.0963,0.211027,0.763294,0.917621,0.447619
2,0.038,0.222097,0.785893,0.930399,0.475676



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8544    0.8255    0.8397       384
      effect     0.7044    0.8059    0.7517       340
      advise     0.9018    0.8860    0.8938       228
         int     0.6377    0.3793    0.4757       116
       false     0.9668    0.9703    0.9685      4108

    accuracy                         0.9318      5176
   macro avg     0.8130    0.7734    0.7859      5176
weighted avg     0.9310    0.9318    0.9304      5176

Confusion Matrix:
 [[ 317   17    5    7   38]
 [   6  274    3    0   57]
 [   1    4  202    3   18]
 [   7   41    0   44   24]
 [  40   53   14   15 3986]]


In [None]:
#Attempt 5 - gamma 1.0

from transformers import AutoConfig

model_checkpoint_name = "bert-base-uncased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train, df_test)

Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.1071,0.220143,0.753292,0.908075,0.516484
2,0.0393,0.21664,0.802137,0.931028,0.551724



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8262    0.8542    0.8399       384
      effect     0.7229    0.8441    0.7788       340
      advise     0.8800    0.8684    0.8742       228
         int     0.8276    0.4138    0.5517       116
       false     0.9671    0.9649    0.9660      4108

    accuracy                         0.9322      5176
   macro avg     0.8448    0.7891    0.8021      5176
weighted avg     0.9336    0.9322    0.9310      5176

Confusion Matrix:
 [[ 328    9    3    0   44]
 [   9  287    0    0   44]
 [   1    7  198    3   19]
 [   3   37    0   48   28]
 [  56   57   24    7 3964]]


In [None]:
#Attempt 5 - gamma 2.0 - no resample

from transformers import AutoConfig

model_checkpoint_name = "bert-base-uncased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train, df_test)

Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.0672,0.167762,0.738265,0.912595,0.486486
2,0.0256,0.150724,0.799883,0.935759,0.513661



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.8757    0.8620    0.8688       384
      effect     0.7694    0.8147    0.7914       340
      advise     0.8832    0.8289    0.8552       228
         int     0.7015    0.4052    0.5137       116
       false     0.9646    0.9761    0.9704      4108

    accuracy                         0.9378      5176
   macro avg     0.8389    0.7774    0.7999      5176
weighted avg     0.9357    0.9378    0.9358      5176

Confusion Matrix:
 [[ 331    7    2   10   34]
 [   3  277    2    4   54]
 [   2    0  189    3   34]
 [   5   39    0   47   25]
 [  37   37   21    3 4010]]


In [None]:
#Attempt 5 - gamma 2.0 - resample

from transformers import AutoConfig

model_checkpoint_name = "bert-base-uncased"
num_labels = 5

config = AutoConfig.from_pretrained(model_checkpoint_name, num_labels=num_labels)
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

bert_classification_model = BertWithFocalLoss.from_pretrained(
    model_checkpoint_name,
    config=config
)

fine_tune_classification_model_attempt5(bert_classification_model, bert_tokenizer, df_train_balanced, df_test)

Some weights of BertWithFocalLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12747 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Weighted F1,Int F1
1,0.259,0.161019,0.750234,0.901637,0.529101
2,0.0443,0.19303,0.754316,0.905668,0.485437



Running detailed evaluation on dev set with the best model...



Classification Report:
               precision    recall  f1-score   support

   mechanism     0.6944    0.8047    0.7455       384
      effect     0.6509    0.8118    0.7225       340
      advise     0.8684    0.8684    0.8684       228
         int     0.5556    0.4310    0.4854       116
       false     0.9639    0.9360    0.9497      4108

    accuracy                         0.9038      5176
   macro avg     0.7466    0.7704    0.7543      5176
weighted avg     0.9100    0.9038    0.9057      5176

Confusion Matrix:
 [[ 309    9    3   10   53]
 [  10  276    3    8   43]
 [   1    4  198    3   22]
 [   3   37    0   50   26]
 [ 122   98   24   19 3845]]


In [None]:
# Load from saved path
load_path = "/content/drive/MyDrive/w266 Final Project"
tokenizer = AutoTokenizer.from_pretrained(load_path)
model = AutoModelForSequenceClassification.from_pretrained(load_path)

# Example prediction
text = "Co-administration of <E1>bosentan</E1> and <E2>simvastatin</E2> may reduce plasma concentrations of the latter."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(dim=1).item()

print("Predicted label:", predicted_class)

Predicted label: 0
