In [1]:
import numpy as np
import pandas as pd
import re
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

from sklearn.metrics import precision_score, recall_score, f1_score

import torch
from transformers import AutoTokenizer, AutoModel, set_seed, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, DistilBertForSequenceClassification 
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import datasets
from datasets import Dataset

from openprompt.data_utils import InputExample
from openprompt.plms import load_plm, T5TokenizerWrapper
from openprompt.prompts import ManualTemplate, SoftTemplate, ManualVerbalizer, SoftVerbalizer
from openprompt import PromptDataLoader, PromptForClassification

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
random_state = 42
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)
set_seed(42)

# Load in data

In [3]:
data = pd.read_csv('shrunk_annotations.csv')
data.head()

Unnamed: 0,pmcid,methods_materials,definitions_diagnosis_ranges,clusion_criteria,formula,misc
0,PMC6509938,Patients and methodsStudy participantsThis was...,0,0,0,0
1,PMC6509938,Participants of this study included 198 volunt...,0,0,0,0
2,PMC6509938,Each of them attended an eight-week LM program...,0,0,0,0
3,PMC6509938,Individuals who had been treated with antihype...,0,1,0,0
4,PMC6509938,Individuals were not enrolled if they were you...,0,1,0,0


# Baselines
## Dummy Classifier (naive baseline) and SVM w/ Bag of Words

In [5]:
# Preprocessing for baselines - lowercases and removes punctuation
data['BoW'] = data['methods_materials'].apply(lambda s: re.sub(r'[^\w\s]', '', s.lower()))

In [6]:
# Initialize/Map Bag of Words Features
vectorizer = CountVectorizer()
bag = vectorizer.fit_transform(data['BoW'])
BoW = bag.toarray()

In [7]:
# Split data (train/dev and test)
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    pd.DataFrame(BoW), 
    data[['definitions_diagnosis_ranges']], 
    test_size=0.2, 
    random_state=random_state)

In [8]:
# Dummy Classifier (Naive Baseline)
dummy_clf = DummyClassifier(strategy="stratified", random_state=random_state)
dummy_clf.fit(X_train_dev, y_train_dev.values.ravel())

y_pred = dummy_clf.predict(X_test)
y_true = y_test.values.ravel()

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Precision: 0.10638297872340426
Recall: 0.11363636363636363
F1: 0.10989010989010989


In [9]:
# Logistic Regression using Bag of Words (Baseline)
lr_classifier = LogisticRegression(penalty='l2', max_iter=10000)

lr_grid_param = {
    'solver': ['lbfgs', 'liblinear', 'newton-cg'],
    'tol': [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    'C': [0.9, 1.0, 1.1],
    'warm_start': [True, False],
}

cv = StratifiedKFold(n_splits=5, shuffle=False)

lr_grid_cv = GridSearchCV(estimator=lr_classifier,
                          param_grid=lr_grid_param,
                          scoring='f1',
                          cv=cv,
                          n_jobs=-1,
                          verbose=3,
                         )

lr_grid_cv.fit(X_train_dev, y_train_dev.values.ravel())

lr_best = lr_grid_cv.best_estimator_

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=True;, score=0.754 total time=   0.9s
[CV 2/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=True;, score=0.655 total time=   0.8s
[CV 3/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=True;, score=0.618 total time=   0.8s
[CV 4/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=True;, score=0.538 total time=   0.9s
[CV 5/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=True;, score=0.480 total time=   0.9s
[CV 1/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=False;, score=0.754 total time=   0.8s
[CV 2/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=False;, score=0.655 total time=   0.8s
[CV 3/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=False;, score=0.618 total time=   0.8s
[CV 4/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=False;, score=0.538 total time=   0.9s
[CV 5/5] END C=0.9, solver=lbfgs, tol=0.001, warm_start=False;, score=0.480 total time= 

In [10]:
y_pred = lr_best.predict(X_test)
y_true = y_test.values.ravel()

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Precision: 0.7692307692307693
Recall: 0.45454545454545453
F1: 0.5714285714285714


Test statistics:

In [11]:
print(f"Total: {len(y_test)}, Positive: {y_test.sum().item()}, Ratio: {(y_test.sum().item())/len(y_test)}")

Total: 426, Positive: 44, Ratio: 0.10328638497652583


### Performance across 426 instances (44 positive instances - 10.3%)
#### Dummy Classifier:
Precision: 0.10638297872340426
Recall: 0.11363636363636363
F1: 0.10989010989010989

#### SVM w/ BoWs:
Precision: 0.7692307692307693
Recall: 0.45454545454545453
F1: 0.5714285714285714

# Large Language Model
## Finetuning pretrained language models
Models trained:
1. bert-base-uncased
2. biolink-bert-base
3. sapbert-pubmedbert
4. sciner-topic

In [4]:
data.head()

Unnamed: 0,pmcid,methods_materials,definitions_diagnosis_ranges,clusion_criteria,formula,misc
0,PMC6509938,Patients and methodsStudy participantsThis was...,0,0,0,0
1,PMC6509938,Participants of this study included 198 volunt...,0,0,0,0
2,PMC6509938,Each of them attended an eight-week LM program...,0,0,0,0
3,PMC6509938,Individuals who had been treated with antihype...,0,1,0,0
4,PMC6509938,Individuals were not enrolled if they were you...,0,1,0,0


In [5]:
# Split data (train/dev and test)
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    data[['methods_materials']], 
    data[['definitions_diagnosis_ranges']], 
    test_size=0.2, 
    random_state=random_state)

X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev, 
    y_train_dev, 
    test_size=0.2, 
    random_state=random_state)

In [6]:
data_train = pd.concat([X_train, y_train], axis=1)
data_dev = pd.concat([X_dev, y_dev], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length = max_length)

def preprocessing(dataset):
    dataset.rename(columns = {'methods_materials':'text', 'definitions_diagnosis_ranges':'labels'}, inplace = True)
    dataset = Dataset.from_pandas(dataset[['text', 'labels']], preserve_index=False)
    dataset_token = dataset.map(tokenize_function)
    return dataset_token

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    #prec = precision_score(predictions, labels)
    prec = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return {"precision": prec, "recall": recall, "f1": f1}


---

##  BERT Baseline (BERT, uncased)

In [38]:
model_name = "bert-base-uncased"
max_length = 512
early_stop_var = 3
num_epochs = 5
batch_size = 2

In [39]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = preprocessing(data_train)
dev_dataset = preprocessing(data_dev)
test_dataset = preprocessing(data_test)

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/joemenke/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.t

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [40]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)
model.gradient_checkpointing_enable()

early_stop = EarlyStoppingCallback(early_stopping_patience = early_stop_var)
training_args = TrainingArguments(output_dir="output",
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  logging_steps = 50,
                                  eval_steps=50,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=4,
                                  gradient_checkpointing=True,
                                  num_train_epochs=5,
                                  learning_rate = 2e-5,
                                  weight_decay = 0.001,
                                  load_best_model_at_end = True,
                                  metric_for_best_model = 'f1'
                                  )

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/joemenke/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb489

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks = [early_stop],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1362
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 1700


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.4532,0.326655,0.0,0.0,0.0
100,0.2938,0.225827,0.405405,0.882353,0.555556
150,0.1057,0.389634,0.432432,0.888889,0.581818
200,0.1919,0.317907,0.486486,0.9,0.631579
250,0.2899,0.404384,0.27027,0.833333,0.408163
300,0.2997,0.195612,0.621622,0.851852,0.71875
350,0.1502,0.198843,0.675676,0.833333,0.746269
400,0.0665,0.268563,0.783784,0.591837,0.674419
450,0.1494,0.208755,0.72973,0.710526,0.72
500,0.141,0.236586,0.648649,0.827586,0.727273


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  

TrainOutput(global_step=650, training_loss=0.2003207250741812, metrics={'train_runtime': 231.836, 'train_samples_per_second': 29.374, 'train_steps_per_second': 7.333, 'total_flos': 684614966046720.0, 'train_loss': 0.2003207250741812, 'epoch': 1.91})

In [43]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1


{'eval_loss': 0.23658645153045654,
 'eval_precision': 0.6486486486486487,
 'eval_recall': 0.8275862068965517,
 'eval_f1': 0.7272727272727273,
 'eval_runtime': 4.8185,
 'eval_samples_per_second': 70.769,
 'eval_steps_per_second': 70.769,
 'epoch': 1.91}

In [42]:
logits = trainer.predict(test_dataset).predictions.squeeze()
predictions = list(np.argmax(logits, axis=-1))
labels = test_dataset['labels']
prec = precision_score(predictions, labels)
recall = recall_score(predictions, labels)
f1 = f1_score(predictions, labels)
print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 426
  Batch size = 1


Performance: P = 0.7272727272727273; R = 0.8; F1 = 0.761904761904762



---

##  BioLinkBERT-Base

In [29]:
model_name = "michiyasunaga/BioLinkBERT-base"
max_length = 512
early_stop_var = 3
num_epochs = 5

In [34]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = preprocessing(data_train)
dev_dataset = preprocessing(data_dev)
test_dataset = preprocessing(data_test)

loading file vocab.txt from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapshots/b71f5d70f063d1c8f1124070ce86f1ee463ca1fe/vocab.txt
loading file tokenizer.json from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapshots/b71f5d70f063d1c8f1124070ce86f1ee463ca1fe/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapshots/b71f5d70f063d1c8f1124070ce86f1ee463ca1fe/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapshots/b71f5d70f063d1c8f1124070ce86f1ee463ca1fe/tokenizer_config.json


Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [35]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

model.gradient_checkpointing_enable()

early_stop = EarlyStoppingCallback(early_stopping_patience = early_stop_var)
training_args = TrainingArguments(output_dir="output",
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  logging_steps = 50,
                                  eval_steps=50,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=4,
                                  gradient_checkpointing=True,
                                  num_train_epochs=5,
                                  learning_rate = 2e-5,
                                  weight_decay = 0.001,
                                  load_best_model_at_end = True,
                                  metric_for_best_model = 'f1'
                                  )

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapshots/b71f5d70f063d1c8f1124070ce86f1ee463ca1fe/config.json
Model config BertConfig {
  "_name_or_path": "michiyasunaga/BioLinkBERT-base",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28895
}

loading weights file pytorch_model.bin from cache at /home/joemenke/.cache/huggingface/hub/models--michiyasunaga--BioLinkBERT-base/snapsho

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks = [early_stop],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1362
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 1700


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.4224,0.30412,0.0,0.0,0.0
100,0.4851,0.409657,0.0,0.0,0.0
150,0.3605,0.447883,0.0,0.0,0.0
200,0.1847,0.299787,0.432432,0.888889,0.581818
250,0.2073,0.244515,0.675676,0.714286,0.694444
300,0.2751,0.190643,0.702703,0.787879,0.742857
350,0.1903,0.229896,0.621622,0.851852,0.71875
400,0.11,0.260081,0.675676,0.78125,0.724638
450,0.1779,0.244011,0.675676,0.757576,0.714286
500,0.1761,0.216366,0.567568,0.913043,0.7


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely

TrainOutput(global_step=1150, training_loss=0.143807518793189, metrics={'train_runtime': 411.0503, 'train_samples_per_second': 16.567, 'train_steps_per_second': 4.136, 'total_flos': 1211889520988160.0, 'train_loss': 0.143807518793189, 'epoch': 3.38})

In [37]:
logits = trainer.predict(test_dataset).predictions.squeeze()
predictions = list(np.argmax(logits, axis=-1))
labels = test_dataset['labels']
prec = precision_score(predictions, labels)
recall = recall_score(predictions, labels)
f1 = f1_score(predictions, labels)
print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 426
  Batch size = 1


Performance: P = 0.8409090909090909; R = 0.7254901960784313; F1 = 0.7789473684210527



---

##  SapBERT-PubMedBERT

In [16]:
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
max_length = 512
early_stop_var = 3
num_epochs = 5

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = preprocessing(data_train)
dev_dataset = preprocessing(data_dev)
test_dataset = preprocessing(data_test)

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--cambridgeltl--SapBERT-from-PubMedBERT-fulltext/snapshots/ec3f68b2d0c3317e1e503050f34940d493004603/config.json
Model config BertConfig {
  "_name_or_path": "cambridgeltl/SapBERT-from-PubMedBERT-fulltext",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/joemenke/.cache/huggingface/hub/models--cambridgeltl--SapBERT-fro

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

model.gradient_checkpointing_enable()

early_stop = EarlyStoppingCallback(early_stopping_patience = early_stop_var)
training_args = TrainingArguments(output_dir="output",
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  logging_steps = 50,
                                  eval_steps=50,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=4,
                                  gradient_checkpointing=True,
                                  num_train_epochs=5,
                                  learning_rate = 2e-5,
                                  weight_decay = 0.001,
                                  load_best_model_at_end = True,
                                  metric_for_best_model = 'f1'
                                  )

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cambridgeltl/SapBERT-from-PubMedBERT-fulltext and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks = [early_stop],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1362
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 1700


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.4058,0.272545,0.027027,1.0,0.052632
100,0.2873,0.282132,0.648649,0.489796,0.55814
150,0.1007,0.351347,0.459459,0.944444,0.618182
200,0.125,0.360286,0.540541,0.869565,0.666667
250,0.2494,0.225823,0.702703,0.787879,0.742857
300,0.2091,0.28751,0.540541,0.833333,0.655738
350,0.168,0.299662,0.486486,0.818182,0.610169
400,0.1223,0.23838,0.648649,0.8,0.716418
450,0.0845,0.259385,0.675676,0.833333,0.746269
500,0.1048,0.289926,0.567568,0.875,0.688525


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following colum

TrainOutput(global_step=1150, training_loss=0.10634213807790176, metrics={'train_runtime': 455.1961, 'train_samples_per_second': 14.961, 'train_steps_per_second': 3.735, 'total_flos': 1211889520988160.0, 'train_loss': 0.10634213807790176, 'epoch': 3.38})

In [15]:
logits = trainer.predict(test_dataset).predictions.squeeze()
predictions = list(np.argmax(logits, axis=-1))
labels = test_dataset['labels']
prec = precision_score(predictions, labels)
recall = recall_score(predictions, labels)
f1 = f1_score(predictions, labels)
print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 426
  Batch size = 1


Performance: P = 0.6818181818181818; R = 0.7692307692307693; F1 = 0.7228915662650602



---

## SciNERTopic

In [13]:
model_name = "RJuro/SciNERTopic"
max_length = 512
early_stop_var = 3
num_epochs = 5

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = preprocessing(data_train)
dev_dataset = preprocessing(data_dev)
test_dataset = preprocessing(data_test)

Downloading (…)okenizer_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abcb60388c2eb293f/vocab.txt
loading file tokenizer.json from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abcb60388c2eb293f/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abcb60388c2eb293f/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abcb60388c2eb293f/tokenizer_config.json


Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

model.gradient_checkpointing_enable()

early_stop = EarlyStoppingCallback(early_stopping_patience = early_stop_var)
training_args = TrainingArguments(output_dir="output_sciNER",
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  logging_steps = 50,
                                  eval_steps=50,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=4,
                                  gradient_checkpointing=True,
                                  num_train_epochs=5,
                                  learning_rate = 2e-5,
                                  weight_decay = 0.001,
                                  load_best_model_at_end = True,
                                  metric_for_best_model = 'f1'
                                  )

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abcb60388c2eb293f/config.json
Model config BertConfig {
  "_name_or_path": "RJuro/SciNERTopic",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31116
}

loading weights file pytorch_model.bin from cache at /home/joemenke/.cache/huggingface/hub/models--RJuro--SciNERTopic/snapshots/b484b958d3d53f2e43c7280abc

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks = [early_stop],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1362
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 1700


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.358,0.265187,0.837838,0.5,0.626263
100,0.2751,0.283488,0.648649,0.648649,0.648649
150,0.112,0.356613,0.459459,0.944444,0.618182
200,0.165,0.383153,0.567568,0.84,0.677419
250,0.303,0.281687,0.567568,0.777778,0.65625
300,0.2501,0.229941,0.594595,0.785714,0.676923
350,0.1637,0.297702,0.567568,0.875,0.688525
400,0.1077,0.230773,0.675676,0.78125,0.724638
450,0.0755,0.20646,0.72973,0.710526,0.72
500,0.1936,0.225892,0.594595,0.916667,0.721311


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 341
  Batch size = 1
The following colum

TrainOutput(global_step=1150, training_loss=0.11529109556389891, metrics={'train_runtime': 401.0794, 'train_samples_per_second': 16.979, 'train_steps_per_second': 4.239, 'total_flos': 1211889520988160.0, 'train_loss': 0.11529109556389891, 'epoch': 3.38})

In [18]:
logits = trainer.predict(test_dataset).predictions.squeeze()
predictions = list(np.argmax(logits, axis=-1))
labels = test_dataset['labels']
prec = precision_score(predictions, labels)
recall = recall_score(predictions, labels)
f1 = f1_score(predictions, labels)
print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 426
  Batch size = 1


Performance: P = 0.75; R = 0.7021276595744681; F1 = 0.7252747252747254



---

# Prompt Learning

In [50]:
# Split data (train/dev and test)
renamed = data.rename(columns = {'methods_materials':'text', 'definitions_diagnosis_ranges':'label'})

renamed['text'] = renamed['text'].str.slice(0,512) # truncate if over 512

X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    renamed[['text']], 
    renamed[['label']], 
    test_size=0.2, 
    random_state=random_state)

X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev, 
    y_train_dev, 
    test_size=0.2, 
    random_state=random_state)

data_train = pd.concat([X_train, y_train], axis=1)
data_dev = pd.concat([X_dev, y_dev], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

mydataset_train = Dataset.from_pandas(data_train).rename_column("__index_level_0__", "idx")
mydataset_dev = Dataset.from_pandas(data_dev).rename_column("__index_level_0__", "idx")
mydataset_test = Dataset.from_pandas(data_test).rename_column("__index_level_0__", "idx")

mydataset = datasets.DatasetDict({
    'train': mydataset_train,
    'validation': mydataset_dev,
    'test': mydataset_test})

In [51]:
dataset = {}
for split in ['train', 'validation', 'test']:
    dataset[split] = []
    for data in mydataset[split]:
        input_example = InputExample(text_a = data['text'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)

## Manual Prompt

In [99]:
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

template_text = 'context: {"placeholder":"text_a"} Does this sentence contain a definition or a diagnosis range? {"mask"}.'
mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

loading configuration file config.json from cache at /home/joemenke/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
 

In [100]:
wrapped_t5tokenizer= T5TokenizerWrapper(
    max_seq_length=128, 
    decoder_max_length=3, 
    tokenizer=tokenizer,
    truncate_method="head")

In [101]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

In [102]:
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

tokenizing: 1362it [00:01, 968.47it/s]
tokenizing: 341it [00:00, 977.57it/s]


In [103]:
myverbalizer = ManualVerbalizer(tokenizer, num_classes=2, label_words=[["yes"], ["no"]])

In [104]:
use_cuda = True

prompt_model = PromptForClassification(
    plm=plm,
    template=mytemplate, 
    verbalizer=myverbalizer, 
    freeze_plm=False)

if use_cuda:
    prompt_model=  prompt_model.cuda()

In [106]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
best_f1 = 0.1
prompt_model.train()
for epoch in range(8):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            allpreds = []
            alllabels = []
            prompt_model.eval()
            for step, inputs in enumerate(validation_dataloader):
                if use_cuda:
                    inputs = inputs.cuda()
                logits = prompt_model(inputs)
                labels = inputs['label']
                alllabels.extend(labels.cpu().tolist())
                allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

                p = precision_score(allpreds, alllabels)
                r = recall_score(allpreds, alllabels)
                f1 = f1_score(allpreds, alllabels)

            if f1 >= best_f1:
                best_f1 = f1
                print(f"Saving best model: {best_f1}")
                torch.save(prompt_model.state_dict(), "prompt_model.ckpt")
            
            prompt_model.train()
            print(f"Epoch {epoch+1}, Average loss: {tot_loss/(step+1)}, P: {p}, R: {r}, F1: {f1}")

Epoch 1, Average loss: 0.008587113819843116, P: 0.0, R: 0.0, F1: 0.0
Saving best model: 0.5454545454545455
Epoch 1, Average loss: 0.4772357999883202, P: 0.40540540540540543, R: 0.8333333333333334, F1: 0.5454545454545455
Saving best model: 0.6451612903225806
Epoch 1, Average loss: 0.7415055790293399, P: 0.5405405405405406, R: 0.8, F1: 0.6451612903225806
Epoch 1, Average loss: 0.9064538445974143, P: 0.35135135135135137, R: 0.9285714285714286, F1: 0.5098039215686275
Saving best model: 0.7187499999999999
Epoch 2, Average loss: 0.000641106216366901, P: 0.6216216216216216, R: 0.8518518518518519, F1: 0.7187499999999999
Epoch 2, Average loss: 0.12505808935553475, P: 0.4594594594594595, R: 0.85, F1: 0.5964912280701754
Epoch 2, Average loss: 0.24950577583208647, P: 0.5675675675675675, R: 0.9130434782608695, F1: 0.6999999999999998
Saving best model: 0.75
Epoch 2, Average loss: 0.3856166238834957, P: 0.8108108108108109, R: 0.6976744186046512, F1: 0.75
Epoch 3, Average loss: 0.006987244965573556, P

In [107]:
prompt_model.load_state_dict(torch.load("prompt_model.ckpt"))
prompt_model = prompt_model.cuda()

test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(test_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    
    prec = precision_score(allpreds, alllabels)
    recall = recall_score(allpreds, alllabels)
    f1 = f1_score(allpreds, alllabels)

print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

tokenizing: 426it [00:00, 954.72it/s]


Performance: P = 0.8863636363636364; R = 0.582089552238806; F1 = 0.7027027027027027



---

## Soft Prompt (Template + Verbalizer)
#### w/ specific intialized template

In [38]:
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

template_text = '{"placeholder":"text_a"} {"soft":"Does this sentence contain a definition or a diagnosis range?"} {"mask"} {"soft"}'
mytemplate = SoftTemplate(model=plm, tokenizer=tokenizer, text=template_text)

In [39]:
wrapped_t5tokenizer= T5TokenizerWrapper(
    max_seq_length=128, 
    decoder_max_length=3, 
    tokenizer=tokenizer,
    truncate_method="head")

In [40]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

In [41]:
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

tokenizing: 1362it [00:01, 1153.98it/s]
tokenizing: 341it [00:00, 1179.31it/s]


In [42]:
myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=2)

In [43]:
use_cuda = True

prompt_model = PromptForClassification(
    plm=plm,
    template=mytemplate, 
    verbalizer=myverbalizer, 
    freeze_plm=False)

if use_cuda:
    prompt_model=  prompt_model.cuda()

In [44]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
best_f1 = 0.1
prompt_model.train()
for epoch in range(8):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            allpreds = []
            alllabels = []
            prompt_model.eval()
            for step, inputs in enumerate(validation_dataloader):
                if use_cuda:
                    inputs = inputs.cuda()
                logits = prompt_model(inputs)
                labels = inputs['label']
                alllabels.extend(labels.cpu().tolist())
                allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

                p = precision_score(allpreds, alllabels)
                r = recall_score(allpreds, alllabels)
                f1 = f1_score(allpreds, alllabels)

            if f1 >= best_f1:
                best_f1 = f1
                print(f"Saving best model: {best_f1}")
                torch.save(prompt_model.state_dict(), "soft_prompt_model.ckpt")
            
            prompt_model.train()
            print(f"Epoch {epoch+1}, Average loss: {tot_loss/(step+1)}, P: {p}, R: {r}, F1: {f1}")

Saving best model: 0.19576719576719578
Epoch 1, Average loss: 0.026357004808825114, P: 1.0, R: 0.10850439882697947, F1: 0.19576719576719578
Epoch 1, Average loss: 0.5219194789438747, P: 0.0, R: 0.0, F1: 0.0
Saving best model: 0.5357142857142857
Epoch 1, Average loss: 0.832122742176749, P: 0.40540540540540543, R: 0.7894736842105263, F1: 0.5357142857142857
Saving best model: 0.5714285714285715
Epoch 1, Average loss: 1.0649292303552462, P: 0.43243243243243246, R: 0.8421052631578947, F1: 0.5714285714285715
Epoch 2, Average loss: 0.006932020533916562, P: 0.40540540540540543, R: 0.8333333333333334, F1: 0.5454545454545455
Epoch 2, Average loss: 0.15800276505869143, P: 0.32432432432432434, R: 1.0, F1: 0.489795918367347
Saving best model: 0.6451612903225806
Epoch 2, Average loss: 0.29532318448019756, P: 0.5405405405405406, R: 0.8, F1: 0.6451612903225806
Saving best model: 0.676470588235294
Epoch 2, Average loss: 0.44177600317418053, P: 0.6216216216216216, R: 0.7419354838709677, F1: 0.6764705882

In [45]:
prompt_model.load_state_dict(torch.load("soft_prompt_model.ckpt"))
prompt_model = prompt_model.cuda()

test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(test_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    
    prec = precision_score(allpreds, alllabels)
    recall = recall_score(allpreds, alllabels)
    f1 = f1_score(allpreds, alllabels)

print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

tokenizing: 426it [00:00, 1152.94it/s]


Performance: P = 0.5681818181818182; R = 0.7575757575757576; F1 = 0.6493506493506495



---

## Soft Prompt (Template + Verbalizer)
#### w/ randomly initialized template

In [52]:
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

template_text = '{"placeholder":"text_a"} {"soft"} {"mask"} {"soft"}'
mytemplate = SoftTemplate(model=plm, tokenizer=tokenizer, text=template_text)

In [53]:
wrapped_t5tokenizer= T5TokenizerWrapper(
    max_seq_length=128, 
    decoder_max_length=3, 
    tokenizer=tokenizer,
    truncate_method="head")

In [54]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = []
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

In [55]:
train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

tokenizing: 1362it [00:01, 1156.69it/s]
tokenizing: 341it [00:00, 1176.70it/s]


In [56]:
myverbalizer = SoftVerbalizer(tokenizer, plm, num_classes=2)

In [57]:
use_cuda = True

prompt_model = PromptForClassification(
    plm=plm,
    template=mytemplate, 
    verbalizer=myverbalizer, 
    freeze_plm=False)

if use_cuda:
    prompt_model=  prompt_model.cuda()

In [58]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
best_f1 = 0.1
prompt_model.train()
for epoch in range(8):
    tot_loss = 0
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            allpreds = []
            alllabels = []
            prompt_model.eval()
            for step, inputs in enumerate(validation_dataloader):
                if use_cuda:
                    inputs = inputs.cuda()
                logits = prompt_model(inputs)
                labels = inputs['label']
                alllabels.extend(labels.cpu().tolist())
                allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

                p = precision_score(allpreds, alllabels)
                r = recall_score(allpreds, alllabels)
                f1 = f1_score(allpreds, alllabels)

            if f1 >= best_f1:
                best_f1 = f1
                print(f"Saving best model: {best_f1}")
                torch.save(prompt_model.state_dict(), "true_soft_prompt_model.ckpt")
            
            prompt_model.train()
            print(f"Epoch {epoch+1}, Average loss: {tot_loss/(step+1)}, P: {p}, R: {r}, F1: {f1}")

Epoch 1, Average loss: 0.014202357724655506, P: 0.0, R: 0.0, F1: 0.0
Saving best model: 0.391304347826087
Epoch 1, Average loss: 0.3549600159099629, P: 0.24324324324324326, R: 1.0, F1: 0.391304347826087
Saving best model: 0.6333333333333333
Epoch 1, Average loss: 0.5979261933717617, P: 0.5135135135135135, R: 0.8260869565217391, F1: 0.6333333333333333
Saving best model: 0.65625
Epoch 1, Average loss: 0.7917336918153736, P: 0.5675675675675675, R: 0.7777777777777778, F1: 0.65625
Epoch 2, Average loss: 0.0005569188357439152, P: 0.4864864864864865, R: 0.8571428571428571, F1: 0.6206896551724138
Saving best model: 0.6829268292682927
Epoch 2, Average loss: 0.12911268933351303, P: 0.7567567567567568, R: 0.6222222222222222, F1: 0.6829268292682927
Epoch 2, Average loss: 0.2839046452670943, P: 0.5675675675675675, R: 0.8076923076923077, F1: 0.6666666666666666
Saving best model: 0.7058823529411764
Epoch 2, Average loss: 0.3827251874527699, P: 0.6486486486486487, R: 0.7741935483870968, F1: 0.70588235

In [59]:
prompt_model.load_state_dict(torch.load("true_soft_prompt_model.ckpt"))
prompt_model = prompt_model.cuda()

test_dataloader = PromptDataLoader(dataset=dataset["test"], template=mytemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

allpreds = []
alllabels = []
for step, inputs in enumerate(test_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    
    prec = precision_score(allpreds, alllabels)
    recall = recall_score(allpreds, alllabels)
    f1 = f1_score(allpreds, alllabels)

print(f"Performance: P = {prec}; R = {recall}; F1 = {f1}")

tokenizing: 426it [00:00, 1151.92it/s]


Performance: P = 0.7727272727272727; R = 0.7555555555555555; F1 = 0.7640449438202247



---

## Multi-Label Classification

Including other labels to improve performance of label 1 (definitions)...


---

## Span Attention

Future work...