In [None]:
import os
import math
import random as rand
from tqdm import tqdm, trange

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModel, TFBertForSequenceClassification, AutoModelForMaskedLM
from datasets import Dataset
import evaluate

import tensorflow as tf

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

# Data Loading

In [2]:
def func_betolt(lr, Trainable, train_dataset, val_dataset, test_dataset, tokenizer):
        # Load BERT tokenizer
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load BERT model
        model = TFBertForSequenceClassification.from_pretrained("yorko/scibert_scivocab_uncased_long_4096", from_pt=True)
        
        # Set up optimizer and loss function
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        
        model.layers[0].trainable = Trainable
        print ("Learning rate: " + str(lr) + "    Trainable: " + str(Trainable))
        model.summary()
        model.compile(optimizer=optimizer, loss=loss, metrics=['sparse_categorical_accuracy'])
        es = tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss", restore_best_weights=True)
        hist = model.fit(train_dataset, epochs=1000, 
                validation_data=val_dataset,
                callbacks=[es],
                verbose=1)
        
        #plt.plot(hist.history["loss"])
        #plt.plot(hist.history["val_loss"])
        vissza = [len(hist.history["loss"]), model.evaluate(train_dataset, verbose=0), model.evaluate(val_dataset, verbose=0), model.evaluate(test_dataset, verbose=0)]
        return vissza

In [3]:
dataset = 'title_abstract_keywords'

In [4]:
df = pd.read_pickle("data/train_{}.pkl".format(dataset))

In [5]:
df.head()

Unnamed: 0,label,text
0,1,Comparing measurement properties of EQ-5D-Y-3L...
1,0,Feasibility of the EQ-5D in the elderly popula...
2,1,Comparing the self-reported health-related qua...
3,1,Testing measurement properties of two EQ-5D yo...
4,1,Use of Antimalarial Agents is Associated with ...


In [6]:
train_dataset = Dataset.from_pandas(df)

In [7]:
train_dataset[0]

{'label': 1,
 'text': 'Comparing measurement properties of EQ-5D-Y-3L and EQ-5D-Y-5L in paediatric patients [SEP] BACKGROUND: The adult versions EQ-5D-3L and EQ-5D-5L have been extensive compared. This is not the case for the EQ-5D youth versions. The study aim was to compare the measurement properties and responsiveness of EQ-5D-Y-3L and EQ-5D-Y-5L in paediatric patients. METHODS: A sample of patients 8-16\xa0years old with different diseases and a wide range of disease severity was asked to complete EQ-5D-Y-3L, EQ-5D-Y-5L, PedsQL Generic Core Scale, and selected, appropriate disease-specific instruments, three times. EQ-5D-Y-3L and EQ-5D-Y-5L were compared in terms of: feasibility, (re-)distribution properties, discriminatory power, convergent validity, test-retest reliability, and responsiveness. RESULTS: 286 participating patients suffered from one of the following diseases: major beta-thalassemia, haemophilia, acute lymphoblastic leukaemia, acute illness. Missing responses were co

In [8]:
#random stratified validation subset split
#_diff = 1
#while _diff >= .02:
#    tts = train_dataset.train_test_split(test_size=.15, shuffle=True)
#    _train_ratio, _val_ratio = np.sum(tts["train"]["label"]) / len(tts["train"]["label"]), np.sum(tts["test"]["label"]) / len(tts["test"]["label"])
#    _diff = abs(_train_ratio - _val_ratio)
#    print(_train_ratio, _val_ratio, _diff)
#
#train_dataset = tts["train"]
#val_dataset = tts["test"]


#subsets should be fixed for all tests
_val_ids = [2, 7, 24, 32, 36, 47, 49, 59, 61, 71, 72, 86, 90, 95, 96]
train_dataset = Dataset.from_pandas(df[~df.index.isin(_val_ids)])
val_dataset = Dataset.from_pandas(df[df.index.isin(_val_ids)])

In [9]:
np.sum(train_dataset["label"]) / len(train_dataset["label"]), np.sum(val_dataset["label"]) / len(val_dataset["label"])

(0.611764705882353, 0.6)

In [10]:
train_dataset[0]

{'label': 1,
 'text': 'Comparing measurement properties of EQ-5D-Y-3L and EQ-5D-Y-5L in paediatric patients [SEP] BACKGROUND: The adult versions EQ-5D-3L and EQ-5D-5L have been extensive compared. This is not the case for the EQ-5D youth versions. The study aim was to compare the measurement properties and responsiveness of EQ-5D-Y-3L and EQ-5D-Y-5L in paediatric patients. METHODS: A sample of patients 8-16\xa0years old with different diseases and a wide range of disease severity was asked to complete EQ-5D-Y-3L, EQ-5D-Y-5L, PedsQL Generic Core Scale, and selected, appropriate disease-specific instruments, three times. EQ-5D-Y-3L and EQ-5D-Y-5L were compared in terms of: feasibility, (re-)distribution properties, discriminatory power, convergent validity, test-retest reliability, and responsiveness. RESULTS: 286 participating patients suffered from one of the following diseases: major beta-thalassemia, haemophilia, acute lymphoblastic leukaemia, acute illness. Missing responses were co

In [11]:
df = pd.read_pickle("data/test_{}.pkl".format(dataset))

In [12]:
test_dataset = Dataset.from_pandas(df)

In [13]:
np.sum(test_dataset["label"]) / len(test_dataset["label"])

0.6

# Preparation for training

In [14]:
tokenizer = AutoTokenizer.from_pretrained("yorko/scibert_scivocab_uncased_long_4096")

In [15]:
#def preprocess_function(examples):
#    return tokenizer(examples["text"], truncation=True, padding=True)

In [16]:
#encodings = dataset.map(preprocess_function, batched=True)
train_encodings = tokenizer(train_dataset["text"], truncation=True, padding=True, max_length=4096)
val_encodings = tokenizer(val_dataset["text"], truncation=True, padding=True, max_length=4096)
test_encodings = tokenizer(test_dataset["text"], truncation=True, padding=True, max_length=4096)

In [17]:
len(train_encodings[0]), len(train_encodings[1]), len(train_encodings[2])

(974, 974, 974)

In [18]:
np.mean([np.sum([t == '[PAD]' for t in train_encodings[e].tokens]) for e in range(0,80)])

505.3125

In [19]:
train_labels = train_dataset["label"]
val_labels = val_dataset["label"]
test_labels = test_dataset["label"]

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(100).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).shuffle(100).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

In [21]:
# Load BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load BERT model
Trainable = True
for lr in [1e-4, 2e-4, 5e-4, 1e-5, 2e-5, 5e-5, 1e-6, 2e-6, 5e-6]:
    for Ismetles in range (0,5):
        TestEredmeny = func_betolt(lr, Trainable, train_dataset, val_dataset, test_dataset, tokenizer)
        print(lr,  TestEredmeny)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 1e-06    Trainable: True
Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/100

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 1e-06    Trainable: True
Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 2e-06    Trainable: True
Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_113 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 2e-06    Trainable: True
Model: "tf_bert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_151 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 5e-06    Trainable: True
Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_189 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.encoder.layer.10.attention.self.value_global.weight', 'bert.encoder.layer.7.attention.self.value_global.weight', 'bert.encoder.layer.9.attention.self.query_global.weight', 'bert.encoder.layer.4.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.query_global.weight', 'bert.encoder.layer.2.attention.self.key_global.weight', 'bert.encoder.layer.8.attention.self.query_global.bias', 'bert.encoder.layer.4.attention.self.query_global.bias', 'bert.encoder.layer.7.attention.self.value_global.bias', 'bert.encoder.layer.1.attention.self.key_global.bias', 'bert.encoder.layer.9.attention.self.key_global.weight', 'bert.encoder.layer.11.attention.self.query_global.weight', 'bert.encoder.layer.1.attention.self.value_global.bias', 'bert.encoder.layer.7.attention.self.query_global.weight', 'bert.encode

Learning rate: 5e-06    Trainable: True
Model: "tf_bert_for_sequence_classification_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  112670976 
                                                                 
 dropout_227 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 112,672,514
Trainable params: 112,672,514
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1