In [None]:
from utilities import *
import numpy as np 
import pandas as pd 
import json
import os
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
#from kaggle_datasets import KaggleDatasets
import transformers

from tokenizers import BertWordPieceTokenizer
from transformers import TFAutoModel, AutoTokenizer

In [None]:
# Location of the arXiv metadata snapshot (local copy); the Kaggle location is
# '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
datapath = '/home/koki/Desktop/Data/NLP/arxiv/archive/arxiv-metadata-oai-snapshot.json'
sample_rate = 1  # sample rate in percentage, 1% is just for testing
labelmap = {}

# Keyword arguments that are identical for all three splits.
common_args = dict(datapath=datapath, sample_rate=sample_rate)

# Training split: January-December 2021. This is the only call made with
# update_map=True; the label map it returns is reused for the other splits.
abstracts_train, labels_train, labelmap = get_data_and_labels(
    year=2021, month_start=1, month_end=12,
    labelmap=labelmap, update_map=True,
    **common_args,
)

# Validation split: January-June 2022, read with the label map from training.
abstracts_val, labels_val, _ = get_data_and_labels(
    year=2022, month_start=1, month_end=6,
    labelmap=labelmap, update_map=False,
    **common_args,
)

# Test split: July-December 2022, read with the label map from training.
abstracts_test, labels_test, _ = get_data_and_labels(
    year=2022, month_start=7, month_end=12,
    labelmap=labelmap, update_map=False,
    **common_args,
)

### Loading a BERT model

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the encoder are both loaded from it.
modelname= 'bert-base-uncased'
# Tokenizer for that checkpoint (used by bert_tokenize).
bert_tokenizer = AutoTokenizer.from_pretrained(modelname)
# TensorFlow encoder; it is called inside the Keras models built further down.
bert_model = TFAutoModel.from_pretrained(modelname) 

In [None]:
SEQ_LEN = 512  # number of tokens every text is padded or truncated to


def bert_tokenize(sentence):
    """Tokenize a single text with the module-level ``bert_tokenizer``.

    The text is truncated or padded to exactly ``SEQ_LEN`` tokens, special
    tokens are added and no token-type ids are requested.

    Args:
        sentence: the text to tokenize.

    Returns:
        A pair ``(input_ids, attention_mask)`` of TensorFlow tensors as
        produced by the tokenizer (presumably of shape ``(1, SEQ_LEN)`` --
        TODO confirm).
    """
    encoding = bert_tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=SEQ_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return input_ids, attention_mask

### Tokenization and computing the attention mask

In [None]:
def get_ids_and_masks(abstracts):
    """Tokenize every abstract into fixed-length id and attention-mask arrays.

    Args:
        abstracts: sized iterable of texts; each one is passed to
            ``bert_tokenize``.

    Returns:
        A pair ``(Xids, Xmask)`` of ``int32`` numpy arrays, both of shape
        ``(len(abstracts), SEQ_LEN)``; row ``i`` holds the token ids /
        attention mask of ``abstracts[i]``.
    """
    n_docs = len(abstracts)
    # Token ids and masks are whole numbers and the Keras inputs are declared
    # int32, so store them as int32 instead of numpy's float64 default
    # (half the memory, no float round trip).
    Xids = np.zeros((n_docs, SEQ_LEN), dtype=np.int32)
    Xmask = np.zeros((n_docs, SEQ_LEN), dtype=np.int32)

    for i, sentence in enumerate(abstracts):
        if i % 5000 == 0:
            print('#  processed documents', i)
        Xids[i, :], Xmask[i, :] = bert_tokenize(sentence)
    return Xids, Xmask

# Tokenize the three splits; each call returns the id array and the mask array for that split.
Xids_train, Xmask_train = get_ids_and_masks(abstracts_train)
Xids_val, Xmask_val = get_ids_and_masks(abstracts_val)
Xids_test, Xmask_test = get_ids_and_masks(abstracts_test)

In [None]:
# needed to create a tensorflow dataset
def map_func(input_ids, masks, labels):
    """Repack a flat ``(input_ids, masks, labels)`` triple as ``(features, labels)``.

    The features dict uses the keys ``'input_ids'`` and ``'attention_mask'``,
    which are the names given to the Keras input layers of the models in this
    notebook.

    Returns:
        ``({'input_ids': input_ids, 'attention_mask': masks}, labels)``
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

In [None]:
# Flatten the per-document label lists of the training split. Labels are
# consecutive integers starting at 0, so the largest id plus one is the
# number of classes.
train_label_ids = [label_id for doc_labels in labels_train for label_id in doc_labels]
nr_classes = np.max(train_label_ids) + 1
nr_classes

In [None]:
def encode_labels(labels, nr_classes=nr_classes):
    """Turn per-document lists of class ids into a multi-hot matrix.

    Args:
        labels: sized iterable; element ``i`` is an iterable of the integer
            class ids assigned to document ``i``.
        nr_classes: number of columns of the result; the default is the value
            of the module-level ``nr_classes`` at definition time.

    Returns:
        Float numpy array of shape ``(len(labels), nr_classes)`` containing 1
        at ``[i, c]`` for every class id ``c`` of document ``i`` and 0
        elsewhere.
    """
    multi_hot = np.zeros((len(labels), nr_classes))  # start from an all-zero matrix
    for row, class_ids in enumerate(labels):
        for class_id in class_ids:
            multi_hot[row, class_id] = 1  # flag each class the document belongs to
    return multi_hot

# Multi-hot encode the labels of every split with the same number of classes.
enc_labels_train = encode_labels(labels_train)
enc_labels_val = encode_labels(labels_val)
enc_labels_test = encode_labels(labels_test)

# Sanity check: all splits must have the same number of label columns.
assert enc_labels_train.shape[1] == enc_labels_val.shape[1]
assert enc_labels_train.shape[1] == enc_labels_test.shape[1]

In [None]:
def get_dataset(Xids, Xmask, enc_labels, batch_size=64, shuffle_buffer=None):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        Xids: array of token ids, one row per document.
        Xmask: array of attention masks, aligned with ``Xids``.
        enc_labels: array of encoded labels, aligned with ``Xids``.
        batch_size: number of examples per batch (default 64).
        shuffle_buffer: if given (non-zero), examples are shuffled with a
            buffer of this size before batching; the default ``None`` keeps
            the input order.

    Returns:
        A dataset whose elements are the result of ``map_func`` applied to the
        slices, grouped into batches of ``batch_size``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, enc_labels))
    dataset = dataset.map(map_func)
    if shuffle_buffer:
        # Shuffle individual examples (not whole batches) so batch composition varies.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size)

# Batched datasets for training and validation (features + multi-hot labels).
train = get_dataset(Xids_train, Xmask_train, enc_labels_train)
val = get_dataset(Xids_val, Xmask_val, enc_labels_val)
# NOTE(review): the test dataset is built with labels=None -- confirm that
# from_tensor_slices accepts a None component in the installed TensorFlow version.
test = get_dataset(Xids_test, Xmask_test, None)

### Defining the model

In [None]:
def get_mlp_model():
    """Build the frozen-encoder + MLP multi-label classifier.

    Architecture: token ids and attention mask -> ``bert_model`` (first output,
    the last hidden state) -> global max pooling over the sequence axis ->
    batch normalization -> Dense(128, leaky ReLU) -> Dropout(0.3) ->
    Dense(``nr_classes``, sigmoid).

    Uses the module-level ``bert_model``, ``SEQ_LEN`` and ``nr_classes``.
    As a side effect ``bert_model`` is marked non-trainable.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs named ``'input_ids'`` and
        ``'attention_mask'`` and an output layer named ``'outputs'``.
    """
    input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
    mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

    embeddings = bert_model(input_ids, attention_mask=mask)[0]  # we only keep last_hidden_state

    X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Dense(128, activation='leaky_relu')(X)
    X = tf.keras.layers.Dropout(0.3)(X)
    # One independent sigmoid per class: a document may carry several labels.
    y = tf.keras.layers.Dense(nr_classes, activation='sigmoid', name='outputs')(X)

    mlp_model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

    # Freeze the encoder on the object itself rather than searching the layer
    # list for a 'tf_bert...' name, so this also works when `modelname` points
    # to a checkpoint whose Keras layer is named differently.
    bert_model.trainable = False

    return mlp_model

In [None]:
# Reset Keras' global state; necessary if we want to create a new model with a clean state
# (e.g. when this section is re-run).
tf.keras.backend.clear_session()

In [None]:
# Build the MLP classifier and print its layer / parameter overview.
mlp_model = get_mlp_model()
mlp_model.summary()

In [None]:
# the F1 metrics defined on tensors, might be used for early stopping

@tf.function
def macro_f1(y, y_hat, thresh=0.5):
    """Macro-averaged F1 score of thresholded predictions.

    Counts are taken along axis 0 of the tensors passed in, i.e. per column
    (class) over the rows of that call only.

    Args:
        y: ground-truth multi-hot labels; any numeric dtype.
        y_hat: predicted scores, same shape as ``y``.
        thresh: scores strictly greater than this value count as positive.

    Returns:
        Scalar float32 tensor: the mean over classes of the per-class F1.
        Classes with no true and no predicted positives contribute 0.
    """
    # Bring the labels to float32 so the products below do not fail on a dtype
    # mismatch when the labels arrive as float64 or integers.
    y = tf.cast(y, tf.float32)
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)

    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y, axis=0), tf.float32)

    # The small constant keeps the ratio defined when a class has no counts at all.
    f1 = 2*tp / (2*tp + fn + fp + 1e-16)
    return tf.reduce_mean(f1)


In [None]:
# Training configuration for the MLP model.
learning_rate = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate)
#optimizer = tf.keras.optimizers.SGD(learning_rate)
# Candidate losses; index 1 (binary focal cross-entropy) is the one selected below.
losses = [tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.BinaryFocalCrossentropy()]
loss = losses[1]
# NOTE(review): `acc` is created but not included in the metrics passed to compile().
acc = tf.keras.metrics.BinaryAccuracy('accuracy')
auc = tf.keras.metrics.AUC()

# Track the custom macro F1 and the AUC during training.
mlp_model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1, auc])

In [None]:
# create the directory in which the model checkpoints are stored (Kaggle working directory)
!mkdir -p training

In [None]:
# File the MLP weights are checkpointed to.
checkpoint_path = "/kaggle/working/training/mlp_focal_cp.ckpt"
# Directory part of the checkpoint path (not used further in the cells shown here).
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
# (weights only, not the full model; verbose=1 logs each save)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
# Early stopping on 'val_auc' (higher is better) with a patience of 4 epochs;
# the best weights seen are restored when training stops.
es_callback = tf.keras.callbacks.EarlyStopping(patience=4, monitor='val_auc', \
                                            mode='max', restore_best_weights=True)
# Train for at most 20 epochs, validating on `val`; both callbacks are active.
history = mlp_model.fit(train, validation_data=val, callbacks=[es_callback, cp_callback], epochs=20)

In [None]:
# NOTE: the LSTM model is configured and compiled further down, in the cell that
# follows `lstm_model = get_lstm_model()`. Compiling it here would raise a
# NameError on a fresh kernel, because `lstm_model` does not exist yet.

### A model with an LSTM layer

In [None]:
# Reset Keras' global state before the LSTM model is built.
tf.keras.backend.clear_session()

In [None]:
def get_lstm_model():
    """Build the frozen-encoder + LSTM multi-label classifier.

    Architecture: token ids and attention mask -> ``bert_model`` (first output,
    the last hidden state) -> LSTM(256, final state only) -> batch
    normalization -> Dropout(0.3) -> Dense(``nr_classes``, sigmoid).

    Uses the module-level ``bert_model``, ``SEQ_LEN`` and ``nr_classes``.
    As a side effect ``bert_model`` is marked non-trainable.

    NOTE(review): the LSTM is not given the attention mask, so it presumably
    also runs over padded positions -- confirm whether masking is wanted.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs named ``'input_ids'`` and
        ``'attention_mask'`` and an output layer named ``'outputs'``.
    """
    input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
    mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

    embeddings = bert_model(input_ids, attention_mask=mask)[0]  # we only keep tensor 0 (last_hidden_state)

    X = tf.keras.layers.LSTM(256, kernel_initializer='random_normal', return_sequences=False)(embeddings)
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Dropout(0.3)(X)
    # The targets are multi-hot and the losses used are binary (per class), so
    # each class needs its own sigmoid; a softmax would force the class scores
    # to sum to one and is inconsistent with the MLP model's output layer.
    y = tf.keras.layers.Dense(nr_classes, activation='sigmoid', name='outputs')(X)

    lstm_model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

    # Freeze the encoder on the object itself instead of looking it up by the
    # hard-coded layer name 'tf_bert_model', which fails as soon as the encoder
    # layer carries any other name.
    bert_model.trainable = False

    return lstm_model

# Build the LSTM classifier and print its layer / parameter overview.
lstm_model = get_lstm_model()
lstm_model.summary()

In [None]:
# Training configuration for the LSTM model.
learning_rate = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate)
#optimizer = tf.keras.optimizers.SGD(learning_rate)
# `idx` selects the loss from `losses` below (1 -> binary focal cross-entropy);
# it is also used to index `checkpoint_paths` in the checkpoint cell.
idx = 1
losses = [tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.BinaryFocalCrossentropy()]
loss = losses[idx]
# NOTE(review): `acc` is created but not included in the metrics passed to compile().
acc = tf.keras.metrics.BinaryAccuracy('accuracy')
auc = tf.keras.metrics.AUC()

# Track the custom macro F1 and the AUC during training.
lstm_model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1, auc])

In [None]:
# define a suitable name
# One checkpoint file per loss, in the same order as `losses`, so that `idx`
# picks the file matching the loss the model was compiled with.
checkpoint_paths = ["/kaggle/working/training_1/cp_lstm_cross_entropy.ckpt", \
                    "/kaggle/working/training_1/cp_lstm_focal_loss.ckpt"]
checkpoint_dir = os.path.dirname(checkpoint_paths[idx])
# Make sure the target directory exists before training starts.
os.makedirs(checkpoint_dir, exist_ok=True)

# Create a callback that saves the model's weights.
# The LSTM-specific path is used here, so the weights saved for the MLP model
# under `checkpoint_path` are not overwritten.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_paths[idx],
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
# Early stopping on 'val_macro_f1' (higher is better) with a patience of 4 epochs;
# the best weights seen are restored when training stops.
es_callback = tf.keras.callbacks.EarlyStopping(patience=4, monitor='val_macro_f1', \
                                            mode='max', restore_best_weights=True)
# Train for at most 20 epochs, validating on `val`; both callbacks are active.
history = lstm_model.fit(train, validation_data=val, callbacks=[es_callback, cp_callback], epochs=20)