In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Install the packages listed in requirements.txt into the kernel's environment (-q: quiet).
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
import os
import pickle
import re
import shutil
import string

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
from transformers import EvalPrediction
from transformers import TrainingArguments, Trainer

from vecsim_app.categories import CATEGORIES
from vecsim_app.data_utils import papers

# Path of the arXiv metadata snapshot handed to `papers` as `data_path`.
DATA_PATH = "/home/jovyan/arxiv/arxiv-metadata-oai-snapshot.json"
# Year threshold forwarded to `papers` as `year_cutoff`.
YEAR_CUTOFF = 2012
# Regex forwarded to `papers` as `year_pattern`.
# NOTE(review): as written, the alternation matches either the two characters
# "19" or a four-digit "20xx". If four-digit 19xx years are meant to match,
# this should probably be r"((?:19|20)[0-9]{2})" — confirm against how
# `papers` applies the pattern before changing it.
YEAR_PATTERN = r"(19|20[0-9]{2})"

In [4]:
# Build the working DataFrame from whatever `papers` returns, then show the row count.
df = pd.DataFrame(
    papers(
        data_path=DATA_PATH,
        year_cutoff=YEAR_CUTOFF,
        year_pattern=YEAR_PATTERN,
    )
)
df.shape[0]

409500

In [5]:
# Preview the first three rows to check the loaded columns.
df.head(3)

Unnamed: 0,id,title,year,authors,categories,abstract
0,704.0304,The World as Evolving Information,2012,Carlos Gershenson,"cs.IT,cs.AI,math.IT,q-bio.PE",This paper discusses the benefits of describ...
1,704.2744,Nahm transform and parabolic minimal Laplace t...,2012,Szilard Szabo,math.AG,We prove that Nahm transform for integrable ...
2,704.2768,Heat Equations and the Weighted $\bar\partial$...,2012,Andrew Raich,"math.AP,math.CV",The purpose of this article is to establish ...


In [6]:
# Model input text: title and abstract joined by a single space.
# 'categories' is left as the raw comma-separated string; it is split
# inside `preprocess_data` when labels are encoded.
df['text'] = df['title'].str.cat(df['abstract'], sep=' ')

In [7]:
# Show the raw category string of the first paper.
df['categories'].iloc[0]

'cs.IT,cs.AI,math.IT,q-bio.PE'

In [8]:
# Split into train and test.
# A fixed seed makes both splits reproducible across kernel restarts; without
# it every run trains and evaluates on different rows.
SPLIT_SEED = 42

# Keep only 50% of the dataset; the other half is not used below.
df_2, df_unused = train_test_split(df, train_size=0.5, random_state=SPLIT_SEED)
# 80/20 train/test split of the retained half.
df_train, df_test = train_test_split(df_2, train_size=0.8, random_state=SPLIT_SEED)

# NOTE: DataFrame.size is rows * columns (a cell count), not the number of rows.
df.size, df_train.size, df_test.size

(2866500, 1146600, 286650)

In [9]:
def get_tokenizer(tokenizer_model):
    """Load a pretrained tokenizer and build a batch-tokenizing callable for it.

    Args:
        tokenizer_model: Name or path handed to `AutoTokenizer.from_pretrained`.

    Returns:
        A `(tokenize_function, tokenizer)` tuple. `tokenize_function(examples)`
        tokenizes `examples["text"]` with `padding="max_length"` and
        `truncation=True`.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    return tokenize_function, tokenizer

# Load the 'bert-base-uncased' tokenizer. The cells below call `tokenizer`
# directly; `tokenize_function` is not referenced again in this notebook.
tokenize_function, tokenizer = get_tokenizer('bert-base-uncased')

In [11]:
# Fit the binarizer on the keys of CATEGORIES so the label columns are fixed
# by that mapping rather than by whichever categories occur in a given split.
known_categories = list(CATEGORIES.keys())
mlb = MultiLabelBinarizer()
mlb.fit([known_categories])
mlb.classes_[:10]

array(['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA',
       'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci'], dtype=object)

In [12]:
def preprocess_data(examples):
    """Tokenize a batch of texts and attach multi-hot category labels.

    Args:
        examples: Batch mapping with a "text" sequence and a "categories"
            sequence of comma-separated category strings.

    Returns:
        The tokenizer's encoding for the batch (padded/truncated to 128
        tokens) with an added "labels" entry holding the float matrix
        produced by `mlb.transform`.
    """
    batch = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # "a,b,c" -> ["a", "b", "c"] per example, then binarize against the fitted classes.
    label_lists = [joined.split(',') for joined in examples['categories']]
    batch["labels"] = mlb.transform(label_lists).astype(float)
    return batch

In [14]:
def _to_tokenized_dataset(frame):
    """Wrap the 'text'/'categories' columns in a Dataset and map `preprocess_data` over it in batches."""
    hf_dataset = Dataset.from_pandas(frame[['text', 'categories']])
    return hf_dataset, hf_dataset.map(preprocess_data, batched=True)


df_train_hf, tokenized_train = _to_tokenized_dataset(df_train)
df_test_hf, tokenized_test = _to_tokenized_dataset(df_test)

  0%|          | 0/164 [00:00<?, ?ba/s]



  0%|          | 0/41 [00:00<?, ?ba/s]



In [15]:
# Debugging: round-trip the first test example's label vector through the
# binarizer and compare with its original category string.
first_example = tokenized_test[0]
label_row = np.asarray(first_example['labels']).reshape(1, -1)

print("Reversed", mlb.inverse_transform(label_row))
print("Original categories", first_example['categories'])

Reversed [('astro-ph.HE', 'gr-qc')]
Original categories gr-qc,astro-ph.HE


In [None]:
# Start from an empty checkpoint directory. Using shutil/os instead of shell
# commands means the cell does not error when the directory is missing (first
# run) and does not depend on `rm`/`mkdir` being available.
shutil.rmtree('checkpoint', ignore_errors=True)
os.makedirs('checkpoint', exist_ok=True)

# Persist the fitted binarizer in the same directory the model is saved to later.
with open('checkpoint/mlb.pkl', 'wb') as handle:
    pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Training the multi-label classification model

In [None]:
# BERT sequence classifier with one output per binarizer class, configured
# for multi-label classification.
num_categories = len(mlb.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=num_categories,
)

In [None]:
# Adaptation: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
batch_size = 50
metric_name = "f1"

args = TrainingArguments(
    output_dir="paper-multilabel-finetuning",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size // 2,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint on disk.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    eval_accumulation_steps=1,
    # Reload the checkpoint with the best `metric_name` when training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.45):
    """Compute micro-averaged multi-label metrics from raw logits.

    Args:
        predictions: Array-like of logits of shape (batch_size, num_labels).
        labels: Binary indicator array with the same shape as `predictions`.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with keys 'f1' (micro-averaged F1 on the thresholded
        predictions), 'roc_auc' (micro-averaged ROC AUC on the sigmoid
        probabilities) and 'accuracy' (`accuracy_score` on the thresholded
        predictions).
    """
    # Sigmoid turns each logit into a per-label probability.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Threshold the probabilities into 0/1 predictions.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric and must be scored on the probabilities;
    # feeding it the thresholded 0/1 predictions reduces the curve to a single
    # operating point and misreports the score.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an `EvalPrediction` to `multi_label_metrics`.

    Args:
        p: Evaluation output exposing `predictions` and `label_ids`.

    Returns:
        The metrics dict returned by `multi_label_metrics`.
    """
    # `predictions` may be a tuple; in that case its first element is used
    # (presumably the logits — confirm for models that return extra outputs).
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Pass the unwrapped `preds`; the original forwarded `p.predictions`,
    # which left the tuple case unhandled.
    return multi_label_metrics(predictions=preds, labels=p.label_ids)

In [None]:
# Wire model, training arguments, tokenized datasets and the metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above; the return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the test split. The result is stored under `eval_metrics`
# rather than `eval`, so the Python builtin `eval` is not shadowed for the
# rest of the kernel session.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Pick one paper and show its true categories.
text = df['text'].iloc[5]
categories = df['categories'].iloc[5]
print(categories)

# Tokenize with the same truncation length used in `preprocess_data`, so a
# long title+abstract cannot exceed the model's maximum sequence length and
# the input matches what the model saw during training.
encoding = tokenizer(text, truncation=True, max_length=128, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

# Inference only: make sure dropout is off and skip gradient tracking.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)
logits = outputs.logits

In [None]:
# Logits -> per-label probabilities -> 0/1 decisions at the 0.45 cutoff.
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = (probs >= 0.45).numpy().astype(float)

In [None]:
# Show the input text followed by the category names decoded from the 0/1 vector.
print(text)
predicted_categories = mlb.inverse_transform(predictions.reshape(1, -1))
print(predicted_categories)

In [None]:
# Save the trainer's model into ./checkpoint, the directory that also holds the pickled binarizer (mlb.pkl).
trainer.save_model(output_dir='./checkpoint')