In [1]:
# Print the version of the `python` executable on the runtime (shell command run via `!`).
!python --version

Python 3.7.12


In [2]:
!pip3 install transformers



In [3]:
# Colab-only step: expose the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from pathlib import Path
# Directory (on the mounted Google Drive) that holds the dataset CSV files:
# train.csv, test.csv and test_labels.csv. File names are joined onto this
# path with the `/` operator wherever a file is loaded.
data_dir = Path("/content/drive/MyDrive/data/foscdata")

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import transformers
# Show the installed transformers version in the cell output.
print(f"Running on transformers v{transformers.__version__}")

Running on transformers v4.17.0


In [6]:
def onehot(row):
    """Parse the stringified label vector stored in ``row['labelsI']``.

    The column holds a bracketed vector written as text, e.g. ``"[0 1 0]"``.
    Brackets and commas are treated as plain separators, so padded vectors
    such as ``"[ 0.  1.]"``, vectors wrapped over several lines and
    comma-separated vectors such as ``"[0, 1, 0]"`` are all accepted.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'labelsI'``
            string entry.

    Returns:
        list[float]: The vector entries in their original order.

    Raises:
        ValueError: If an entry cannot be converted to ``float``.
    """
    raw = row['labelsI']
    # Replacing the delimiters by spaces (instead of stripping brackets from
    # each token) avoids empty tokens like '[' -> '' that float() rejects.
    for delimiter in '[],':
        raw = raw.replace(delimiter, ' ')
    return [float(token) for token in raw.split()]

In [7]:
def _load_split(csv_path, n_samples, seed):
    """Read one CSV split and return its (sub-sampled) payload/labels frame."""
    split_df = pd.read_csv(csv_path)
    split_df['labels'] = split_df.apply(onehot, axis=1)
    split_df = split_df[['payload', 'labels']]
    if n_samples is not None:
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds, so the request is capped at the available row count.
        split_df = split_df.sample(n=min(n_samples, len(split_df)), random_state=seed)
    return split_df


def fetch_data(train_, test_, n_train=256000, n_test=32000, seed=42):
    """Load a train/test pair of CSV files and sub-sample both.

    Intended for either of the two dataset pairs:

    * ``foscdata/train_train.csv`` + ``foscdata/train_val.csv``
    * ``foscdata/train.csv`` + ``foscdata/test.csv``

    Each file must provide a ``payload`` column and a ``labelsI`` column; the
    latter is parsed with ``onehot`` into a ``labels`` column of float lists.

    Args:
        train_: Path of the training CSV.
        test_: Path of the evaluation CSV.
        n_train: Maximum number of training rows to sample; ``None`` keeps
            every row in file order.
        n_test: Maximum number of evaluation rows to sample; ``None`` keeps
            every row in file order.
        seed: Random state used for sampling so reruns pick the same rows.

    Returns:
        tuple: ``(train_df, test_df)``, each with the columns ``payload`` and
        ``labels``.
    """
    train_df = _load_split(train_, n_train, seed)
    test_df = _load_split(test_, n_test, seed)
    return train_df, test_df

In [8]:
def compute_metrics(eval_pred, threshold=0.5):
    """Compute multi-label accuracy, precision, recall and F1 for an evaluation run.

    The predictions handed over by the trainer are the classifier's raw,
    unnormalised logits, not probabilities. The probability ``threshold`` is
    therefore translated into logit space before it is applied:
    ``sigmoid(z) >= p`` is equivalent to ``z >= log(p / (1 - p))``, which is
    ``z >= 0`` for the default ``p = 0.5``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; predictions are the model
            logits (or a tuple whose first element holds them), labels the
            multi-hot ground truth.
        threshold: Probability in the open interval (0, 1) at or above which a
            label counts as predicted.

    Returns:
        dict: ``accuracy`` plus micro/macro averaged ``precision``, ``recall``
        and ``f1`` under the keys ``'<average>-<metric>'``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Models that return extra outputs deliver a tuple; the logits come first.
        logits = logits[0]

    # Numerically stable alternative to applying a sigmoid and comparing to `threshold`.
    logit_cutoff = np.log(threshold / (1.0 - threshold))
    preds = np.asarray(logits) >= logit_cutoff

    metrics_ = dict()
    metrics_['accuracy'] = accuracy_score(labels, preds)
    scorers = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))
    for metric_name, scorer in scorers:
        for average in ('micro', 'macro'):
            key = '{}-{}'.format(average, metric_name)
            # zero_division=0: labels never predicted score 0 instead of warning.
            metrics_[key] = scorer(labels, preds, average=average, zero_division=0)

    print('\n' + 'accuracy: {}'.format(metrics_['accuracy']) + '\n')
    for key, value in metrics_.items():
        if key != 'accuracy':
            print('{}: {}'.format(key, value) + '.\n')

    return metrics_

In [9]:
import torch

class FOSCDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their label vectors.

    ``encodings`` is a mapping from field name to a per-sample sequence, and
    ``labels`` is a sequence with one entry per sample. Every item is a dict
    of tensors holding one entry per encoding field plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [10]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, BertModel, BertForSequenceClassification,
                          TrainingArguments, Trainer)
from transformers.modeling_outputs import SequenceClassifierOutput


class FOSCModel:
    """Bundle of tokenizer, multi-label classifier and training arguments.

    Attributes:
        tokenizer: Tokenizer loaded from ``model_ckpt``.
        model: Sequence-classification model loaded from ``model_ckpt`` and
            configured for ``multi_label_classification`` with ``num_labels``
            outputs.
        args: ``TrainingArguments`` that evaluate and save once per epoch and
            reload the best checkpoint at the end of training.

    Args:
        model_ckpt: Name or path of the pretrained checkpoint.
        num_labels: Number of labels the classifier predicts.
        output_dir: Directory the trainer writes checkpoints to.
        batch_size: Per-device batch size for both training and evaluation.
        num_train_epochs: Number of training epochs.
        learning_rate: Initial learning rate.
        weight_decay: Weight decay passed to the training arguments.
        seed: Random seed passed to the training arguments.
    """

    def __init__(self, model_ckpt, num_labels, output_dir='fosc_ckpts',
                 batch_size=16, num_train_epochs=2, learning_rate=2e-5,
                 weight_decay=0.01, seed=42):

        # `problem_type` is a model-config option, so it is passed to the model
        # only; the tokenizer needs nothing but the checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_ckpt,
            num_labels=num_labels,
            problem_type='multi_label_classification')

        self.args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            evaluation_strategy='epoch',
            # Save and evaluate on the same schedule, as required for
            # load_best_model_at_end.
            save_strategy='epoch',
            learning_rate=learning_rate,
            load_best_model_at_end=True,
            weight_decay=weight_decay,
            seed=seed
            )

In [11]:
#trainer
class FOSCTrainer:
    """Thin wrapper that builds a ``Trainer`` and exposes it as ``self.trainer``.

    The trainer is wired with the given model, training arguments, tokenizer,
    training dataset and evaluation dataset, and reports metrics through
    ``compute_metrics``.
    """

    def __init__(self, model_, args_, tokenizer_, train_, test_):
        trainer_kwargs = dict(
            model=model_,
            args=args_,
            tokenizer=tokenizer_,
            train_dataset=train_,
            eval_dataset=test_,
            compute_metrics=compute_metrics,
        )
        self.trainer = Trainer(**trainer_kwargs)

In [None]:
#run
if __name__ == "__main__":

    # Run settings gathered in one place instead of being scattered as literals.
    model_ckpt = 'allenai/scibert_scivocab_uncased'
    num_labels = 20     # number of outputs requested from the classifier
    max_length = 128    # longer token sequences are truncated to this length

    # Use the GPU when one is available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the train/validation pair.
    train, val = fetch_data(data_dir/'train_train.csv', data_dir/'train_val.csv')
    print('Data loaded successfully.')

    # Tokenizer, model and training arguments come bundled in FOSCModel.
    fosc_m = FOSCModel(
        model_ckpt=model_ckpt,
        num_labels=num_labels,
        )
    print('FOSCModel instantiated.')

    tokenizer = fosc_m.tokenizer
    print('Tokenizer instantiated.')

    # Tokenize without padding: sequences are only truncated here.
    train_encodings = tokenizer(train['payload'].values.tolist(), truncation=True, max_length=max_length)
    train_labels = train['labels'].values.tolist()
    test_encodings = tokenizer(val['payload'].values.tolist(), truncation=True, max_length=max_length)
    test_labels = val['labels'].values.tolist()

    # Wrap encodings and labels in torch datasets.
    train_dataset = FOSCDataset(train_encodings, train_labels)
    test_dataset = FOSCDataset(test_encodings, test_labels)
    print('Dataloaders instantiated.')

    model = fosc_m.model.to(device)
    print('Model instantiated.')

    args = fosc_m.args
    print('Trainer arguments set.')

    fosc_t = FOSCTrainer(
        model_=model,
        args_=args,
        tokenizer_=tokenizer,
        train_=train_dataset,
        test_=test_dataset
        )

    trainer = fosc_t.trainer
    print('Trainer now running:')

    # Fine-tune the model.
    trainer.train()

    # Final evaluation on the validation split. The result is captured and
    # printed explicitly because a bare expression nested inside this `if`
    # block is not echoed by the notebook, so the returned metrics would
    # otherwise be lost.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)


Data loaded successfully.


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

FOSCModel instantiated.
Tokenizer instantiated.
Dataloaders instantiated.


***** Running training *****
  Num examples = 256000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 32000


Model instantiated.
Trainer arguments set.
Trainer now running:


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 32000
  Batch size = 16
