In [None]:
import os

from google.colab import drive

# Make CUDA kernel launches synchronous so that GPU errors are reported at the
# call that caused them. This is a debugging aid and slows training down.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Mount Google Drive, then work from the project folder so that the relative
# "./input" and "./model" paths used later in the notebook resolve there.
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/Colab Notebooks/LayoutQT')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Install Hugging Face Transformers with the %pip magic so the package lands in
# the environment of the running kernel (a plain `!pip` runs in a subshell).
# TODO: pin the exact version this notebook was validated with.
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It
            must contain an ``"input_ids"`` entry, whose length defines the
            size of the dataset.
        labels: Optional sequence of labels aligned with ``encodings``
            (list, NumPy array, ...). When ``None`` or empty, items carry no
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None/length explicitly: a bare truthiness check raises
        # ValueError ("truth value ... is ambiguous") for NumPy arrays and
        # pandas Series with more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
def process_data_for_bert(data, tokenizer=None, max_length=512):
    """Tokenize the ``texttags`` column of ``data`` and pair it with ``label``.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) exposing a
            ``texttags`` column with the input texts and a ``label`` column
            with the class ids.
        tokenizer: Optional, already-loaded tokenizer. When omitted, the
            ``bert-base-uncased`` ``BertTokenizer`` is loaded on each call.
        max_length: Maximum sequence length; longer inputs are truncated.

    Returns:
        A ``Dataset`` wrapping the padded, truncated encodings and the labels.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Coerce every entry to str here rather than relying on each caller to do
    # it: non-string cells (such as NaN from an empty CSV field) are not valid
    # tokenizer input.
    texts = [str(text) for text in data['texttags']]

    X_tokenized = tokenizer(texts, padding=True, truncation=True, max_length=max_length)
    y = list(data['label'])

    return Dataset(X_tokenized, y)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForMaskedLM
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def runBert(df_train_dataset, df_val_dataset, df_test_dataset,
            num_labels=16, batch_size=16, num_train_epochs=5,
            output_dir="./model/layoutqt_rvlcdip"):
    """Fine-tune ``bert-base-uncased`` for sequence classification and predict.

    Args:
        df_train_dataset: Training split; passed to ``process_data_for_bert``.
        df_val_dataset: Validation split, evaluated at the end of every epoch
            and used to select the best checkpoint.
        df_test_dataset: Test split to predict on after training.
        num_labels: Number of target classes of the classification head.
        batch_size: Per-device batch size for both training and evaluation.
        num_train_epochs: Number of fine-tuning epochs.
        output_dir: Directory where checkpoints are written.

    Returns:
        Array of predicted class indices for ``df_test_dataset`` (argmax over
        the model's raw predictions).
    """
    checkpoint = "bert-base-uncased"

    # The tokenizer handed to the Trainer is saved next to every checkpoint.
    # Load it with the checkpoint defaults so it agrees with the tokenizer
    # that process_data_for_bert uses to encode the data.
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    train_dataset = process_data_for_bert(df_train_dataset)
    val_dataset = process_data_for_bert(df_val_dataset)
    test_dataset = process_data_for_bert(df_test_dataset)

    def compute_metrics(p):
        """Turn an (predictions, labels) pair into a dict of scores."""
        pred, labels = p
        pred = np.argmax(pred, axis=1)

        # NOTE: with micro averaging on single-label data, precision, recall
        # and F1 all come out equal to accuracy (the logged epoch table shows
        # identical columns). Kept as-is so model selection is unchanged.
        accuracy = accuracy_score(y_true=labels, y_pred=pred)
        recall = recall_score(y_true=labels, y_pred=pred, average='micro')
        precision = precision_score(y_true=labels, y_pred=pred, average='micro')
        f1 = f1_score(y_true=labels, y_pred=pred, average='micro')

        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

    args = TrainingArguments(
        output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        # Reload the checkpoint with the best validation F1 once training ends.
        load_best_model_at_end=True,
        metric_for_best_model='f1',
    )

    def model_init():
        """Instantiate a fresh model each time the Trainer asks for one.

        The Trainer calls this once on construction and again at the start of
        train(), after seeding the RNG. Building the model here (rather than
        returning a pre-built instance) lets the new classification head be
        initialised under that seed.
        """
        return AutoModelForSequenceClassification.from_pretrained(
            checkpoint, num_labels=num_labels
        )

    trainer = Trainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # predict() returns (predictions, label_ids, metrics); only the raw
    # per-class scores are needed here.
    raw_pred, _, _ = trainer.predict(test_dataset)

    # Highest-scoring class per example.
    y_pred = np.argmax(raw_pred, axis=1)

    return y_pred

In [None]:
import numpy as np
import pandas as pd

             
# Load the three pre-split RVL-CDIP tag files.
df_train = pd.read_csv("./input/rvlcdip/train_tags.csv")
df_val = pd.read_csv("./input/rvlcdip/val_tags.csv")
df_test = pd.read_csv("./input/rvlcdip/test_tags.csv")

# The tokenizer only accepts strings, so coerce the text column of every
# split, including validation, which was previously left untouched.
df_train['texttags'] = df_train['texttags'].apply(str)
df_val['texttags'] = df_val['texttags'].apply(str)
df_test['texttags'] = df_test['texttags'].apply(str)

# Fine-tune on train/val and collect predicted class ids for the test split.
y_pred = runBert(df_train, df_val, df_test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6859,0.662924,0.802913,0.802913,0.802913,0.802913
2,0.5335,0.615903,0.823301,0.823301,0.823301,0.823301
3,0.4103,0.63967,0.82767,0.82767,0.82767,0.82767
4,0.2993,0.702951,0.828641,0.828641,0.828641,0.828641


***** Running Evaluation *****
  Num examples = 2060
  Batch size = 16
Saving model checkpoint to ./model/layoutqt_rvlcdip/checkpoint-7812
Configuration saved in ./model/layoutqt_rvlcdip/checkpoint-7812/config.json
Model weights saved in ./model/layoutqt_rvlcdip/checkpoint-7812/pytorch_model.bin
tokenizer config file saved in ./model/layoutqt_rvlcdip/checkpoint-7812/tokenizer_config.json
Special tokens file saved in ./model/layoutqt_rvlcdip/checkpoint-7812/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2060
  Batch size = 16
Saving model checkpoint to ./model/layoutqt_rvlcdip/checkpoint-15624
Configuration saved in ./model/layoutqt_rvlcdip/checkpoint-15624/config.json
Model weights saved in ./model/layoutqt_rvlcdip/checkpoint-15624/pytorch_model.bin
tokenizer config file saved in ./model/layoutqt_rvlcdip/checkpoint-15624/tokenizer_config.json
Special tokens file saved in ./model/layoutqt_rvlcdip/checkpoint-15624/special_tokens_map.json
***** Running Evaluation 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6859,0.662924,0.802913,0.802913,0.802913,0.802913
2,0.5335,0.615903,0.823301,0.823301,0.823301,0.823301
3,0.4103,0.63967,0.82767,0.82767,0.82767,0.82767
4,0.2993,0.702951,0.828641,0.828641,0.828641,0.828641
5,0.2442,0.744823,0.830097,0.830097,0.830097,0.830097


***** Running Evaluation *****
  Num examples = 2060
  Batch size = 16
Saving model checkpoint to ./model/layoutqt_rvlcdip/checkpoint-39060
Configuration saved in ./model/layoutqt_rvlcdip/checkpoint-39060/config.json
Model weights saved in ./model/layoutqt_rvlcdip/checkpoint-39060/pytorch_model.bin
tokenizer config file saved in ./model/layoutqt_rvlcdip/checkpoint-39060/tokenizer_config.json
Special tokens file saved in ./model/layoutqt_rvlcdip/checkpoint-39060/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/layoutqt_rvlcdip/checkpoint-39060 (score: 0.8300970873786409).
***** Running Prediction *****
  Num examples = 39999
  Batch size = 16


In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

# Overall test-set scores. With micro averaging on single-label predictions
# the F1 value coincides with accuracy, as the printed numbers show.
test_labels = df_test['label']
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(test_labels, y_pred)),
    ("F1:", f1_score(test_labels, y_pred, average='micro')),
):
    print(metric_name, metric_value)


Accuracy: 0.8423710592764819
F1: 0.8423710592764819


In [None]:
# Per-class precision / recall / F1 breakdown on the test split.
report = classification_report(df_test['label'], y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.85      0.86      2464
           1       0.78      0.78      0.78      2506
           2       0.97      0.96      0.96      2516
           3       0.81      0.79      0.80      2531
           4       0.71      0.72      0.71      2515
           5       0.80      0.80      0.80      2498
           6       0.91      0.87      0.89      2572
           7       0.92      0.90      0.91      2472
           8       0.67      0.82      0.74      2527
           9       0.84      0.81      0.83      2463
          10       0.82      0.79      0.81      2505
          11       0.86      0.84      0.85      2477
          12       0.79      0.81      0.80      2489
          13       0.88      0.86      0.87      2435
          14       0.99      0.98      0.98      2537
          15       0.90      0.89      0.90      2492

    accuracy                           0.84     39999
   macro avg       0.85   