# Fast.ai NLP for beginners using Transformers library

[Reference: Fast.AI Getting started with NLP in Transformers](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners/notebook)

## Dependencies

In [1]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

## Imports

In [2]:
import os
import zipfile as zp
import matplotlib.pyplot as plt
from google.colab import userdata
import shutil
import datasets
import torch
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_metric, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import pandas as pd
from typing import Union

# Kaggle credentials exported as environment variables for the `kaggle` CLI call
# further down (assumption: the CLI picks these variables up - confirm against its docs).
os.environ['KAGGLE_USERNAME'] = 'kevvo83'
# The API key is never hardcoded: it is fetched from the Colab secret named KAGGLE_KEY.
os.environ['KAGGLE_KEY'] =  userdata.get('KAGGLE_KEY') # Add the KAGGLE_KEY secret to google colab secrets

## Train/Val/Test Datasets download

In [3]:
# Prerequisite: manually accept the competition rules first at
# https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/rules

!kaggle competitions download -c 'us-patent-phrase-to-phrase-matching'

Downloading us-patent-phrase-to-phrase-matching.zip to /content
  0% 0.00/682k [00:00<?, ?B/s]
100% 682k/682k [00:00<00:00, 101MB/s]


In [4]:
# Downloaded competition archive and the directory it is extracted into.
zipfile = '/content/us-patent-phrase-to-phrase-matching.zip'
path = '/content/us-patent-phrase-to-phrase-matching'

# Start from a clean directory so re-running this cell never mixes files from
# an earlier extraction with the fresh ones.
if os.path.exists(path):
  shutil.rmtree(path, ignore_errors=True)

# The context manager closes the archive's file handle as soon as extraction is
# done, instead of leaving it open until garbage collection.
with zp.ZipFile(zipfile) as archive:
  archive.extractall(path)

In [5]:
# Read train.csv from the extracted competition folder. `load_dataset` files a
# lone CSV under the 'train' split, which is then shuffled with a fixed seed so
# every run sees the rows in the same order.
train_val_ds = (
    load_dataset(
        "/content/us-patent-phrase-to-phrase-matching/",
        data_files='train.csv',
        streaming=False,
    )
    .get('train')
    .shuffle(seed=44)
)

Generating train split: 0 examples [00:00, ? examples/s]

In [23]:
# Read test.csv the same way. Despite holding the test rows, the data still
# arrives under the 'train' key, so that is the split that gets pulled out.
test_ds = (
    load_dataset(
        "/content/us-patent-phrase-to-phrase-matching/",
        data_files='test.csv',
        streaming=False,
    )
    .get('train')
)
test_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context'],
    num_rows: 36
})

In [6]:
# Show the dataset summary (feature names and row count) as a quick sanity check.
train_val_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score'],
    num_rows: 36473
})

## First cut

### Load pre-trained model

In [7]:
# Checkpoint name on the Hugging Face Hub; reused later when the model itself is loaded.
model_nm = 'microsoft/deberta-v3-small'

from transformers import AutoModelForSequenceClassification,AutoTokenizer
# Load the tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_nm)

# Display the tokenizer's configuration (vocabulary size, special tokens, ...).
tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-small', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### Pre-process & tokenize datasets

In [8]:
import multiprocess
from multiprocess import set_start_method

# Ask `multiprocess` to start workers with the "spawn" method. The start method
# can only be chosen once per process, so a repeat run of this cell raises
# RuntimeError; that case is reported and otherwise ignored.
try:
  set_start_method("spawn")
except RuntimeError:
  print("Context has likely been set already - passing")

#### Train/Val dataset pre-processing

In [9]:
def process_ds(row: dict) -> dict:
  """Build the single text field that is fed to the tokenizer.

  Args:
    row: mapping with string values under 'context', 'target' and 'anchor'.

  Returns:
    A dict with one key, 'input', holding the three fields joined as
    'TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>'.
  """
  text = 'TEXT1: ' + row['context']
  text += '; TEXT2: ' + row['target']
  text += '; ANC1: ' + row['anchor']
  return {'input': text}

# 1) add the combined 'input' text to every row,
# 2) tokenize that text in batches,
# 3) rename 'score' to 'labels' so it is the column handed to training as the target.
train_val_ds = (
    train_val_ds
    .map(process_ds, batched=False)
    .map(lambda batch: tokenizer(batch['input']), batched=True)
    .rename_column('score', 'labels')
)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_val_dsd = train_val_ds.train_test_split(test_size=0.2, seed=44)

train_val_dsd

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 29178
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7295
    })
})

In [10]:
# Sanity-check a couple of known vocabulary ids.
# The leading '▁' looks like an underscore but it is not: it is the character U+2581.
expected_token_ids = (('▁object', 2713), ('▁TEXT', 54453))
for token, token_id in expected_token_ids:
  assert tokenizer.vocab[token] == token_id

#### Test dataset pre-processing

In [24]:
# Apply the same text construction and tokenization to the test rows.
# There is no 'score' column here, so nothing is renamed.
test_ds = (
    test_ds
    .map(process_ds, batched=False)
    .map(lambda batch: tokenizer(batch['input']), batched=True)
)

test_ds

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36
})

### Train model

In [10]:
# Inspect the dataset's output format settings (format type and exposed columns).
train_val_ds.format

{'type': None,
 'format_kwargs': {},
 'columns': ['id',
  'anchor',
  'target',
  'context',
  'score',
  'input',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'output_all_columns': False}

In [18]:
# Hyperparameters: batch size, number of epochs, peak learning rate.
bs, epochs, lr = 128, 4, 8e-5

# Training configuration: checkpoints go to ./outputs, cosine learning-rate schedule
# with 10% warmup, mixed-precision (fp16) training, evaluation after every epoch,
# and no external experiment tracker.
args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    report_to='none',
)

# num_labels=1 requests a single output value per example.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

import numpy as np


def corr(x, y):
  """Return the Pearson correlation coefficient between `x` and `y`.

  Both inputs are flattened first, so a column-shaped prediction array
  (N, 1) can be compared against a flat label array (N,). Without the
  flattening `np.corrcoef` cannot stack the two inputs and raises.

  Args:
    x: array-like of N values (any shape holding N elements).
    y: array-like of N values (any shape holding N elements).

  Returns:
    The off-diagonal entry of the 2x2 correlation matrix of `x` and `y`.
  """
  return np.corrcoef(np.ravel(x), np.ravel(y))[0][1]


def corr_d(eval_pred):
  """Metric callback: unpack `(predictions, labels)` and report their Pearson correlation.

  Args:
    eval_pred: an iterable that unpacks into exactly (predictions, labels).

  Returns:
    dict with the single key 'pearson'.
  """
  return {'pearson': corr(*eval_pred)}

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Record which version of `accelerate` is installed in this environment.
import accelerate
accelerate.__version__

'0.30.1'

In [19]:
# Wire the model, training configuration, train/validation splits and the Pearson
# metric callback into one Trainer. The tokenizer is passed along as well.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_val_dsd['train'],
    eval_dataset=train_val_dsd['test'],
    tokenizer=tokenizer,
    compute_metrics=corr_d,
)

In [20]:
# Fine-tune the model; validation loss and Pearson correlation are reported after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.029849,0.801257
2,No log,0.027975,0.824031
3,0.030600,0.021373,0.83792
4,0.030600,0.021985,0.838702


TrainOutput(global_step=912, training_loss=0.022869040045821874, metrics={'train_runtime': 229.5941, 'train_samples_per_second': 508.341, 'train_steps_per_second': 3.972, 'total_flos': 773561790761580.0, 'train_loss': 0.022869040045821874, 'epoch': 4.0})

### Predict on test dataset

In [45]:
# Run inference on the tokenized test set and clamp the raw model outputs into [0, 1].
predictions = np.clip(trainer.predict(test_ds).predictions, 0.0, 1.0)

# Attach one (flattened) prediction per test row, then keep only the id and
# prediction columns.
preds_ds = (
    test_ds
    .add_column('predictions', predictions.flatten())
    .select_columns(['id', 'predictions'])
)

preds_ds

Dataset({
    features: ['id', 'predictions'],
    num_rows: 36
})

## Iterate on first model

The first model above was pretty good: a Pearson correlation (`r`) of 0.8387 on the validation split after 4 epochs of training.

In the following sections, I'll try different techniques to improve on this performance.

[Reference: Fast.AI - Iterate like a grandmaster](https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster/#Improving-the-model)

### Iterate idea 1 - Try a model trained on patents data - `BERT for patents`

[Model on Huggingface Model Hub](https://huggingface.co/anferico/bert-for-patents)

In [2]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

# Second candidate checkpoint (see the model link above) and its matching tokenizer.
model2_nm = 'anferico/bert-for-patents'
tokenizer2 = AutoTokenizer.from_pretrained(model2_nm)

# Display the tokenizer's configuration (vocabulary size, special tokens, ...).
tokenizer2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/329k [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='anferico/bert-for-patents', vocab_size=39859, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

#### Pre-process train/val/test datasets

In [3]:
# Reload the raw training CSV so the rows can be re-tokenized from scratch with
# the second model's tokenizer (same shuffle seed as before).
train_val_ds = (
    load_dataset(
        "/content/us-patent-phrase-to-phrase-matching/",
        data_files='train.csv',
        streaming=False,
    )
    .get('train')
    .shuffle(seed=44)
)

# Same pipeline as for the first model, but tokenizing with `tokenizer2`:
# build the 'input' text, tokenize in batches, rename 'score' to 'labels'.
train_val_ds = (
    train_val_ds
    .map(process_ds, batched=False)
    .map(lambda batch: tokenizer2(batch['input']), batched=True)
    .rename_column('score', 'labels')
)

# Same 80/20 split and seed as the first experiment.
train_val_dsd = train_val_ds.train_test_split(test_size=0.2, seed=44)

train_val_dsd

NameError: name 'load_dataset' is not defined

#### Define model args



In [None]:
# Same hyperparameters as the first experiment: batch size, epochs, peak learning rate.
bs, epochs, lr = 128, 4, 8e-5

# Training configuration, identical to the first run: cosine schedule with 10%
# warmup, fp16, evaluation after every epoch, no external experiment tracker.
args2 = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    report_to='none',
)

# num_labels=1 requests a single output value per example.
model2 = AutoModelForSequenceClassification.from_pretrained(model2_nm, num_labels=1)

# Trainer for the second model, using the splits tokenized with `tokenizer2`.
trainer2 = Trainer(
    model=model2,
    args=args2,
    train_dataset=train_val_dsd['train'],
    eval_dataset=train_val_dsd['test'],
    tokenizer=tokenizer2,
    compute_metrics=corr_d,
)

#### Train model

In [None]:
# Fine-tune the second model with the configuration defined in the previous cell.
trainer2.train()