# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)


### Please refer to [Inference Notebook](https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference/edit/run/91272728) as well.

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set the WANDB_DISABLED environment variable for this process; this is
# intended to keep the Trainer runs below from reporting to Weights & Biases.
os.environ['WANDB_DISABLED'] = 'true'

# Config

In [2]:
class CFG:
    """Notebook-wide settings: data/model locations and training constants."""

    # --- locations -------------------------------------------------------
    # Directory holding the competition files (train.csv is read from here).
    input_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Directory of the pretrained checkpoint given to the tokenizer and model.
    model_path = '/kaggle/input/deberta-v3-large/deberta-v3-large/'

    # --- cross-validation ------------------------------------------------
    # Number of folds created by create_folds and iterated by the train loop.
    num_fold = 5

    # --- optimisation ----------------------------------------------------
    epochs = 7
    batch_size = 10
    learning_rate = 2e-5
    weight_decay = 0.01

# Preprocessing

In [3]:
from pandas.core.common import random_state
# Read the training pairs and the CPC code titles, then attach a title to
# every training row by matching its `context` code against the titles'
# `code` column (pandas' default join type is used).
train_df = pd.read_csv(f'{CFG.input_path}train.csv')
titles = pd.read_csv('/kaggle/input/cpc-codes/titles.csv')
train_df = pd.merge(
    train_df,
    titles,
    left_on='context',
    right_on='code',
)

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Assign a stratified cross-validation fold to every row of ``data``.

    The continuous ``score`` column is cut into ``num_bins`` equal-width
    bins, and those bin ids (not the raw scores) are used as the
    stratification target for ``StratifiedKFold``.

    Args:
        data: DataFrame containing a ``score`` column. It is not modified;
            a shuffled copy is returned instead.
        num_splits: Number of folds to create.
        num_bins: Number of equal-width bins used to discretise ``score``
            (defaults to the original hard-coded value of 5).
        seed: Random state used both for shuffling the rows and for
            ``StratifiedKFold`` (defaults to the original value of 42).

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        integer ``fold`` column holding the validation fold of each row.
    """
    # Shuffle first: `sample` returns a new frame, so every later assignment
    # touches the copy only and the caller's DataFrame stays untouched.
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    data['fold'] = -1

    # Discretise the target. The bins live in a local Series, so no temporary
    # column has to be added to (and later dropped from) the frame.
    score_bins = pd.cut(data['score'], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # The split yields positional indices; they are valid `.loc` labels here
    # because the index was reset to 0..n-1 above.
    for f, (_, valid_idx) in enumerate(kf.split(X=data, y=score_bins.values)):
        data.loc[valid_idx, 'fold'] = f

    return data

In [4]:
# First text segment fed to the tokenizer: the CPC title followed by the
# anchor phrase, separated by a single space.
title_and_anchor = train_df['title'] + ' ' + train_df['anchor']
train_df['input'] = title_and_anchor

# Attach the cross-validation fold id of every row.
train_df = create_folds(train_df, CFG.num_fold)

# Tokenizer

In [5]:
# Load the tokenizer stored alongside the pretrained model at CFG.model_path.
# It is used as a module-level global by TrainDataset.__getitem__ and is also
# handed to every Trainer below.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

# Dataset

In [6]:
class TrainDataset(Dataset):
    """Dataset of (input, target) text pairs with their similarity score.

    Rows are tokenized one at a time, on lookup, with the module-level
    ``tokenizer``; nothing is pre-tokenized or cached.
    """

    def __init__(self, df):
        """Copy the `input`, `target` and `score` columns of ``df`` into arrays."""
        self.label = df['score'].values
        self.targets = df['target'].values.astype(str)
        self.inputs = df['input'].values.astype(str)

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenizer output for example ``item`` plus its label.

        The two texts are passed to the tokenizer as a pair, and the score
        is added under the ``'label'`` key as a float32 value.
        """
        first_text = self.inputs[item]
        second_text = self.targets[item]
        score = self.label[item]

        example = dict(tokenizer(first_text, second_text))
        example['label'] = score.astype(np.float32)
        return example

# Train

In [7]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            reshaped to one value per row before the correlation is taken.

    Returns:
        A dict with the single key ``'pearson'``.
    """
    raw_preds, gold = eval_pred
    # One value per example; this reshape fails if a row holds more than one.
    flat_preds = raw_preds.reshape(len(raw_preds))
    corr_matrix = np.corrcoef(flat_preds, gold)
    # The off-diagonal entry is the correlation between the two series.
    pearson = corr_matrix[0][1]
    return {'pearson': pearson}

In [8]:
def model_init():
    """Return a freshly loaded model from CFG.model_path with one output label.

    Passed to the Trainer as ``model_init`` for the hyperparameter search.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_path,
        num_labels=1,
    )
    return model

In [9]:
from ray import tune

def get_trial_params(trial):
    """Return the Ray Tune search space for the hyperparameter search.

    Args:
        trial: Accepted for the ``hp_space`` callback signature; not used.

    Returns:
        A dict mapping hyperparameter names to Ray Tune sampling
        definitions.
    """
    # `num_train_epochs` and the batch size each have a single candidate, so
    # only the learning rate and the seed actually vary between trials.
    return dict(
        learning_rate=tune.loguniform(1e-6, 1e-4),
        num_train_epochs=tune.choice(list(range(1, 2))),
        seed=tune.uniform(1, 40),
        per_device_train_batch_size=tune.choice([10, ]),
    )

In [10]:
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler


# Local import: helper that finds the highest-numbered `checkpoint-*` folder.
from transformers.trainer_utils import get_last_checkpoint

# Out-of-fold predictions, accumulated fold by fold.
oof_df = pd.DataFrame()
for fold in range(CFG.num_fold):
    # Hold out the current fold for validation and train on all the others.
    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    # --- Stage 1: hyperparameter search -----------------------------------
    # The searched values (learning rate, epochs, seed, batch size) come from
    # get_trial_params; everything else is fixed here.
    args = TrainingArguments(
        output_dir='/tmp/deberta',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        weight_decay=CFG.weight_decay,
        metric_for_best_model='pearson',
        load_best_model_at_end=True,
    )

    # `model_init` (not a model instance) is passed to this search Trainer.
    trainer = Trainer(
        args=args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

    best_trial = trainer.hyperparameter_search(
        hp_space=get_trial_params,
        n_trials=3,
        direction='maximize',
        backend='ray',
        search_alg=HyperOptSearch(metric='objective', mode='max'),
        scheduler=ASHAScheduler(metric='objective', mode='max'),
    )

    # --- Stage 2: continue training from the best trial --------------------
    # NOTE(review): only `best_trial.run_id` is used below. Neither
    # `best_trial.hyperparameters` nor CFG.learning_rate / CFG.batch_size are
    # passed to these TrainingArguments - confirm that this is intended.
    args = TrainingArguments(
        output_dir='/tmp/deberta',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model='pearson',
        load_best_model_at_end=True,
    )

    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Locate the best trial's checkpoint instead of hard-coding its step
    # number: the step count in the folder name depends on the number of
    # training rows, the batch size and the number of devices.
    best_run_dir = f'/tmp/deberta/run-{best_trial.run_id}'
    best_checkpoint = get_last_checkpoint(best_run_dir)
    if best_checkpoint is None:
        raise FileNotFoundError(
            f'No checkpoint-* directory found in {best_run_dir}'
        )

    trainer.train(resume_from_checkpoint=best_checkpoint)
    # Drop all intermediate checkpoints; only the final model saved on the
    # next line (in the current working directory) is kept.
    shutil.rmtree('/tmp/deberta')
    trainer.save_model(f'deberta_{fold}')

    # Score the held-out fold and append it to the out-of-fold table.
    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions
    oof_df = pd.concat([oof_df, va_data])

In [13]:
# IPython magic: change the notebook's working directory to /kaggle/working.
%cd /kaggle/working

In [14]:
from IPython.display import FileLink

# Display a download link for a weights file inside `deberta_4`, the
# directory name produced by trainer.save_model(f'deberta_{fold}') for fold 4.
# NOTE(review): the fold index and the weights filename are hard-coded -
# confirm they match CFG.num_fold and the file actually written by save_model.
FileLink(f'deberta_4/pytorch_model.bin')