# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)


### Please refer to [Inference Notebook](https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference/edit/run/91272728) as well.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set the WANDB_DISABLED environment variable for this process.
os.environ.update(WANDB_DISABLED='true')

# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # --- Paths -------------------------------------------------------------
    # Directory prefix for the competition files; used to locate train.csv.
    input_path: str = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Directory the tokenizer and model weights are loaded from.
    model_path: str = '/kaggle/input/deberta-5-folds/deberta_2/'

    # --- Hyper-parameters ----------------------------------------------------
    # NOTE(review): none of the values below are read by the cells visible in
    # this notebook; presumably carried over from the training notebook.
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    num_fold: int = 5
    epochs: int = 7
    batch_size: int = 10

# Preprocessing

In [None]:
# NOTE(review): `random_state` is not used by any visible cell and comes from a
# private pandas module; it looks like an accidental auto-import. Confirm
# nothing else relies on it before removing.
from pandas.core.common import random_state

# CPC code table; its `code` column is the join key for `context` below.
titles = pd.read_csv('/kaggle/input/cpc-codes/titles.csv')

# Attach the CPC columns to every training row. The join is an inner join
# (pandas' default), so rows whose context has no matching code are dropped.
train_df = pd.read_csv(CFG.input_path + 'train.csv').merge(
    titles,
    how='inner',
    left_on='context',
    right_on='code',
)

In [None]:
# First text segment fed to the tokenizer: the CPC title and the anchor phrase,
# joined by a literal '[SEP]' marker.
context_title = train_df['title']
anchor_phrase = train_df['anchor']
train_df['input'] = context_title + '[SEP]' + anchor_phrase

# Dataset

In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint directory. `output_hidden_states=True` makes the forward pass
# return every layer's hidden states, which the embedding loop reads.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model_path,
    num_labels=1,
    output_hidden_states=True,
)

In [None]:
# Plain Python lists of strings for the tokenizer: the composed input text and
# the target phrase, in the same row order as `train_df`.
inputs = train_df['input'].values.astype(str).tolist()
targets = train_df['target'].values.astype(str).tolist()

In [None]:
# Accumulator for the per-batch embedding arrays appended by the batching loop.
embeddings = []

In [None]:
from tqdm import tqdm

# Number of (input, target) pairs encoded per forward pass.
EMBED_BATCH_SIZE = 32

# Start from an empty list so that re-running this cell does not append a
# second copy of every batch (or fail if `embeddings` is no longer a list).
embeddings = []

for start in tqdm(range(0, train_df.shape[0], EMBED_BATCH_SIZE)):
    stop = start + EMBED_BATCH_SIZE

    # Encode each input/target as a sentence pair; padding=True pads every
    # pair to the longest one in this batch.
    encoded = tokenizer(
        inputs[start:stop],
        targets[start:stop],
        padding=True,
        return_tensors='pt',
    )
    attention_mask = encoded['attention_mask']

    # Pass the attention mask so padding positions are not attended to.
    with torch.no_grad():
        out = model(input_ids=encoded['input_ids'], attention_mask=attention_mask)

    # Concatenate the last four hidden layers along the feature axis.
    last_four_layers = [out.hidden_states[layer] for layer in (-1, -2, -3, -4)]
    cat_hidden_states = torch.cat(last_four_layers, dim=-1)

    # Mean-pool over the token axis (dim 1), counting real tokens only, so a
    # row's embedding does not depend on how much padding its batch needed.
    token_mask = attention_mask.unsqueeze(-1).to(cat_hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    cat_sentence_embedding = (cat_hidden_states * token_mask).sum(dim=1) / token_counts

    # No squeeze(): the result stays 2-D even for a final batch of one row,
    # which is what the later row-wise concatenation needs.
    embeddings.append(cat_sentence_embedding.numpy())

In [None]:
# Join the per-batch arrays along the first axis (np.concatenate's default)
# into one array, rebinding `embeddings` from a list to that array.
embeddings = np.concatenate(embeddings)

In [None]:
# Write the embeddings as a bare CSV with no header row and no index column.
# `header` and `index` are documented as booleans, so pass False explicitly
# rather than relying on None being treated as falsy.
pd.DataFrame(embeddings).to_csv(
    "deberta_2_embeddings.csv",
    header=False,
    index=False,
)

In [None]:
# IPython line magic: change the working directory to /kaggle/working.
%cd /kaggle/working

In [None]:
from IPython.display import FileLink

# Last expression of the cell, so the notebook renders it as a link to the
# embeddings CSV (path is relative to the current working directory).
FileLink('deberta_2_embeddings.csv')

In [None]:
import time

# Block forever until the cell is interrupted. NOTE(review): presumably this
# keeps the session running so the file link above stays usable; confirm.
# Sleeping inside the loop blocks just as the bare `pass` loop did, but
# without pegging a CPU core.
while True:
    time.sleep(60)