In [6]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer


In [15]:
MAX_LEN = 128
BATCH_SIZE = 32
LR = 5e-5

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,furniture; domestic articles or appliances; co...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,furniture; domestic articles or appliances; co...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,furniture; domestic articles or appliances; co...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,furniture; domestic articles or appliances; co...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,furniture; domestic articles or appliances; co...


In [4]:
df.shape

(36473, 6)

In [12]:
def get_transformer_input(row, text):
    return f'{row.anchor} [SEP] {row.target} [SEP] {row.context_text if text else row.context}'

In [13]:
df.apply(get_transformer_input, text=False, axis=1)

0        abatement [SEP] abatement of pollution [SEP] A47
1                abatement [SEP] act of abating [SEP] A47
2               abatement [SEP] active catalyst [SEP] A47
3           abatement [SEP] eliminating process [SEP] A47
4                 abatement [SEP] forest region [SEP] A47
                               ...                       
36468         wood article [SEP] wooden article [SEP] B44
36469             wood article [SEP] wooden box [SEP] B44
36470          wood article [SEP] wooden handle [SEP] B44
36471        wood article [SEP] wooden material [SEP] B44
36472       wood article [SEP] wooden substrate [SEP] B44
Length: 36473, dtype: object

In [14]:
df.apply(get_transformer_input, text=True, axis=1)

0        abatement [SEP] abatement of pollution [SEP] f...
1        abatement [SEP] act of abating [SEP] furniture...
2        abatement [SEP] active catalyst [SEP] furnitur...
3        abatement [SEP] eliminating process [SEP] furn...
4        abatement [SEP] forest region [SEP] furniture;...
                               ...                        
36468    wood article [SEP] wooden article [SEP] decora...
36469    wood article [SEP] wooden box [SEP] decorative...
36470    wood article [SEP] wooden handle [SEP] decorat...
36471    wood article [SEP] wooden material [SEP] decor...
36472    wood article [SEP] wooden substrate [SEP] deco...
Length: 36473, dtype: object

In [36]:
class PatentDataset(Dataset):
    def __init__(self, texts, scores, tokenizer, max_len=128):
        self.texts = texts
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)  
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        score = self.scores[idx] if self.scores is not None else None
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding="max_length",
            return_token_type_ids=False,
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors="pt",
            truncation="only_first"
        )
        
        item = {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
        }

        if score:
            item["scores"] = torch.tensor(score, dtype=torch.long)

        return item

In [37]:
def create_data_loader(df, tokenizer, text=False):
    ds = PatentDataset(
        texts=df.apply(get_transformer_input, text=True, axis=1).to_numpy(),
        scores=df.score.to_numpy() if "score" in df.columns else None,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )
    return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)

In [19]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
train_dl = create_data_loader(df, tokenizer)

In [39]:
for b in train_dl:
    print(b)
    break

{'text': ['compression loss [SEP] loss of smell [SEP] machines or engines in general; engine plants in general; steam engines', 'connect to common conductor [SEP] self adjusting load responsive brake [SEP] hoisting; lifting; hauling', 'normal base [SEP] base system [SEP] printing; lining machines; typewriters; stamps', 'oxygen value [SEP] gas level [SEP] combustion engines; hot-gas or combustion-product engine plants', 'speed control means [SEP] network control [SEP] controlling; regulating', 'walnut oil [SEP] black walnut oil [SEP] medical or veterinary science; hygiene', 'tunneling capacitor [SEP] drain region [SEP] information storage', 'single pumping chamber [SEP] single pumping chamber outlet port [SEP] positive - displacement machines for liquids; pumps for liquids or elastic fluids', 'alumino silicates [SEP] oxides which zeolites [SEP] photography; cinematography; analogous techniques using waves other than optical waves; electrography; holography', 'filled interior [SEP] cavit

  item["scores"] = torch.tensor(score, dtype=torch.long)
