<a href="https://www.kaggle.com/code/ikram98ai/us-patent-phrase-to-phrase-matching?scriptVersionId=189685936" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Import and EDA

In [None]:
# %pip install -U datasets==2.17.0
# %pip install transformers==4.27.2 

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments,Trainer

In [20]:
# Load the competition training set: each row has an id, an anchor phrase, a target phrase,
# a context code and the similarity score we want to predict.
df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [21]:
# Summarise the string columns: row count, number of distinct values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [22]:
# Summary statistics (count, mean, spread, quartiles) of the target column.
df.score.describe()

count    36473.000000
mean         0.362062
std          0.258335
min          0.000000
25%          0.250000
50%          0.250000
75%          0.500000
max          1.000000
Name: score, dtype: float64

In [23]:
# Number of rows carrying each distinct score value.
df.score.value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [24]:
# Share of rows per score value (fractions of the whole dataset), rounded to three decimals.
df.score.value_counts(normalize=True).round(3)

0.50    0.337
0.25    0.316
0.00    0.205
0.75    0.110
1.00    0.032
Name: score, dtype: float64

In [25]:
# Build the single text string the model will read by joining context, target and anchor,
# each introduced by a fixed field prefix. The test set is formatted with the same template later on.
df['input'] = 'TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; ANC1: ' + df.anchor

In [26]:
# Longest and shortest model input, measured in characters.
print(max(map(len, df.input)))
print(min(map(len, df.input)))

133
33


In [27]:
# Peek at the first few formatted inputs to check the template looks as intended.
df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

## Tokenization

In [28]:
# Wrap the DataFrame in a Hugging Face `Dataset`, the container the tokenization and training steps below work on.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

A deep learning model expects numbers as inputs, not English sentences! So we need to do two things:

- *Tokenization*: Split each text up into words (or actually, as we'll see, into *tokens*)
- *Numericalization*: Convert each word (or token) into a number.

In [29]:
# Name of the pretrained checkpoint; the same name is used to load both the tokenizer and the model.
model_nm = 'microsoft/deberta-v3-small'

In [30]:
# `AutoTokenizer` creates the tokenizer that matches the given pretrained model,
# so the text is split exactly the way the model saw it during pretraining:
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Uncommon words are split into several sub-word pieces. A piece that starts a new word is prefixed with `▁`:
print(tokz.tokenize("A platypus is an ornithorhynchus anatinus."))

['▁A', '▁platypus', '▁is', '▁an', '▁or', 'ni', 'tho', 'rhynch', 'us', '▁an', 'at', 'inus', '.']


In [32]:
# Here's a simple function which tokenizes our inputs:
def tok_func(x):
    """Tokenize the `input` field of `x` with the notebook's tokenizer and return its output."""
    text = x["input"]
    return tokz(text)

In [33]:
# Apply `tok_func` to the whole dataset with `map`. `batched=True` hands rows to the function
# in batches instead of one at a time, which is what makes this fast:
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [34]:
# The input text and the token IDs produced for the first row of our data:
row = tok_ds[0]
print(row['input'])
print(row['input_ids'])

TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement
[1, 54453, 435, 294, 336, 5753, 346, 54453, 445, 294, 47284, 265, 6435, 346, 23702, 435, 294, 47284, 2]


In [35]:
# The tokenizer's vocab maps every token string to a unique integer; here is the ID of the token `▁of`:
tokz.vocab['▁of']

265

In [36]:
# Transformers always assumes that the target column is named `labels`; ours is currently called `score`, so rename it.
tok_ds = tok_ds.rename_columns({'score':'labels'})

## Train/test split

In [37]:
# Hold out 25% of the rows for validation (the held-out part is stored under the key `test`);
# the fixed seed makes the split the same on every run.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

## Metrics and correlation

*Submissions are evaluated on the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) between the predicted and actual similarity scores*. This coefficient is usually abbreviated using the single letter *r*. It is the most widely used measure of the degree of relationship between two variables.


In [41]:
def corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Singleton axes are squeezed away first, so a column vector of shape (n, 1)
    (the shape the model returns for predictions when no labels are supplied)
    can be compared directly with a flat array of n labels. For inputs that are
    already 1-D the result is unchanged.
    """
    # Without the squeeze, np.corrcoef would treat an (n, 1) array as n variables with one
    # observation each and fail when stacking it with a flat (n,) array.
    x = np.squeeze(np.asarray(x))
    y = np.squeeze(np.asarray(y))
    return np.corrcoef(x, y)[0][1]

In [42]:
def show_corr(df, a, b):
    """Scatter-plot column `a` against column `b` of `df`, with their correlation `r` in the title."""
    xs = df[a]
    ys = df[b]
    plt.scatter(xs, ys, alpha=0.5, s=4)
    r = corr(xs, ys)
    plt.title(f'{a} vs {b}; r: {r:.2f}')

In [43]:
# Transformers expects metrics to be returned as a `dict`, so the trainer knows what label to use.
def corr_d(eval_pred):
    """Metric callback: unpack `eval_pred` into `corr` and return the result under the key 'pearson'."""
    pearson = corr(*eval_pred)
    return {'pearson': pearson}

## Training our model

In [47]:
# We pick a batch size that fits our GPU, and a small number of epochs so we can run experiments quickly
bs = 128      # per-device training batch size (evaluation uses twice this)
epochs = 4    # number of passes over the training split
lr = 8e-5     # peak learning rate handed to the trainer

In [48]:
# Transformers gathers every training setting in a `TrainingArguments` object.
args = TrainingArguments(
    'outputs',  # output directory; checkpoints are saved under it
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [49]:
# Load the pretrained checkpoint with a head that emits one output value per example.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
# `Trainer` ties together the model, the training settings, both data splits and the metric.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [50]:
# Fine-tune the model; the trailing `;` keeps the value returned by `train()` out of the cell output.
trainer.train();

The following columns in the training set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context, id, input, target, anchor.
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 856


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.02618,0.796731
2,No log,0.02206,0.822574
3,0.033400,0.022044,0.833279
4,0.033400,0.022786,0.833725


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context, id, input, target, anchor.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context, id, input, target, anchor.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
The following columns in the evaluation set  don't have a correspo

The key thing to look at is the "Pearson" value in the table above. As you can see, it increases with every epoch and is already above 0.8. That's great news!

## Model Evaluation

In [None]:
# Ground-truth scores of the held-out split, plus a copy of that split without the `labels`
# column to run predictions on.
actual = np.array(dds['test']['labels'])
eval_ds = dds['test'].remove_columns('labels')

In [None]:
# Predict similarity scores for the held-out split with the fine-tuned `trainer`
# (this notebook never creates a `peft_trainer`, so referring to one raises NameError).
probs = trainer.predict(eval_ds).predictions.astype(float)

# The model has a single output (`num_labels=1`), so every prediction already is a score:
# there is nothing to argmax or one-hot encode. Flatten the (n, 1) column to 1-D so it
# lines up element-for-element with `actual`.
preds = probs.reshape(-1)

In [None]:
# Pearson correlation between the held-out scores and the predictions, as a {'pearson': value} dict.
print(corr_d([actual,preds]))

## Test set

In [51]:
# Load the competition test set, format it with the same input template as the training data,
# and tokenize it with the same function.
test_df = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
test_df['input'] = 'TEXT1: ' + test_df.context + '; TEXT2: ' + test_df.target + '; ANC1: ' + test_df.anchor
test_ds = Dataset.from_pandas(test_df).map(tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [52]:
# Predict a score for every test row; the result has one row per example and a single column.
preds = trainer.predict(test_ds).predictions.astype(float)
preds

The following columns in the test set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: context, id, target, input, anchor.
***** Running Prediction *****
  Num examples = 36
  Batch size = 256


array([[ 0.56103516],
       [ 0.67480469],
       [ 0.53857422],
       [ 0.32128906],
       [-0.00606918],
       [ 0.54150391],
       [ 0.5078125 ],
       [ 0.05545044],
       [ 0.26025391],
       [ 1.12011719],
       [ 0.26000977],
       [ 0.23596191],
       [ 0.72509766],
       [ 0.87548828],
       [ 0.71533203],
       [ 0.50292969],
       [ 0.34008789],
       [-0.02651978],
       [ 0.64404297],
       [ 0.37426758],
       [ 0.48803711],
       [ 0.26855469],
       [ 0.08905029],
       [ 0.28857422],
       [ 0.58789062],
       [-0.02453613],
       [-0.03347778],
       [-0.0295105 ],
       [-0.03665161],
       [ 0.51416016],
       [ 0.34008789],
       [ 0.02220154],
       [ 0.75      ],
       [ 0.50341797],
       [ 0.44604492],
       [ 0.22900391]])

In [53]:
# Some of our predictions are below 0 or above 1, but valid scores lie in [0, 1]; clip the out-of-bounds values:
preds = np.clip(preds, 0, 1)

In [58]:
# Display the clipped predictions rounded to two decimals; `preds` itself is left unchanged.
preds.round(2)

array([[0.56],
       [0.67],
       [0.54],
       [0.32],
       [0.  ],
       [0.54],
       [0.51],
       [0.06],
       [0.26],
       [1.  ],
       [0.26],
       [0.24],
       [0.73],
       [0.88],
       [0.72],
       [0.5 ],
       [0.34],
       [0.  ],
       [0.64],
       [0.37],
       [0.49],
       [0.27],
       [0.09],
       [0.29],
       [0.59],
       [0.  ],
       [0.  ],
       [0.  ],
       [0.  ],
       [0.51],
       [0.34],
       [0.02],
       [0.75],
       [0.5 ],
       [0.45],
       [0.23]])

In [59]:
# Write the submission file: one row per test id with its predicted score.
# `preds` is a column vector of shape (n, 1). Passed as-is, `score` would be stored as a
# one-element list per row and written to the CSV with brackets (e.g. "[0.56103516]").
# Flattening it first gives a plain float column.
submission = Dataset.from_dict({
    'id': test_ds['id'],
    'score': np.ravel(preds).tolist()
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1026