In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cpc-codes/titles.csv
/kaggle/input/debertav3small/spm.model
/kaggle/input/debertav3small/config.json
/kaggle/input/debertav3small/README.md
/kaggle/input/debertav3small/tf_model.h5
/kaggle/input/debertav3small/tokenizer_config.json
/kaggle/input/debertav3small/pytorch_model.bin
/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


# Credits:
this notebook is based on the notbook ["Getting started with NLP for absolute beginners"](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners) by @Jeremy Howard

## Imports

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import seaborn as sns
import matplotlib.pyplot as plt

# EDA

In [3]:
train = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
print(train.shape)
train.head()

(36473, 5)


Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [4]:
train.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [5]:
train.groupby('score').count()

Unnamed: 0_level_0,id,anchor,target,context
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,7471,7471,7471,7471
0.25,11519,11519,11519,11519
0.5,12300,12300,12300,12300
0.75,4029,4029,4029,4029
1.0,1154,1154,1154,1154


## Score meanings according to [Data Description](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/data?select=train.csv)

The scores are in the 0-1 range with increments of 0.25 with the following meanings:

    1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
    0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
    0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
    0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
    0.0 - Unrelated.


# Test Corpus

In [6]:
test = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv")

In [7]:
test.shape

(36, 4)

In [8]:
def cosine(u, v):
    """
    cosine similarity definition
    """
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# External Context

In [9]:
# ajouter le contexte https://www.cooperativepatentclassification.org/Archive
# extraire le contexte du fichier xml

In [10]:
titles = pd.read_csv("../input/cpc-codes/titles.csv")
print(titles.shape)
titles.head()

(260476, 7)


Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


# HuggingFace AutoModelForSequenceClassification

In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from scipy.stats import pearsonr
from scipy.stats import spearmanr # pour trouver les correspondances entre les résultats sur des échelles variées
from sklearn.model_selection import StratifiedGroupKFold

In [12]:
import transformers
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer

import torch
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [13]:
import warnings
import logging
from IPython.display import display, HTML

# Corpora merge

In [14]:
train['section'] = train["context"].str[0]
print(train.shape)
display(train.head())
titles=titles.rename(columns={"code": "context"})
display(titles.head())
train = train.merge(titles[["context", "title"]], on="context").rename(columns={"title": "context_title"})
train = train.merge(titles[["context", "title"]].rename(columns={"context": "section"}), on="section").rename(columns={"title": "section_title"})
display(train.head())

(36473, 6)


Unnamed: 0,id,anchor,target,context,score,section
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A


Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


Unnamed: 0,id,anchor,target,context,score,section,context_title,section_title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,HUMAN NECESSITIES
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,HUMAN NECESSITIES
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,HUMAN NECESSITIES
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,HUMAN NECESSITIES
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,HUMAN NECESSITIES


In [15]:
logging.disable(logging.WARNING)
warnings.simplefilter('ignore')

# Deberta-v3-small

In [16]:
#model_name = 'microsoft/deberta-v3-small' (online version)
model_name = '../input/debertav3small'

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
tokenizer.all_special_tokens

['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']

In [19]:
def clean(x):
    t = x.lower()
    t = t.replace("[",'')
    t = t.replace(";",'')
    t = t.replace(",",'')
    t = t.replace("]",'')
    t = t.replace(":",'')
    t = t.replace("(",'')
    t = t.replace(")",'')
    t = t.replace("{",'')
    t = t.replace("}",'')
    t = t.replace("/",' ')
    t = t.replace("-",' ')
    return t

In [20]:
train["inputs"] = train["section_title"].apply(clean) + " [SEP] " + train["anchor"] + " [SEP] " + train["target"]

In [21]:
def tok_func(x):
    return tokenizer(x["inputs"])

In [22]:
train_ds = Dataset.from_pandas(train).rename_column('score', 'label')

In [23]:
inps = "anchor","target","context","context_title","section_title"
tok_ds = train_ds.map(tok_func, batched=True, remove_columns=inps+('inputs','id','section'))

  0%|          | 0/37 [00:00<?, ?ba/s]

In [24]:
tok_ds

Dataset({
    features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [25]:
# bs - batch size
lr,bs = 8e-5,64
# wd: weight decay
wd,epochs = 0.01,4

In [26]:
anchors = train["anchor"].unique()
np.random.shuffle(anchors)

In [27]:
val_prop = 0.25
val_sz = int(len(anchors)*val_prop)
val_anchors = anchors[:val_sz]

In [28]:
is_val = np.isin(train.anchor, val_anchors)
idxs = np.arange(len(train))
val_idxs = idxs[ is_val]
trn_idxs = idxs[~is_val]
len(val_idxs),len(trn_idxs)

(8963, 27510)

In [29]:
dds = DatasetDict({"train":tok_ds.select(trn_idxs),
             "test": tok_ds.select(val_idxs)})

In [30]:
def corr(eval_pred): return {'pearson': np.corrcoef(*eval_pred)[0][1]}

In [31]:
#args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    #evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
     #num_train_epochs=epochs, weight_decay=wd, report_to='none')
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine',
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=wd, report_to='none')

In [32]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
               tokenizer=tokenizer, compute_metrics=corr)

In [33]:
dds['train'][0]

{'label': 0.5,
 '__index_level_0__': 0,
 'input_ids': [1, 857, 19629, 2, 47284, 2, 47284, 265, 6435, 2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.033712,0.779204
2,0.044800,0.025779,0.800698
3,0.019600,0.024763,0.809451
4,0.012500,0.024667,0.811183


TrainOutput(global_step=1720, training_loss=0.023718745763911756, metrics={'train_runtime': 278.8184, 'train_samples_per_second': 394.665, 'train_steps_per_second': 6.169, 'total_flos': 479727315718380.0, 'train_loss': 0.023718745763911756, 'epoch': 4.0})

In [35]:
trainer.save_model()

# Test corpus preprocessing

In [36]:
test.head()

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


In [37]:
test['section'] = test["context"].str[0]
print(test.shape)
display(titles.head())
test = test.merge(titles[["context", "title"]], on="context").rename(columns={"title": "context_title"})
test = test.merge(titles[["context", "title"]].rename(columns={"context": "section"}), on="section").rename(columns={"title": "section_title"})
display(test.head())

(36, 5)


Unnamed: 0,context,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


Unnamed: 0,id,anchor,target,context,section,context_title,section_title
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G,OPTICS,PHYSICS
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G,OPTICS,PHYSICS
2,7aa5908a77a7ec24,el display,illumination,G02,G,OPTICS,PHYSICS
3,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,G,INFORMATION STORAGE,PHYSICS
4,12ca31f018a2e2b9,speed control means,control loop,G05,G,CONTROLLING; REGULATING,PHYSICS


In [38]:
test["inputs"] = test["section_title"].apply(clean) + " [SEP] " + test["anchor"] + " [SEP] " + test["target"]

In [39]:
test_ds = Dataset.from_pandas(test)

In [40]:
test_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'section', 'context_title', 'section_title', 'inputs', '__index_level_0__'],
    num_rows: 36
})

In [41]:
inps = "anchor","target","context","context_title","section_title"
test_tok_ds = test_ds.map(tok_func, batched=True, remove_columns=inps+('inputs','id','section'))

  0%|          | 0/1 [00:00<?, ?ba/s]

In [42]:
preds = trainer.predict(test_tok_ds).predictions.astype(float)
preds

array([[ 0.53237623],
       [ 0.51400381],
       [ 0.49207991],
       [ 0.60019964],
       [-0.00188122],
       [ 0.71541005],
       [ 0.59056664],
       [ 0.67587668],
       [ 1.04140818],
       [ 0.24156526],
       [ 0.22715119],
       [ 0.55409724],
       [-0.01919177],
       [ 0.48592019],
       [ 0.58153784],
       [-0.02628734],
       [ 0.23908675],
       [ 0.8091166 ],
       [ 0.78909087],
       [-0.02151323],
       [ 0.27933672],
       [ 0.27358788],
       [ 0.28280753],
       [ 0.30986708],
       [ 0.00248956],
       [ 0.34835896],
       [ 0.37278619],
       [ 0.48865965],
       [ 0.22111028],
       [ 0.51936871],
       [ 0.43639582],
       [ 0.23699901],
       [-0.02548514],
       [ 0.26238585],
       [-0.01883685],
       [ 0.04000514]])

In [43]:
submission = datasets.Dataset.from_dict({
    'id': test_ds['id'],
    'score': preds
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1094