In [1]:
# --- Standard library ---
import codecs
import math
import os
import pickle

# Silence TensorFlow's C++ logging (3 = errors only). The variable has to be in
# the environment before TensorFlow is first imported to reliably take effect,
# so it is set here, ahead of the `tensorflow` / `transformers` imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# --- Third-party ---
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel

# List every file available under the Kaggle input directory so the dataset
# paths used later in the notebook can be checked against what is mounted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv
/kaggle/input/cpc-codes/titles.csv
/kaggle/input/bert-for-patent/bert-for-patents/config.json
/kaggle/input/bert-for-patent/bert-for-patents/README.md
/kaggle/input/bert-for-patent/bert-for-patents/pytorch_model.bin
/kaggle/input/bert-for-patent/bert-for-patents/.gitattributes
/kaggle/input/bert-for-patent/bert-for-patents/vocab.txt
/kaggle/input/bert-for-patent/bert-for-patents/.git/config
/kaggle/input/bert-for-patent/bert-for-patents/.git/packed-refs
/kaggle/input/bert-for-patent/bert-for-patents/.git/HEAD
/kaggle/input/bert-for-patent/bert-for-patents/.git/index
/kaggle/input/bert-for-patent/bert-for-patents/.git/description
/kaggle/input/bert-for-patent/bert-for-patents/.git/info/exclude
/kaggle/input/bert-for-patent/bert-for-patents/.git/refs/heads/main
/kaggle/input/bert-for-patent/bert-for-

# Configuration: model paths and hyperparameters

In [2]:
# Directory of the pretrained "BERT for Patents" checkpoint (config.json,
# vocab.txt and the PyTorch weights, per the input listing).
pt_model_dir = "/kaggle/input/bert-for-patent/bert-for-patents/"
# Fine-tuned Keras weights. Despite the `_dir` suffix this points at a single
# .h5 file, not a directory.
ft_model_dir = "/kaggle/input/uspppm-bertforpatent-keras-train/usppm_bfp_v2.h5"
# CSV containing the rows to score.
test_data_path = "/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv"

# Fixed token length that every encoded text pair is padded/truncated to; also
# the sequence dimension of the model's input layers.
max_seq_len = 80
# Learning rate handed to the Adam optimizer when the model is compiled.
learning_rate = 2e-5

# BERT Tokenizer 

In [3]:
# Build the BERT tokenizer from the vocabulary shipped with the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(pt_model_dir)

# Look up the vocabulary id of the padding token.
pad_token = tokenizer.pad_token
pad_idx = tokenizer.convert_tokens_to_ids(pad_token)

print(tokenizer)
print("Padding token index : ", pad_idx)

PreTrainedTokenizer(name_or_path='/kaggle/input/bert-for-patent/bert-for-patents/', vocab_size=39859, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Padding token index :  0


# Encode function

In [4]:
def encode_text(text_pairs, tokenizer, max_length):
    """Tokenize a batch of text pairs into fixed-length model inputs.

    Args:
        text_pairs: Batch of two-element items (first text, second text). It is
            forwarded unchanged to ``tokenizer.batch_encode_plus``, which
            encodes both texts of a pair together, separated by ``[SEP]``.
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a BERT tokenizer).
        max_length: Length every encoded pair is padded or truncated to.

    Returns:
        dict with the keys ``"input_ids"``, ``"attention_masks"`` and
        ``"token_type_ids"``; each value is an ``int32`` numpy array built from
        the corresponding tokenizer output.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    batch = tokenizer.batch_encode_plus(text_pairs, **encode_options)

    # (name used by this notebook, name used by the tokenizer). Note the
    # plural "attention_masks" on the notebook side.
    feature_keys = (
        ("input_ids", "input_ids"),
        ("attention_masks", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    # Convert each encoded feature to an int32 numpy array.
    return {
        out_key: np.array(batch[tok_key], dtype="int32")
        for out_key, tok_key in feature_keys
    }

# Test data load

In [5]:
# Read the rows to score (comma-separated, pandas' default) and preview the first five.
test_data = pd.read_csv(test_data_path)
print(test_data.head(5))

                 id              anchor                         target context
0  4112d61851461f60            opc drum  inorganic photoconductor drum     G02
1  09e418c93a776564     adjust gas flow              altering gas flow     F23
2  36baf228038e314b      lower trunnion                 lower locating     B60
3  1f37ead645e7f0c8       cap component                  upper portion     D06
4  71a5b6ad068d531f  neural stimulation      artificial neural network     H04


# load cpc title data

In [6]:
# CPC code -> title lookup table, used to turn each row's context code into text.
cpc_titles_path = "/kaggle/input/cpc-codes/titles.csv"
cpc_codes = pd.read_csv(cpc_titles_path)
print(cpc_codes)

                code                                              title  \
0                  A                                  HUMAN NECESSITIES   
1                A01  AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...   
2               A01B  SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...   
3           A01B1/00  Hand tools (edge trimmers for lawns A01G3/06  ...   
4           A01B1/02  Spades; Shovels {(hand-operated dredgers E02F3...   
...              ...                                                ...   
260471  Y10T483/1864                      including tool pot or adapter   
260472  Y10T483/1873                                    Indexing matrix   
260473  Y10T483/1882                                        Rotary disc   
260474  Y10T483/1891                                      Chain or belt   
260475    Y10T483/19                                      Miscellaneous   

       section  class subclass  group  main_group  
0            A    NaN      NaN    NaN         N

In [7]:
# Attach the CPC title for each row's context code, then build the first
# sentence of the model input as "<CPC title> <anchor>".

# Make the cell safe to re-run: drop whatever a previous run of this cell added,
# otherwise the second merge yields `title_x`/`title_y` and `title` disappears.
stale_columns = [*cpc_codes.columns, "text"]
test_data = test_data.drop(columns=stale_columns, errors="ignore").merge(
    cpc_codes,
    left_on='context',
    right_on='code',
    how='left',
    # A repeated CPC code would duplicate test rows and misalign the
    # predictions with the submission; fail here instead.
    validate="many_to_one",
)

# With a left join, a context missing from the CPC table gives a NaN title, and
# NaN + str stays NaN, which the tokenizer cannot encode. Fall back to the
# anchor alone for those rows and report how many were affected.
missing_title = test_data['title'].isna()
if missing_title.any():
    print("Rows without a CPC title : ", int(missing_title.sum()))
test_data['text'] = test_data['title'].fillna('') + ' ' + test_data['anchor']

In [8]:
# Display the merged frame to check the CPC columns and the new `text` column.
test_data

Unnamed: 0,id,anchor,target,context,code,title,section,class,subclass,group,main_group,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,2.0,,,,OPTICS opc drum
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,23.0,,,,COMBUSTION APPARATUS; COMBUSTION PROCESSES adj...
2,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,60.0,,,,VEHICLES IN GENERAL lower trunnion
3,1f37ead645e7f0c8,cap component,upper portion,D06,D06,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,D,6.0,,,,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,H04,ELECTRIC COMMUNICATION TECHNIQUE,H,4.0,,,,ELECTRIC COMMUNICATION TECHNIQUE neural stimul...
5,474c874d0c07bd21,dry corn,dry corn starch,C12,C12,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...,C,12.0,,,,BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MI...
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11,G11,INFORMATION STORAGE,G,11.0,,,,INFORMATION STORAGE tunneling capacitor
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23,B23,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...,B,23.0,,,,MACHINE TOOLS; METAL-WORKING NOT OTHERWISE PRO...
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10,C10,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G...",C,10.0,,,,"PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL G..."
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02,F02,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...,F,2.0,,,,COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PROD...


# tokenize and encode the test data

In [9]:
# Each model input is a (text, target) pair taken row by row from the test frame.
text_pairs = test_data[["text", "target"]].values.tolist()
encoded_test_data = encode_text(text_pairs, tokenizer, max_seq_len)

# Sanity check: overall shape, then the first encoded example of every feature.
print(encoded_test_data["input_ids"].shape)
for feature_name in ("input_ids", "attention_masks", "token_type_ids"):
    print(encoded_test_data[feature_name][0])

(36, 80)
[    2 20691  6393  1943  6608     3 27921  5967  8328  8231 16426  6608
     3     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]
[0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [10]:
# Order the features the same way as the model's `inputs` list:
# input ids, attention masks, token type ids.
model_input_order = ("input_ids", "attention_masks", "token_type_ids")
test_x = [encoded_test_data[feature_name] for feature_name in model_input_order]
print("test x shape : ", *(feature.shape for feature in test_x))

test x shape :  (36, 80) (36, 80) (36, 80)


# Load the USPPPM model fine-tuned from Google's BERT for Patents (BFP)

In [11]:
# Rebuild the network the fine-tuned weights are later loaded into: the
# pretrained BERT encoder, average pooling over the sequence, dropout, and a
# single linear output unit. Everything is created inside a MirroredStrategy scope.
# NOTE(review): statement/layer order is left untouched on purpose; the weight
# file loaded afterwards presumably expects exactly this structure — confirm
# before reordering anything.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_seq_len,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model. `from_pt=True` because the checkpoint
    # directory ships PyTorch weights (pytorch_model.bin).
    base_model = TFBertModel.from_pretrained(pt_model_dir, from_pt=True)

    base_model_output = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    # Per-token encoder outputs; the recorded run prints (None, 80, 1024).
    last_hidden_state = base_model_output.last_hidden_state
    print(last_hidden_state.shape)
    
    # Alternative head kept for reference (unused): regress from the first
    # ([CLS]) position instead of the pooled sequence.
#     cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(last_hidden_state)
#     output = tf.keras.layers.Dense(1, activation="linear")(cls_out)
    
    # Active head: average over the sequence axis, dropout, one linear unit.
    # NOTE(review): no mask is passed to the pooling layer, so padded positions
    # presumably contribute to the average — confirm this matches training.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
    dropout = tf.keras.layers.Dropout(0.3)(avg_pool)
    output = tf.keras.layers.Dense(1, activation="linear")(dropout)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # Compiled with Adam and MSE (loss and metric); this notebook only calls
    # load_weights/predict on the model afterwards.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
#         optimizer = tf.keras.optimizers.Adam(),
        loss='mse',
        metrics=["mse"]
    )

# Print a summary of the whole neural network model.
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

(None, 80, 1024)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 80)]         0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 80)]         0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 80)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 344702976   input_ids[0][0]                  
                                                                 attention_ma

In [12]:
# Replace the freshly initialised weights with the fine-tuned ones.
# NOTE(review): `ft_model_dir` is an .h5 file path; the model built earlier in
# this notebook presumably has to match the training-time architecture for the
# weights to line up — confirm.
model.load_weights(ft_model_dir)

# Prediction

In [13]:
# Score every encoded test pair. The model ends in a single Dense(1) unit, so
# `pred` presumably has shape (n_rows, 1).
# NOTE(review): the recorded submission output shows the same score for every
# id, which suggests the predictions are constant — verify the loaded weights.
pred = model.predict(test_x)

# Submit

In [14]:
# Start from the sample submission so the output keeps its columns and row order.
submission = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv")

# Flatten the model output to one value per test row and clamp it to [0, 1] in
# a single vectorised step (the regression head is linear, so raw predictions
# can fall outside that range).
scores = np.clip(np.asarray(pred, dtype="float64").reshape(-1), 0.0, 1.0)

# Attach scores by id rather than by position, so the result does not depend on
# sample_submission.csv and test.csv listing their rows in the same order.
# Building the Series also fails fast if the number of predictions differs from
# the number of test rows.
scores_by_id = pd.Series(scores, index=test_data['id'].to_numpy())
submission['score'] = submission['id'].map(scores_by_id)

# A missing score means an id without a prediction (or a NaN prediction); stop
# here with a clear message rather than writing an invalid file.
if submission['score'].isna().any():
    raise ValueError("submission contains missing scores (unmatched ids or NaN predictions)")

submission.to_csv("submission.csv",index=False)
submission

Unnamed: 0,id,score
0,4112d61851461f60,0.331659
1,09e418c93a776564,0.331659
2,36baf228038e314b,0.331659
3,1f37ead645e7f0c8,0.331659
4,71a5b6ad068d531f,0.331659
5,474c874d0c07bd21,0.331659
6,442c114ed5c4e3c9,0.331659
7,b8ae62ea5e1d8bdb,0.331659
8,faaddaf8fcba8a3f,0.331659
9,ae0262c02566d2ce,0.331659
