# Triplet-loss model using extracted features from BERT

- Extract features from pre-trained BERT
- Create and train a model with triplet loss

## Extract features from pre-trained BERT

In [1]:
import sys
sys.path.append("../notebook/bert")

In [2]:
import os
import re
import csv
import time
import codecs
import collections
import tempfile

import modeling
import optimization
import tokenization
import tensorflow as tf

In [3]:
tf.__version__

'1.12.0'

In [4]:
from extract_features import convert_examples_to_features
from extract_features import InputExample
from extract_features import read_examples
from extract_features import _truncate_seq_pair
from extract_features import InputFeatures
from extract_features import input_fn_builder
from extract_features import model_fn_builder

In [5]:
import pandas as pd
import numpy as np

In [6]:
train_data = pd.read_csv("../data/bert_train_1000.tsv", sep="\t")
test_data = pd.read_csv("../data/bert_dev_1000.tsv", sep="\t")

In [7]:
train_data.head(2)

Unnamed: 0,index,claim_app,claim_cited_grant,label
0,0,1 . A process comprising the following steps:(...,"1. A liquid supply apparatus, comprising:a wal...",not_entailment
1,1,1 - 10 . (canceled) 11 . A method for open-loo...,"1. A fuel supply apparatus for an engine, comp...",entailment


In [8]:
train_data.tail(2)

Unnamed: 0,index,claim_app,claim_cited_grant,label
2562,2562,1 . A method implemented at least in part by a...,"1. In a caching device, a method for providing...",entailment
2563,2563,1 . A nonvolatile memory device comprising:a m...,"1. A non-volatile storage system, comprising:a...",entailment


In [9]:
test_data.head(2)

Unnamed: 0,index,claim_app,claim_cited_grant,label
0,0,1 . A rotational angle detection device for a ...,1. A tangible computer-readable medium having ...,not_entailment
1,1,1 . A method comprising:sensing electrocardiog...,"1. A medical system, comprising:an implantable...",entailment


In [10]:
# train_data = train_data.sort_values(['claim_app', 'label'])
# test_data = test_data.sort_values(['claim_app', 'label'])
# train_data = train_data[0:50]
# test_data = test_data[0:50]

In [11]:
tr_claim_app_txt = tempfile.NamedTemporaryFile(mode='r+')
train_data['claim_app'].to_csv(tr_claim_app_txt.name, header=None, index=None)
!sed -i -e 's/\"//g' {tr_claim_app_txt.name}

tr_claim_cited_grant_txt = tempfile.NamedTemporaryFile(mode='r+')
train_data['claim_cited_grant'].to_csv(tr_claim_cited_grant_txt.name, header=None, index=None)
!sed -i -e 's/\"//g' {tr_claim_cited_grant_txt.name}

In [12]:
len(train_data)

2564

In [13]:
te_claim_app_txt = tempfile.NamedTemporaryFile(mode='r+')
test_data['claim_app'].to_csv(te_claim_app_txt.name, header=None, index=None)
!sed -i -e 's/\"//g' {te_claim_app_txt.name}

te_claim_cited_grant_txt = tempfile.NamedTemporaryFile(mode='r+')
test_data['claim_cited_grant'].to_csv(te_claim_cited_grant_txt.name, header=None, index=None)
!sed -i -e 's/\"//g' {te_claim_cited_grant_txt.name}

In [14]:
len( test_data )

2502

In [15]:
class FLAGS(object):
    '''Parameters.'''
    def __init__(self):
        self.vocab_file = "./bert/model/uncased_L-12_H-768_A-12/vocab.txt"
        self.do_lower_case = True
        self.use_tpu = False
        self.layers = "-1"
        self.bert_config_file = "./bert/model/uncased_L-12_H-768_A-12/bert_config.json"
        self.max_seq_length = 512
        self.init_checkpoint = "./bert/model/uncased_L-12_H-768_A-12/bert_model.ckpt"
        self.use_one_hot_embeddings = False
        self.batch_size = 16
        
        # The following parameters are not used in predictions.
        # Just use to create RunConfig.
        self.master = None
        self.save_checkpoints_steps = 1
        self.iterations_per_loop = 1
        self.num_tpu_cores = 1
        self.learning_rate = 0
        self.num_warmup_steps = 0
        self.num_train_steps = 0
        self.train_batch_size = 0
        self.eval_batch_size = 0

FLAGS = FLAGS()

In [16]:
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

tokenizer = tokenization.FullTokenizer(
  vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  master=FLAGS.master,
  tpu_config=tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=is_per_host))

In [17]:
model_fn = model_fn_builder(
  bert_config=bert_config,
  init_checkpoint=FLAGS.init_checkpoint,
  layer_indexes=layer_indexes,
  use_tpu=FLAGS.use_tpu,
  use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

In [18]:
estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=FLAGS.use_tpu,
  model_fn=model_fn,
  config=run_config,
  predict_batch_size=FLAGS.batch_size)

INFO:tensorflow:Using config: {'_train_distribute': None, '_eval_distribute': None, '_task_id': 0, '_task_type': 'worker', '_is_chief': True, '_master': '', '_global_id_in_cluster': 0, '_service': None, '_cluster': None, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_experimental_distribute': None, '_save_summary_steps': 100, '_evaluation_master': '', '_log_step_count_steps': None, '_num_ps_replicas': 0, '_model_dir': '/tmp/tmpeaqh4r2k', '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_keep_checkpoint_max': 5, '_protocol': None, '_device_fn': None, '_cluster_spec': <tensorflow.python.training.server_lib

In [19]:
start = time.time()

In [20]:
examples = read_examples(tr_claim_app_txt.name)
features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

results =  np.empty((0,768), float)

for result in estimator.predict(input_fn, yield_single_examples=True):
    results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )

train_data['feature_claim_app'] = [elem for elem in results]

In [21]:
examples = read_examples(tr_claim_cited_grant_txt.name)
features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

results =  np.empty((0,768), float)

for result in estimator.predict(input_fn, yield_single_examples=True):
    results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )

train_data['feature_claim_cited_grant'] = [elem for elem in results]

In [22]:
train_data.to_pickle("../data/bert_extracted_feature_train_1000.pkl")

In [23]:
examples = read_examples(te_claim_app_txt.name)
features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

results =  np.empty((0,768), float)

for result in estimator.predict(input_fn, yield_single_examples=True):
    results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )

test_data['feature_claim_app'] = [elem for elem in results]

In [24]:
examples = read_examples(te_claim_cited_grant_txt.name)
features = convert_examples_to_features(examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)

results =  np.empty((0,768), float)

for result in estimator.predict(input_fn, yield_single_examples=True):
    results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )

test_data['feature_claim_cited_grant'] = [elem for elem in results]

In [25]:
test_data.to_pickle("../data/bert_extracted_feature_test_1000.pkl")

In [26]:
elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

elapsed_time:24374.943607330322[sec]


## Create and train a model with triplet loss

Please restart kernel before executing the following cells.

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

tf.enable_eager_execution()
tfe = tf.contrib.eager

In [2]:
train_feature = pd.read_pickle("../data/bert_extracted_feature_train_1000.pkl")
test_feature = pd.read_pickle("../data/bert_extracted_feature_test_1000.pkl")

In [3]:
train_feature = train_feature.sort_values(['claim_app', 'label'])
test_feature = test_feature.sort_values(['claim_app', 'label'])

In [4]:
train_feature.head(2)

Unnamed: 0,index,claim_app,claim_cited_grant,label,feature_claim_app,feature_claim_cited_grant
2246,2246,"1 An LED (light emitting diode) lamp, compris...",1. An LED lighting device having heat convecti...,entailment,"[-0.718012809753418, 0.4575798809528351, -0.58...","[-0.819080114364624, 0.6711320877075195, -0.60..."
2408,2408,"1 An LED (light emitting diode) lamp, compris...",1. A method of operating a turbocharged intern...,not_entailment,"[-0.718012809753418, 0.4575798809528351, -0.58...","[-1.1225836277008057, 0.0022592851892113686, -..."


Normalize features.

In [5]:
train_feature['feature_claim_app'] = [
    v/np.linalg.norm(v) for v in train_feature['feature_claim_app']
]

train_feature['feature_claim_cited_grant'] = [
    v/np.linalg.norm(v) for v in train_feature['feature_claim_cited_grant']
]

In [6]:
test_feature['feature_claim_app'] = [
    v/np.linalg.norm(v) for v in test_feature['feature_claim_app']
]

test_feature['feature_claim_cited_grant'] = [
    v/np.linalg.norm(v) for v in test_feature['feature_claim_cited_grant']
]

In [7]:
train_feature.head(2)

Unnamed: 0,index,claim_app,claim_cited_grant,label,feature_claim_app,feature_claim_cited_grant
2246,2246,"1 An LED (light emitting diode) lamp, compris...",1. An LED lighting device having heat convecti...,entailment,"[-0.04264463837636313, 0.027176852956473388, -...","[-0.049404172770041124, 0.04048044267115554, -..."
2408,2408,"1 An LED (light emitting diode) lamp, compris...",1. A method of operating a turbocharged intern...,not_entailment,"[-0.04264463837636313, 0.027176852956473388, -...","[-0.06754759109007105, 0.0001359446800674143, ..."


In [8]:
class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tfe.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tfe.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [9]:
def tripletloss(anchor_out, positive_out, negative_out, margin=0.2):
    norm_a_out = tf.nn.l2_normalize(anchor_out, axis=1)
    norm_p_out = tf.nn.l2_normalize(positive_out, axis=1)
    norm_n_out = tf.nn.l2_normalize(negative_out, axis=1)
    
    d_pos = tf.losses.cosine_distance(norm_a_out, norm_p_out, axis=1)
    d_neg = tf.losses.cosine_distance(norm_a_out, norm_n_out, axis=1)
    
    loss = tf.maximum(0.0, margin + d_pos - d_neg)
    
    return tf.reduce_mean(loss)

In [10]:
def train(input_data_np, batch_size, epochs):
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
    data_num = int(input_data_np.shape[1])

    for i in range(epochs):
        rand_idx = np.random.permutation(data_num)
        index_data_np = np.array([
            input_data_np[0][rand_idx], 
            input_data_np[1][rand_idx], 
            input_data_np[2][rand_idx]])

        input_data = tf.convert_to_tensor(input_data_np, dtype=tf.float32)
        anchor_data, positive_data, negative_data = input_data

        for iter_id in range(data_num // batch_size):        
            with tf.GradientTape() as tape:
                anchor_out = model.frwrd_pass(anchor_data[iter_id*batch_size : (iter_id+1)*batch_size])
                positive_out = model.frwrd_pass(positive_data[iter_id*batch_size : (iter_id+1)*batch_size])
                negative_out = model.frwrd_pass(negative_data[iter_id*batch_size : (iter_id+1)*batch_size])
                curr_loss = tripletloss(anchor_out, positive_out, negative_out)
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

        if i % 10 == 0:
            print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

In [11]:
# def train(input_data, epochs):
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.005)
#     anchor_data, positive_data, negative_data = input_data

#     for i in range(epochs):
#         with tf.GradientTape() as tape:
#             anchor_out = model.frwrd_pass(anchor_data)
#             positive_out = model.frwrd_pass(positive_data)
#             negative_out = model.frwrd_pass(negative_data)
#             curr_loss = tripletloss(anchor_out, positive_out, negative_out)
#         grads = tape.gradient( curr_loss, model.variables )
#         optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())

#         if i % 10 == 0:
#             print( "Loss at step {:d}: {:.5f}".format(i, curr_loss) )

In [12]:
# class Model(tf.keras.Model):
#     """
#     Model which extracts features for computing similarities.
#     """
#     def __init__(self, hidden_units, output_units):
#         super(Model, self).__init__()
#         self.hidden = tf.keras.layers.Dense(hidden_units, activation='relu')
#         self.output_feature = tf.keras.layers.Dense(output_units)

#     def call(self, inputs, training=None, mask=None):
#         x = self.hidden(inputs)
#         output = self.output_feature(x)

#         return output

In [13]:
anchor_list = []
positive_list = []
negative_list = []

for row in train_feature.itertuples():
    if row.label == 'entailment':
        anchor_list.append(row.feature_claim_app)
        positive_list.append(row.feature_claim_cited_grant)
    elif row.label == 'not_entailment':
        negative_list.append(row.feature_claim_cited_grant)

In [14]:
input_data = np.array([
    np.array(anchor_list),
    np.array(positive_list),
    np.array(negative_list),    
])

In [15]:
model = Model(input_shape=768, output_shape=100)

In [16]:
%%time
train(input_data, 10, 71)

Loss at step 0: 0.17352
Loss at step 10: 0.16483
Loss at step 20: 0.15072
Loss at step 30: 0.12698
Loss at step 40: 0.08736
Loss at step 50: 0.03721
Loss at step 60: 0.00000
Loss at step 70: 0.00000
CPU times: user 2min 6s, sys: 9.33 s, total: 2min 15s
Wall time: 1min 49s


In [17]:
test = np.array(input_data[0])
test = tf.convert_to_tensor(test[0:1,:], dtype=tf.float32)
test

<tf.Tensor: id=3836160, shape=(1, 768), dtype=float32, numpy=
array([[-4.26446386e-02,  2.71768533e-02, -3.46934721e-02,
         6.11349288e-03,  1.23090707e-02,  1.48237087e-02,
         1.01030609e-02,  4.48753759e-02, -2.84725763e-02,
        -2.75032986e-02, -1.47425728e-02, -1.68808531e-02,
        -2.12277211e-02,  3.94932404e-02, -1.79138761e-02,
         8.56063142e-03,  4.04272713e-02,  2.63840053e-02,
         4.34927503e-03, -9.89720647e-05, -3.65315862e-02,
        -1.96782369e-02,  4.38990742e-02,  8.60057306e-03,
        -1.34681806e-03, -1.43365394e-02, -4.57647219e-02,
        -2.24162303e-02, -3.56590673e-02,  1.17474897e-02,
         2.53382269e-02,  2.07901131e-02,  7.65671022e-04,
        -5.18207699e-02,  2.77366191e-02, -7.08562583e-02,
         3.34829241e-02, -3.55359614e-02,  3.25572118e-02,
        -2.92556435e-02, -3.42517681e-02,  1.56697072e-02,
         3.41315456e-02, -1.62944924e-02, -2.98880413e-02,
         4.43438403e-02, -2.20281690e-01,  6.60540443

In [18]:
model.frwrd_pass(test)

<tf.Tensor: id=3836165, shape=(1, 100), dtype=float32, numpy=
array([[ 0.04587567,  0.95603406,  0.19799237,  0.27132607,  0.5182936 ,
         0.24566728,  0.5247544 , -1.0414523 ,  2.0494282 ,  0.76284647,
         0.3533501 ,  0.06776845,  0.21529436, -0.38886002,  0.05726548,
        -0.34342277, -0.25012064,  0.36369675, -0.7488153 , -0.28223535,
        -0.11707094, -0.6332027 ,  1.068301  , -0.56316173, -0.52404666,
        -0.71921134,  0.06751603, -0.2854171 ,  0.19256648, -1.4145049 ,
         0.2158823 , -0.49014825,  0.00318229,  0.489711  ,  0.08123231,
        -0.18804863,  0.98251975, -0.21690783, -0.32022658,  0.08053899,
        -0.24395107, -0.16364872,  0.01122761,  0.24132544,  0.631406  ,
         0.14846647, -0.20164928,  0.6126946 , -0.4700961 ,  1.0737244 ,
        -0.36492234,  0.09478116,  0.21037029,  0.28478354,  0.54165417,
         0.72990406,  0.4772601 ,  0.8061782 , -0.5030485 , -0.32453632,
        -0.15377998, -0.5035307 ,  0.03122225,  0.32946926, -0

Save the trained model.

In [19]:
os.makedirs('../trained_model/tripletloss', exist_ok=True)

In [20]:
saver = tfe.Saver(model.variables)

In [21]:
saver.save("../trained_model/tripletloss/ckpt")

'../trained_model/tripletloss/ckpt'

## Inferece with trained model

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
test_app_feature_1000 = pd.read_pickle("../data/testset_app_feature_1000.pkl")
grants_feature_2000 = pd.read_pickle("../data/grants_feature_2000.pkl")

In [3]:
test_normalized_feature_dict_1000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(test_app_feature_1000['app_id'], test_app_feature_1000['feature'])
}

grants_normalized_feature_dict_2000 = { 
    k:v/np.linalg.norm(v) for k,v in zip(grants_feature_2000['parsed'], grants_feature_2000['feature'])
}

In [4]:
def sort_similarity_by_value(sim_dict, app_id):
    '''
    input:
        sim_dict: similary dictionary
        app_id: target application id
    return:
        [(parsed1, sim1), (parsed2, sim2), ...] sorted by similarities
    '''
    return [(parsed, sim_dict[app_id][parsed]) for parsed in sorted(sim_dict[app_id], key=sim_dict[app_id].get)]

In [5]:
def get_cited_grants(citations_info_target, app_id):
    '''
    input:
        citations_info_target: DataFrame of citation relationships
        app_id: target application id
    return:
        {parsed1, parsed2, ...} that are cited to reject app_id
    '''
    return set(citations_info_target[citations_info_target['app_id'] == app_id]['parsed'])

In [6]:
import os
import tensorflow as tf

tf.enable_eager_execution()
tfe = tf.contrib.eager

class Model(object):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.W = tfe.Variable( tf.random_normal( [self.input_shape, self.output_shape] ), name='weight' )
        self.B = tfe.Variable( tf.random_normal( [self.output_shape] ), name='bias' ) 
        self.variables = [ self.W, self.B ]
    
    def frwrd_pass(self,X_train):
        out = tf.matmul( X_train, self.W ) + self.B
        
        return out

In [7]:
model = Model(input_shape=768, output_shape=100)

In [8]:
tfe.Saver((model.variables)).restore("../trained_model/tripletloss/ckpt")

INFO:tensorflow:Restoring parameters from ../trained_model/tripletloss/ckpt


In [9]:
sorted_keys = sorted(test_normalized_feature_dict_1000.keys())

test_feature_tensors = tf.convert_to_tensor(
    np.array([ test_normalized_feature_dict_1000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [10]:
test_extracted_features = model.frwrd_pass(test_feature_tensors).numpy()

In [11]:
test_extracted_features.shape

(1000, 100)

In [12]:
test_extracted_features_df = pd.DataFrame({ 
    'app_id':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in test_extracted_features ]
})

In [13]:
test_extracted_features_df.head(2)

Unnamed: 0,app_id,extracted_feature
0,12000862,"[-0.031397596, 0.13662174, -0.21509674, 0.1348..."
1,12003258,"[0.025878849, 0.088063695, 0.08393917, -0.0568..."


In [14]:
sorted_keys = sorted(grants_normalized_feature_dict_2000.keys())

grants_feature_tensors = tf.convert_to_tensor(
    np.array([ grants_normalized_feature_dict_2000[k] for k in sorted_keys ]),
    dtype=tf.float32)

In [15]:
grants_extracted_features = model.frwrd_pass(grants_feature_tensors).numpy()

In [16]:
grants_extracted_features.shape

(2524, 100)

In [17]:
grants_extracted_features_df = pd.DataFrame({ 
    'parsed':sorted_keys, 'extracted_feature':[ v/np.linalg.norm(v) for v in grants_extracted_features ]
})

In [18]:
grants_extracted_features_df.head(2)

Unnamed: 0,extracted_feature,parsed
0,"[0.11370609, 0.10061456, 0.1965108, 0.05344611...",6837383
1,"[0.12420972, -0.012481616, -0.061706085, -0.03...",6837647


In [19]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_extracted_features_df['parsed'], grants_extracted_features_df['extracted_feature']) } 
    for app_id, test_f 
    in zip(test_extracted_features_df['app_id'], test_extracted_features_df['extracted_feature'])
}

CPU times: user 12.7 s, sys: 185 ms, total: 12.9 s
Wall time: 12.9 s


In [20]:
%%time

all_ranks = []

for app_id in test_extracted_features_df['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 3.99 s, sys: 39 ms, total: 4.03 s
Wall time: 3.97 s


In [21]:
import collections
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 59, 2: 33, 3: 19, 4: 18, 5: 14, 18: 14, 12: 13, 6: 12, 8: 12, 9: 11, 13: 11, 7: 10, 10: 10, 17: 10, 24: 10, 14: 9, 33: 9, 65: 9, 37: 8, 11: 7, 15: 7, 16: 7, 21: 7, 26: 7, 28: 7, 60: 7, 19: 6, 20: 6, 40: 6, 45: 6, 49: 6, 50: 6, 27: 5, 30: 5, 35: 5, 36: 5, 54: 5, 91: 5, 105: 5, 147: 5, 164: 5, 43: 5, 25: 4, 34: 4, 38: 4, 39: 4, 41: 4, 42: 4, 53: 4, 59: 4, 64: 4, 74: 4, 78: 4, 82: 4, 87: 4, 104: 4, 120: 4, 134: 4, 177: 4, 185: 4, 188: 4, 202: 4, 213: 4, 239: 4, 240: 4, 51: 4, 88: 4, 32: 4, 22: 3, 29: 3, 31: 3, 44: 3, 47: 3, 57: 3, 63: 3, 67: 3, 69: 3, 70: 3, 71: 3, 77: 3, 85: 3, 89: 3, 95: 3, 98: 3, 107: 3, 114: 3, 115: 3, 126: 3, 128: 3, 130: 3, 131: 3, 23: 3, 141: 3, 146: 3, 149: 3, 204: 3, 218: 3, 256: 3, 263: 3, 287: 3, 372: 3, 441: 3, 499: 3, 755: 3, 321: 3, 48: 2, 52: 2, 55: 2, 56: 2, 66: 2, 68: 2, 73: 2, 75: 2, 80: 2, 86: 2, 92: 2, 1118: 2, 96: 2, 97: 2, 101: 2, 102: 2, 113: 2, 116: 2, 118: 2, 122: 2, 125: 2, 129: 2, 132: 2, 138: 2, 154: 2, 155: 2, 159: 2, 160: 2, 1185:

### Use BERT features (NO any additional training)

In [22]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
test_app_feature_1000 = pd.read_pickle("../data/testset_app_feature_1000.pkl")
grants_feature_2000 = pd.read_pickle("../data/grants_feature_2000.pkl")

In [23]:
test_app_feature_1000.head(2)

Unnamed: 0,app_id,feature
0,14307191,"[-0.7771687507629395, 0.003387326840311289, -0..."
1,13137006,"[-1.0681793689727783, 0.43092426657676697, -0...."


In [26]:
test_app_feature_1000['feature'] = [
    v/np.linalg.norm(v) for v in test_app_feature_1000['feature']
]

grants_feature_2000['feature'] = [
    v/np.linalg.norm(v) for v in grants_feature_2000['feature']
]

In [27]:
test_app_feature_1000.head(2)

Unnamed: 0,app_id,feature
0,14307191,"[-0.048201614517536365, 0.00021008902177461973..."
1,13137006,"[-0.06514257634824648, 0.026279778238728124, -..."


In [29]:
%%time

sim_dict = {
    app_id:{ parsed:np.sum(test_f*grants_f)
            for parsed, grants_f 
            in zip(grants_feature_2000['parsed'], grants_feature_2000['feature']) } 
    for app_id, test_f 
    in zip(test_app_feature_1000['app_id'], test_app_feature_1000['feature'])
}

CPU times: user 17.3 s, sys: 198 ms, total: 17.5 s
Wall time: 17.5 s


In [30]:
%%time

all_ranks = []

for app_id in test_app_feature_1000['app_id']:
    cited_grants = get_cited_grants(citations_info_target, app_id)
    sorted_kv = reversed(sort_similarity_by_value(sim_dict, app_id))  # higher score, similar patent

    idx = 1
    for k,v in sorted_kv:
        if k in cited_grants:
            all_ranks.append(idx)
        idx += 1

CPU times: user 3.61 s, sys: 75.6 ms, total: 3.68 s
Wall time: 3.53 s


In [31]:
import collections
counter = collections.Counter(all_ranks)
print(counter)

Counter({1: 87, 3: 28, 2: 23, 4: 22, 6: 15, 5: 13, 8: 12, 10: 12, 17: 11, 7: 10, 11: 10, 25: 10, 27: 9, 12: 8, 13: 8, 14: 8, 18: 8, 20: 7, 21: 7, 71: 7, 15: 6, 23: 6, 24: 6, 30: 6, 32: 6, 33: 6, 67: 6, 9: 5, 29: 5, 35: 5, 40: 5, 58: 5, 124: 5, 209: 5, 16: 4, 26: 4, 31: 4, 39: 4, 42: 4, 43: 4, 49: 4, 51: 4, 52: 4, 54: 4, 59: 4, 63: 4, 72: 4, 83: 4, 99: 4, 112: 4, 19: 4, 136: 4, 157: 4, 215: 4, 38: 4, 563: 4, 530: 4, 36: 3, 41: 3, 45: 3, 47: 3, 48: 3, 53: 3, 55: 3, 65: 3, 69: 3, 87: 3, 94: 3, 109: 3, 137: 3, 122: 3, 129: 3, 138: 3, 167: 3, 174: 3, 176: 3, 182: 3, 207: 3, 213: 3, 231: 3, 238: 3, 241: 3, 250: 3, 260: 3, 309: 3, 342: 3, 412: 3, 443: 3, 151: 3, 85: 3, 515: 3, 1045: 3, 686: 3, 115: 3, 127: 3, 812: 3, 147: 3, 64: 3, 271: 3, 2020: 3, 22: 2, 28: 2, 34: 2, 1062: 2, 46: 2, 60: 2, 66: 2, 68: 2, 74: 2, 75: 2, 81: 2, 82: 2, 89: 2, 111: 2, 114: 2, 120: 2, 130: 2, 132: 2, 133: 2, 1161: 2, 141: 2, 146: 2, 149: 2, 153: 2, 155: 2, 156: 2, 158: 2, 159: 2, 160: 2, 161: 2, 175: 2, 178: 2, 18

# ===== Trial and error =====

In [None]:
!echo 'Who was Jim Henson ? ||| Jim Henson was a puppeteer' >> ./bert/tmp/input.txt

In [None]:
!python3 ./bert/extract_features.py \
  --input_file=./bert/tmp/input.txt \
  --output_file=./bert/tmp/output.json \
  --vocab_file=./bert/model/uncased_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=./bert/model/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=./bert/model/uncased_L-12_H-768_A-12/bert_model.ckpt \
  --layers=-1 \
  --max_seq_length=512 \
  --batch_size=8

In [None]:
import json
with open("./bert/tmp/output.json") as f:
    output = json.load(f)

In [16]:
results =  np.empty((0,768), float)

for result in estimator.predict(input_fn, yield_single_examples=True):
    results = np.append(results, result['layer_output_0'][0].reshape(1,768), axis=0 )

In [21]:
results.shape

(2, 768)

In [22]:
result['layer_output_0'].shape

(512, 768)

In [23]:
result['layer_output_0'][0].shape

(768,)

In [24]:
train_data.head()

Unnamed: 0,index,claim_app,claim_cited_grant,label
0,0,1 . A process comprising the following steps:(...,"1. A liquid supply apparatus, comprising:a wal...",not_entailment
1,1,1 - 10 . (canceled) 11 . A method for open-loo...,"1. A fuel supply apparatus for an engine, comp...",entailment
2,2,1 . A handpiece for treating biological tissue...,1. A method for irradiating tissue having abso...,entailment
3,3,1 . A power cable comprising:a power input com...,1. A temperature regulating system for a vehic...,not_entailment
4,4,1 . A cutting insert having a substantially cu...,1. A toolholder comprising:a) a cutter body ro...,entailment


In [28]:
test = []

for _ in range(len(train_data)):
    test.append(results[0])

In [29]:
train_data['test'] = test

In [32]:
train_data

Unnamed: 0,index,claim_app,claim_cited_grant,label,test
0,0,1 . A process comprising the following steps:(...,"1. A liquid supply apparatus, comprising:a wal...",not_entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
1,1,1 - 10 . (canceled) 11 . A method for open-loo...,"1. A fuel supply apparatus for an engine, comp...",entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
2,2,1 . A handpiece for treating biological tissue...,1. A method for irradiating tissue having abso...,entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
3,3,1 . A power cable comprising:a power input com...,1. A temperature regulating system for a vehic...,not_entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
4,4,1 . A cutting insert having a substantially cu...,1. A toolholder comprising:a) a cutter body ro...,entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
5,5,"1 . A multimedia system, comprising:a multimed...","1. An illumination module, comprising:an integ...",entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
6,6,1 .- 10 . (canceled) 11 . A blade for a comput...,1. A slide apparatus comprising:a slider beam ...,entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
7,7,1 . A method for performing radio usage measur...,1. A mechanical torque wrench for engaging a w...,not_entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
8,8,1 . A system configured to exchange energy wir...,1. A submersible table and seat assembly for u...,not_entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
9,9,1 . A computer-implemented method for gray bal...,"1. A method for calibrating a printing device,...",entailment,"[-0.628110945224762, 0.19321474432945251, -0.7..."
