In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
    

import tensorflow as tf
import tensorflow_addons as tfa
import tqdm
import random
from tensorflow.keras import backend as K
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

from transformers import AutoTokenizer, TFAutoModel
import transformers

from datasets import Dataset
import tensorflow_probability as tfp
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import gc
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Setting up a TPU (falls back to the default strategy when none is reachable)
try:
    # TPU config
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print(f'TPU: {tpu.master()}')
except Exception as tpu_error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report why the
    # fallback was taken instead of hiding it.
    print(f'TPU unavailable ({tpu_error!r}); using the default strategy')
    # Keep `tpu` defined so later cells can test for a TPU instead of
    # hitting a NameError.
    tpu = None
    strategy = tf.distribute.get_strategy()

# Shared by both paths.
auto = tf.data.experimental.AUTOTUNE
replicas = strategy.num_replicas_in_sync

# XLA acceleration
tf.config.optimizer.set_jit(True)
print(f'Replicas: {replicas}')

# BertLegal (Inspired by: https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu)


In [3]:
# Hyper-parameters and settings for the BERT experiment
class Config:
    """Bag of experiment settings; any default can be overridden per instance.

    Defaults live on the class. Keyword arguments passed to the constructor
    are stored as instance attributes, so they shadow the class defaults
    (unknown names are accepted and simply added).
    """

    seed = 69
    epochs = 40
    num_folds = 5
    max_length = 412
    batch_size = 128

    # Learning rate used for the transformer
    lr1 = 1e-10

    # Learning rate used for the output layers
    lr2 = 1e-4

    base = "anferico/bert-for-patents"
    shuffle = True

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
            
# Seed every random number generator used in this notebook
def seed_everything(seed=69):
    """Seed Python's `random`, NumPy and TensorFlow, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Build the config first so the seed comes from a single source of truth
# (Config.seed) instead of a repeated literal.
config = Config()
seed_everything(seed=config.seed)

In [4]:
# Load the pretrained tokenizer that matches the base checkpoint (config.base).
# NOTE(review): `special_tokens=True` is passed through as an extra keyword;
# confirm it has any effect for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(config.base, special_tokens=True)

In [5]:
# Load the CPC titles table; its `code` and `title` columns are used below
# to translate context codes into text.
codes = pd.read_csv("../input/cpc-codes/titles.csv")

In [6]:
# Lookup table: CPC context code -> title text
mapping = {code: title for code, title in zip(codes["code"], codes["title"])}

In [7]:
def process_data(file_path):
    """Read a competition CSV and add the text columns fed to the tokenizer.

    Args:
        file_path: File name inside the competition input directory.

    Returns:
        The loaded DataFrame with three added columns: `context_text`
        (title looked up in `mapping`, "" when the code is unknown),
        `section_context` (first character of the context code wrapped in
        square brackets) and `input` (the final model input string).
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")

    # Translate each context code into its title text
    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))

    # Custom token built from the first character of the context code
    frame["section_context"] = "[" + frame["context"].str[0] + "]"

    # Final input: "<section token> <context text> [SEP] <anchor> [SEP] <target>"
    context_part = frame["section_context"] + " " + frame["context_text"].str.lower()
    anchor_part = frame["anchor"].str.lower()
    target_part = frame["target"].str.lower()
    frame["input"] = context_part + " [SEP] " + anchor_part + " [SEP] " + target_part
    return frame

In [8]:
# Load the training CSV and build the `input` text column.
train = process_data("train.csv")

In [9]:
# Register every distinct section marker found in the training data as an
# additional special token of the tokenizer.
# NOTE(review): the model's embedding matrix has to cover the ids of these
# new tokens - confirm where the model is built.
context_tokens = train["section_context"].unique().tolist()
special_tokens = {'additional_special_tokens': context_tokens}
tokenizer.add_special_tokens(special_tokens)

In [10]:
# Load the test CSV and build the `input` text column the same way as for train.
test = process_data("test.csv")

In [11]:
# Preparing data to model using Tensorflow data api
#NOTE: Honestly quite slow... could be much faster using data_collators, but TPUs don't support them
def prepare_for_model(data, shuffle=True):
    """Tokenize the `input` column and wrap it in a batched tf.data pipeline.

    Args:
        data: DataFrame with an `input` text column. When it also has a
            `score` column the dataset yields `(features, score)` pairs;
            otherwise (e.g. unlabeled data) it yields the features only.
        shuffle: Whether to shuffle examples (buffer of 1024) before batching.

    Returns:
        A `tf.data.Dataset` batched with `config.batch_size` and prefetched.
    """
    encoded = tokenizer(
        list(data["input"].values),
        padding="max_length",
        max_length=config.max_length,
        truncation=True,
    )

    # Only attach labels when they exist, so unlabeled frames do not raise a
    # KeyError on the missing `score` column.
    if "score" in data:
        tensors = (encoded.data, data["score"].tolist())
    else:
        tensors = encoded.data

    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        dataset = dataset.shuffle(1024)

    return dataset.batch(config.batch_size).prefetch(tf.data.AUTOTUNE)

In [12]:
# Custom loss function: 1 - Pearson correlation (Pearson correlation is also the metric for the competition)
def correlationLoss(y_actual, y_pred, axis=-2):
    """Loss equal to one minus the Pearson correlation along `axis`.

    Args:
        y_actual: Ground-truth values; converted to a tensor.
        y_pred: Predictions; cast to the dtype of `y_actual`.
        axis: Axis over which the correlation is computed, default -2.

    Returns:
        A tensor holding `1 - corr`, with `axis` reduced away. When either
        input has zero variance along `axis` the correlation is treated as 0
        (loss 1) instead of producing NaN.
    """
    targets = tf.convert_to_tensor(y_actual)
    preds = tf.cast(y_pred, targets.dtype)
    count = tf.cast(tf.shape(targets)[axis], targets.dtype)

    # keepdims=True so the means broadcast back along `axis` for any input rank.
    target_mean = tf.reduce_sum(targets, axis=axis, keepdims=True) / count
    pred_mean = tf.reduce_sum(preds, axis=axis, keepdims=True) / count
    target_dev = targets - target_mean
    pred_dev = preds - pred_mean

    target_var = tf.reduce_sum(tf.square(target_dev), axis=axis)
    pred_var = tf.reduce_sum(tf.square(pred_dev), axis=axis)
    cov = tf.reduce_sum(target_dev * pred_dev, axis=axis)

    # Floor the product before the square root: a zero-variance batch would
    # otherwise give 0/0 = NaN in the loss and an infinite sqrt gradient.
    tiny = tf.constant(1e-12, dtype=targets.dtype)
    corr = cov / tf.sqrt(tf.maximum(target_var * pred_var, tiny))
    return tf.constant(1.0, dtype=targets.dtype) - corr

In [13]:
#Creating BERT
def createBERT(config, num_train_samples=36473):
    """Build and compile the BERT regressor with separate learning rates.

    The model takes `input_ids`, `attention_mask` and `token_type_ids`
    (each of length `config.max_length`), runs them through the pretrained
    `config.base` transformer, and maps the first token's hidden state
    through dropout -> dense(tanh) -> dropout -> dense(sigmoid) to a single
    score. It is compiled with `correlationLoss`.

    Args:
        config: Object providing `batch_size`, `lr1` (transformer learning
            rate), `lr2` (head learning rate), `max_length` and `base`.
        num_train_samples: Number of training examples used to derive the
            cyclical learning-rate step size. Defaults to the previously
            hard-coded 36473.

    Returns:
        The compiled `tf.keras.Model`.
    """
    def _halving_scale(cycle):
        # Amplitude multiplier of the cyclical schedule: halves every cycle.
        return 1 / (2. ** (cycle - 1))

    def _cyclical_lr(max_lr, steps_per_epoch):
        # Cyclical schedule oscillating between max_lr / 10 and max_lr.
        return tfa.optimizers.CyclicalLearningRate(
            initial_learning_rate=max_lr / 10,
            maximal_learning_rate=max_lr,
            scale_fn=_halving_scale,
            step_size=2 * steps_per_epoch,
        )

    # Model, schedules and optimizers are all created inside the strategy scope.
    with strategy.scope():

        steps_per_epoch = num_train_samples // config.batch_size

        transformer_lr = _cyclical_lr(config.lr1, steps_per_epoch)
        head_lr = _cyclical_lr(config.lr2, steps_per_epoch)

        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )

        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_mask"
        )

        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )

        base_model = transformers.TFAutoModel.from_pretrained(config.base, from_pt=True)

        # Special tokens added to the tokenizer get ids beyond the pretrained
        # vocabulary; grow the embedding matrix so those ids have their own
        # rows instead of indexing out of range.
        if len(tokenizer) > base_model.config.vocab_size:
            base_model.resize_token_embeddings(len(tokenizer))

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )

        # Hidden state of the first token, used as the sequence representation.
        first_token_state = base_model_output.last_hidden_state[:, 0, :]

        dropout_rate = base_model.config.hidden_dropout_prob
        hidden = tf.keras.layers.Dropout(dropout_rate)(first_token_state)
        hidden = tf.keras.layers.Dense(base_model.config.hidden_size, activation="tanh")(hidden)
        hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)

        model_output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=model_output
        )

        transformer_optimizer = tf.keras.optimizers.Adam(learning_rate=transformer_lr)
        head_optimizer = tf.keras.optimizers.Adam(learning_rate=head_lr)

        # Assign every layer to exactly one optimizer. The previous slices
        # (layers[0:4] and layers[3:]) overlapped on one layer, so that layer
        # was updated by BOTH optimizers. Selecting by identity also avoids
        # depending on the layer order.
        # NOTE(review): the transformer is now stepped only with config.lr1;
        # with the current default of 1e-10 it will barely move, so revisit
        # lr1 if the transformer is meant to be fine-tuned.
        head_layers = [layer for layer in model.layers if layer is not base_model]
        optimizers_and_layers = [
            (transformer_optimizer, [base_model]),
            (head_optimizer, head_layers),
        ]
        optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

        model.compile(
            optimizer=optimizer,
            loss=correlationLoss
        )

    return model

In [20]:
# Preview the first rows of the processed training frame.
train.head()

In [24]:
def train_folds(train, config):
    """Train one model per stratified fold and score it on the held-out part.

    For every fold a fresh model is built with `createBERT`, trained with
    checkpointing on the best `val_loss` and early stopping, reloaded from the
    best checkpoint (`model-<fold>.h5`) and evaluated with the Pearson
    correlation between its predictions and the validation scores.

    Args:
        train: DataFrame with `input` and `score` columns. A `score_map`
            column (score level 0-4 used for stratification) is added to it.
        config: Object providing `num_folds`, `seed`, `epochs` and the
            settings read by `createBERT` / `prepare_for_model`.

    Returns:
        NumPy array of length `config.num_folds` with the Pearson correlation
        obtained on each fold's validation split.
    """
    fold_scores = np.zeros(config.num_folds)

    skf = StratifiedKFold(n_splits=config.num_folds,
                          shuffle=True,
                          random_state=config.seed)

    # Only re-initialize the TPU when one was actually found; looked up
    # through globals() so a missing `tpu` name does not raise NameError.
    tpu_resolver = globals().get("tpu")

    #Break training set into multiple folds
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["score_map"])):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        # Clearing TPU and freeing memory
        K.clear_session()
        if tpu_resolver is not None:
            tf.tpu.experimental.initialize_tpu_system(tpu_resolver)

        # skf.split yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)

        train_set = prepare_for_model(train_df)
        val_set = prepare_for_model(val_df, shuffle = False)

        checkpoint_path = f'model-{fold+1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=5,
                                                          verbose=1)

        model = createBERT(config)
        model.fit(
            train_set,
            validation_data=val_set,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping],
            verbose=1
        )

        # Score the best checkpoint rather than the last epoch.
        model.load_weights(checkpoint_path)
        # val_set is already batched, so no batch_size is passed to predict.
        y_hat = model.predict(val_set).reshape(-1)

        #Using the metric for evaluation
        performance = pearsonr(y_hat, val_df["score"].tolist())[0]
        fold_scores[fold] = performance
        #Freeing up memory
        del model
        gc.collect()
        print(f"PearsonR: {performance}")

    return fold_scores

In [30]:
# Rebuild the training frame from the CSV before running the folds.
train = process_data("train.csv")

In [31]:
# Train every fold with the default configuration; `oof` receives the array
# returned by train_folds (one Pearson correlation per fold).
config = Config()
oof = train_folds(train, config)

In [None]:
# Reuse the per-fold scores from the training run in the previous cell
# instead of retraining every fold a second time with the same configuration.
bert_results = oof

In [None]:
# Display the per-fold results.
bert_results

In [None]:
# Average of the per-fold results.
np.mean(bert_results)