In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
    

import tensorflow as tf
import tensorflow_addons as tfa
import tqdm
import random
from tensorflow.keras import backend as K
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

from transformers import AutoTokenizer, TFAutoModel
import transformers

from datasets import Dataset
import tensorflow_probability as tfp
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import gc
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Module-level accumulator for per-fold test predictions. get_inferences()
# rebinds it through `global`; None means nothing has been collected yet.
inferences = None

In [None]:
# Table of CPC titles; its `code` and `title` columns feed the lookup built in the next cell.
codes = pd.read_csv("../input/cpc-codes/titles.csv")

In [None]:
# Lookup from code to title, used by the *_process_data helpers.
mapping = {code: title for code, title in zip(codes["code"], codes["title"])}

In [None]:
def get_tokenizer(config):
    """Load the pretrained tokenizer found at ``config.base``.

    Args:
        config: object exposing a ``base`` attribute (checkpoint path or name).

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for that checkpoint.
    """
    checkpoint = config.base
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, special_tokens=True)
    return tokenizer

In [None]:
# Slower alternative, kept for reference: tokenize the whole frame in one call and wrap it with tf.data.
# def prepare_for_inference(data, tokenizer, config):
#     inputs = tokenizer(list(data["input"].values), padding = "max_length", max_length = config.max_length, truncation = True)
#     dataset = tf.data.Dataset.from_tensor_slices(inputs.data).batch(config.batch_size).prefetch(tf.data.AUTOTUNE)
    
#     return dataset

In [None]:
from transformers import DefaultDataCollator
from datasets import Dataset

def prepare_for_inference(data, tokenizer, config):
    """Tokenize the ``input`` column of ``data`` and expose it as a batched TF dataset.

    Args:
        data: pandas DataFrame holding an ``input`` text column.
        tokenizer: callable tokenizer applied to batches of ``input`` strings.
        config: object exposing ``max_length`` (padding/truncation length)
            and ``batch_size``.

    Returns:
        The result of ``Dataset.to_tf_dataset``: unshuffled batches carrying the
        ``attention_mask``, ``input_ids`` and ``token_type_ids`` columns.
    """
    def _tokenize(batch):
        # Pad/truncate every example to exactly config.max_length tokens.
        return tokenizer(
            batch["input"],
            padding="max_length",
            truncation=True,
            max_length=config.max_length,
        )

    # Tokenization is applied batch-wise across 8 worker processes.
    tokenized = Dataset.from_pandas(data).map(_tokenize, batched=True, num_proc=8)

    return tokenized.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        shuffle=False,  # keep row order so predictions line up with the input frame
        collate_fn=DefaultDataCollator(return_tensors="tf"),
        batch_size=config.batch_size,
    )

In [None]:
def get_inferences(model, test_data, config, name, num=None):
    """Predict the test set with every fold checkpoint of one model.

    Each fold's predictions are flattened to one row and appended to the
    module-level ``inferences`` array, which therefore always has shape
    ``(total_folds_so_far, n_samples)``. The function mutates that global and
    returns nothing, so re-running the calling cell appends the same rows again.

    Args:
        model: compiled Keras model exposing ``load_weights`` and ``predict``.
        test_data: already-batched dataset accepted by ``model.predict``.
        config: model configuration; ``config.num_folds`` is used when ``num``
            is not given.
        name: sub-directory of ``../input/patent-weights/Patent-Weights`` that
            holds ``model-<fold>.h5``; also used in the progress message.
        num: number of fold checkpoints to load (``model-1.h5`` .. ``model-<num>.h5``).
            Defaults to ``config.num_folds``.
    """
    global inferences
    if num is None:
        num = config.num_folds

    for fold in tqdm.tqdm(range(num)):
        model.load_weights(f"../input/patent-weights/Patent-Weights/{name}/model-{fold + 1}.h5")
        # test_data is batched upstream, so no batch_size is passed to predict().
        # Keeping every fold as a (1, n_samples) row means the accumulator is 2-D
        # from the very first fold.
        fold_predictions = np.asarray(model.predict(test_data)).reshape(1, -1)
        if inferences is None:
            inferences = fold_predictions
        else:
            inferences = np.vstack((inferences, fold_predictions))
        # Report the predictions of this fold (not the last scalar of a 1-D array).
        print(f"{name} {fold + 1} inferences: {fold_predictions[0]}")

In [None]:
def correlationLoss(y_actual, y_pred, axis=-2):
    """Loss equal to ``1 - r``, where ``r`` is the Pearson correlation along ``axis``.

    Args:
        y_actual: target values; converted to a tensor whose dtype is used throughout.
        y_pred: predictions, cast to the dtype of ``y_actual``.
        axis: axis along which the correlation is computed (default -2).

    Returns:
        Tensor with ``axis`` reduced away, holding ``1 - r``.

    Note:
        No epsilon is added to the denominator, so an input that is constant
        along ``axis`` leads to a division by zero.
    """
    targets = tf.convert_to_tensor(y_actual)
    preds = tf.cast(y_pred, targets.dtype)
    n = tf.cast(tf.shape(targets)[axis], targets.dtype)

    # keepdims=True leaves the reduced axis in place with size 1, so the means
    # broadcast against the inputs for any rank and any axis.
    target_mean = tf.reduce_sum(targets, axis=axis, keepdims=True) / n
    pred_mean = tf.reduce_sum(preds, axis=axis, keepdims=True) / n

    # Sums of squared deviations and of cross-deviations; the common 1/n factor
    # cancels in the ratio below, so it is never applied.
    target_ss = tf.reduce_sum(tf.math.squared_difference(targets, target_mean), axis=axis)
    pred_ss = tf.reduce_sum(tf.math.squared_difference(preds, pred_mean), axis=axis)
    cross = tf.reduce_sum((targets - target_mean) * (preds - pred_mean), axis=axis)

    corr = cross / tf.sqrt(target_ss * pred_ss)
    return tf.constant(1.0, dtype=targets.dtype) - corr

# Electra Inferencing

In [None]:
class ElectraConfig:
    """Settings for the ELECTRA model; keyword arguments override the class defaults."""

    seed = 69
    epochs = 40
    num_folds = 5
    max_length = 412   # tokenized sequence length (padding/truncation target)
    batch_size = 256

    lr1 = 1e-6   # learning rate for the transformer
    lr2 = 1e-4   # learning rate for the output layers

    # Local checkpoint directory; hub alternatives tried earlier:
    #   "google/electra-base-generator", "google/electra-base-discriminator"
    base = "../input/initialization/electra-base-sst2"
    shuffle = True

    def __init__(self, **kwargs):
        # Every keyword becomes an instance attribute, shadowing the class default.
        self.__dict__.update(kwargs)
            
def seed_everything(seed=69):
    """Seed Python's ``random``, NumPy and TensorFlow, and export ``PYTHONHASHSEED``.

    Args:
        seed: integer seed handed to every generator (default 69).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Instantiate the ELECTRA settings and seed all random number generators from them.
electra_config = ElectraConfig()
seed_everything(seed=electra_config.seed)

# The helper and the class are not needed again in this section; only the instance is kept.
del seed_everything, ElectraConfig

In [None]:
# Load the tokenizer stored at electra_config.base.
electra_tokenizer = get_tokenizer(electra_config)

In [None]:
def electra_process_data(file_path):
    """Read a competition CSV and build the text fed to the ELECTRA tokenizer.

    Three columns are appended to the frame:
      * ``context_text``    - title looked up in ``mapping`` ("" when the code is unknown),
      * ``section_context`` - first character of ``context`` wrapped in brackets, e.g. "[A]",
      * ``input``           - "<section> <title> [SEP] <anchor> [SEP] <target>", lower-cased
                              except for the section tag.

    Args:
        file_path: file name inside ``../input/us-patent-phrase-to-phrase-matching``.

    Returns:
        The loaded DataFrame with the extra columns.
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")

    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))
    frame["section_context"] = "[" + frame["context"].str[0] + "]"

    title = frame["context_text"].str.lower()
    anchor = frame["anchor"].str.lower()
    target = frame["target"].str.lower()
    frame["input"] = frame["section_context"] + " " + title + " [SEP] " + anchor + " [SEP] " + target

    return frame

In [None]:
# The training file is loaded only to collect the section tags it contains (see the cells below).
electra_train = electra_process_data("train.csv")

In [None]:
# Preview the first rows, including the derived context_text / section_context / input columns.
electra_train.head()

In [None]:
# Register every distinct section tag (e.g. "[A]") as an additional special token.
# NOTE(review): createElectra() never resizes the model's token embeddings — confirm
# that the ids assigned to these tokens exist in the checkpoint vocabulary.
context_tokens = list(electra_train.section_context.unique())
electra_tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

In [None]:
# The training frame and the tag list are no longer needed once the tokenizer is extended.
del electra_train, context_tokens

In [None]:
# Build the model input text for the test set.
electra_test = electra_process_data("test.csv")

In [None]:
# Drop the helper; both CSV files have been processed.
del electra_process_data

In [None]:
# Tokenize and batch the test set. The name is rebound from a DataFrame to the
# dataset returned by prepare_for_inference(), so this cell cannot be re-run on its own.
electra_test = prepare_for_inference(electra_test, electra_tokenizer, electra_config)

In [None]:
# The tokenizer is not used again after the test set has been tokenized.
del electra_tokenizer

In [None]:
def createElectra(config):
    """Build and compile the ELECTRA-based scoring model.

    The transformer at ``config.base`` is followed by a small head on the first
    token's hidden state: dropout -> dense (gelu) -> dropout -> dense (sigmoid).

    Args:
        config: object exposing ``batch_size``, ``lr1``, ``lr2``, ``max_length``
            and ``base``.

    Returns:
        A compiled ``tf.keras`` model taking ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and producing one sigmoid score per example.
    """
    # 36473 is presumably the number of training rows — TODO confirm.
    steps_per_epoch = 36473 // config.batch_size

    def _cyclical_lr(peak):
        # Schedule moving between peak / 10 and peak, with a half-cycle of two epochs.
        return tfa.optimizers.CyclicalLearningRate(
            initial_learning_rate=peak / 10,
            maximal_learning_rate=peak,
            scale_fn=lambda x: 1/(2.**(x-1)),
            step_size=2 * steps_per_epoch,
        )

    transformer_lr = _cyclical_lr(config.lr1)
    head_lr = _cyclical_lr(config.lr2)

    # One int32 input of length max_length per tokenizer output column.
    input_ids, attention_masks, token_type_ids = (
        tf.keras.layers.Input(shape=(config.max_length,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "attention_mask", "token_type_ids")
    )

    base_model = transformers.TFAutoModel.from_pretrained(config.base)
    hidden_states = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    ).last_hidden_state

    # Hidden state of the first token (the [CLS] position).
    cls_state = hidden_states[:, 0, :]

    drop_rate = base_model.config.hidden_dropout_prob
    head = tf.keras.layers.Dropout(drop_rate)(cls_state)
    head = tf.keras.layers.Dense(base_model.config.hidden_size, activation = "gelu")(head)
    head = tf.keras.layers.Dropout(drop_rate)(head)
    score = tf.keras.layers.Dense(1, activation = "sigmoid")(head)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=score
    )

    transformer_optimizer = tf.keras.optimizers.Adam(learning_rate=transformer_lr)
    head_optimizer = tf.keras.optimizers.Adam(learning_rate=head_lr)

    # NOTE(review): the slices [0:4] and [3:] both contain model.layers[3], so that
    # layer is handed to both optimizers — confirm this overlap is intended.
    optimizer = tfa.optimizers.MultiOptimizer([
        (transformer_optimizer, model.layers[0:4]),
        (head_optimizer, model.layers[3:]),
    ])

    model.compile(
        optimizer=optimizer,
        loss=correlationLoss,
        steps_per_execution=steps_per_epoch,
    )
    return model

In [None]:
# Build the compiled ELECTRA model; fold weights are loaded later by get_inferences().
Electra = createElectra(electra_config)

In [None]:
# The factory is not needed once the model exists.
del createElectra

In [None]:
# Predict the test set with the 5 ELECTRA fold checkpoints; rows are appended to `inferences`.
get_inferences(Electra, electra_test, electra_config, "Electra", 5)

In [None]:
# Drop the model, its settings and its dataset so the collection below can reclaim them.
del Electra, electra_config, electra_test

In [None]:
# Force a garbage-collection pass before the next model is built.
gc.collect()

# BERT Inferencing

In [None]:
class BERTConfig:
    """Settings for the BERT model; keyword arguments override the class defaults."""

    seed = 69
    epochs = 40
    num_folds = 5
    max_length = 412   # tokenized sequence length (padding/truncation target)
    batch_size = 128

    lr1 = 1e-10  # learning rate for the transformer
    lr2 = 1e-4   # learning rate for the output layers

    base = "../input/initialization/bert-for-patents"
    shuffle = True

    def __init__(self, **kwargs):
        # Every keyword becomes an instance attribute, shadowing the class default.
        self.__dict__.update(kwargs)
            
def seed_everything(seed=69):
    """Seed Python's ``random``, NumPy and TensorFlow, and export ``PYTHONHASHSEED``.

    Args:
        seed: integer seed handed to every generator (default 69).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Instantiate the BERT settings and re-seed all random number generators from them.
bert_config = BERTConfig()
seed_everything(seed=bert_config.seed)

# The helper and the class are not needed again in this section; only the instance is kept.
del seed_everything, BERTConfig

In [None]:
# Load the tokenizer stored at bert_config.base.
bert_tokenizer = get_tokenizer(bert_config)

In [None]:
def bert_process_data(file_path):
    """Read a competition CSV and build the text fed to the BERT tokenizer.

    Three columns are appended to the frame:
      * ``context_text``    - title looked up in ``mapping`` ("" when the code is unknown),
      * ``section_context`` - first character of ``context`` wrapped in brackets, e.g. "[A]",
      * ``input``           - "<section> <title> [SEP] <anchor> [SEP] <target>", lower-cased
                              except for the section tag.

    Args:
        file_path: file name inside ``../input/us-patent-phrase-to-phrase-matching``.

    Returns:
        The loaded DataFrame with the extra columns.
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")

    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))
    frame["section_context"] = "[" + frame["context"].str[0] + "]"

    title = frame["context_text"].str.lower()
    anchor = frame["anchor"].str.lower()
    target = frame["target"].str.lower()
    frame["input"] = frame["section_context"] + " " + title + " [SEP] " + anchor + " [SEP] " + target

    return frame

In [None]:
# The training file is loaded only to collect the section tags it contains (see the next cell).
bert_train = bert_process_data("train.csv")

In [None]:
# Register every distinct section tag (e.g. "[A]") as an additional special token.
# NOTE(review): createBERT() never resizes the model's token embeddings — confirm
# that the ids assigned to these tokens exist in the checkpoint vocabulary.
context_tokens = list(bert_train.section_context.unique())
bert_tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

In [None]:
# The training frame and the tag list are no longer needed once the tokenizer is extended.
del bert_train, context_tokens

In [None]:
# Build the model input text for the test set.
bert_test = bert_process_data("test.csv")

In [None]:
# Drop the helper; both CSV files have been processed.
del bert_process_data

In [None]:
# Tokenize and batch the test set. The name is rebound from a DataFrame to the
# dataset returned by prepare_for_inference(), so this cell cannot be re-run on its own.
bert_test = prepare_for_inference(bert_test, bert_tokenizer, bert_config)

In [None]:
# The tokenizer is not used again after the test set has been tokenized.
del bert_tokenizer

In [None]:
def createBERT(config):
    """Build and compile the BERT-based scoring model.

    The transformer at ``config.base`` is followed by a small head on the first
    token's hidden state: dropout -> dense (tanh) -> dropout -> dense (sigmoid).

    Args:
        config: object exposing ``batch_size``, ``lr1``, ``lr2``, ``max_length``
            and ``base``.

    Returns:
        A compiled ``tf.keras`` model taking ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and producing one sigmoid score per example.
    """
    # 36473 is presumably the number of training rows — TODO confirm.
    steps_per_epoch = 36473 // config.batch_size

    def _cyclical_lr(peak):
        # Schedule moving between peak / 10 and peak, with a half-cycle of two epochs.
        return tfa.optimizers.CyclicalLearningRate(
            initial_learning_rate=peak / 10,
            maximal_learning_rate=peak,
            scale_fn=lambda x: 1/(2.**(x-1)),
            step_size=2 * steps_per_epoch,
        )

    transformer_lr = _cyclical_lr(config.lr1)
    head_lr = _cyclical_lr(config.lr2)

    # One int32 input of length max_length per tokenizer output column.
    input_ids, attention_masks, token_type_ids = (
        tf.keras.layers.Input(shape=(config.max_length,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "attention_mask", "token_type_ids")
    )

    base_model = transformers.TFAutoModel.from_pretrained(config.base)
    hidden_states = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    ).last_hidden_state

    # Hidden state of the first token (the [CLS] position).
    cls_state = hidden_states[:, 0, :]

    drop_rate = base_model.config.hidden_dropout_prob
    head = tf.keras.layers.Dropout(drop_rate)(cls_state)
    head = tf.keras.layers.Dense(base_model.config.hidden_size, activation = "tanh")(head)
    head = tf.keras.layers.Dropout(drop_rate)(head)
    score = tf.keras.layers.Dense(1, activation = "sigmoid")(head)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=score
    )

    transformer_optimizer = tf.keras.optimizers.Adam(learning_rate=transformer_lr)
    head_optimizer = tf.keras.optimizers.Adam(learning_rate=head_lr)

    # NOTE(review): the slices [0:4] and [3:] both contain model.layers[3], so that
    # layer is handed to both optimizers — confirm this overlap is intended.
    optimizer = tfa.optimizers.MultiOptimizer([
        (transformer_optimizer, model.layers[0:4]),
        (head_optimizer, model.layers[3:]),
    ])

    model.compile(optimizer=optimizer, loss=correlationLoss)
    return model

In [None]:
# Build the compiled BERT model; fold weights are loaded later by get_inferences().
BERT = createBERT(bert_config)

In [None]:
# The factory is not needed once the model exists.
del createBERT

In [None]:
# Predict the test set with BERT fold checkpoints 1-4; rows are appended to `inferences`.
# NOTE(review): only 4 checkpoints are loaded although bert_config.num_folds is 5 —
# presumably the fifth is not available; confirm.
get_inferences(BERT, bert_test, bert_config, "BERT", 4)

In [None]:
# Drop the model, its settings and its dataset so the collection below can reclaim them.
del BERT, bert_config, bert_test

In [None]:
# Force a garbage-collection pass before the next model is built.
gc.collect()

# XLNet Inferencing

In [None]:
class XLConfig:
    """Settings for the XLNet model; keyword arguments override the class defaults."""

    seed = 69
    epochs = 40
    num_folds = 5
    max_length = 412   # tokenized sequence length (padding/truncation target)
    batch_size = 128

    lr1 = 1e-6   # learning rate for the transformer
    lr2 = 2e-4   # learning rate for the output layers

    base = "../input/initialization/xlnet-base-cased-STS-B"
    shuffle = True

    def __init__(self, **kwargs):
        # Every keyword becomes an instance attribute, shadowing the class default.
        self.__dict__.update(kwargs)
            
def seed_everything(seed=69):
    """Seed Python's ``random``, NumPy and TensorFlow, and export ``PYTHONHASHSEED``.

    Args:
        seed: integer seed handed to every generator (default 69).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Instantiate the XLNet settings and re-seed all random number generators from them.
XL_config = XLConfig()
seed_everything(seed=XL_config.seed)

# The helper and the class are not needed again in this section; only the instance is kept.
del seed_everything, XLConfig

In [None]:
# Load the tokenizer stored at XL_config.base.
XL_tokenizer = get_tokenizer(XL_config)

In [None]:
def XL_process_data(file_path):
    """Read a competition CSV and build the text fed to the XLNet tokenizer.

    Three columns are appended to the frame:
      * ``context_text``    - title looked up in ``mapping`` ("" when the code is unknown),
      * ``section_context`` - first character of ``context`` in angle brackets, e.g. "<A>",
      * ``input``           - "<section> <title> <sep> <anchor> <sep> <target>", lower-cased
                              except for the section tag.

    Args:
        file_path: file name inside ``../input/us-patent-phrase-to-phrase-matching``.

    Returns:
        The loaded DataFrame with the extra columns.
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")

    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))
    frame["section_context"] = "<" + frame["context"].str[0] + ">"

    title = frame["context_text"].str.lower()
    anchor = frame["anchor"].str.lower()
    target = frame["target"].str.lower()
    frame["input"] = frame["section_context"] + " " + title + " <sep> " + anchor + " <sep> " + target

    return frame

In [None]:
# The training file is loaded only to collect the section tags it contains (see the next cell).
XL_train = XL_process_data("train.csv")

In [None]:
# Register every distinct section tag (e.g. "<A>") as an additional special token.
# NOTE(review): createXL() never resizes the model's token embeddings — confirm
# that the ids assigned to these tokens exist in the checkpoint vocabulary.
context_tokens = list(XL_train.section_context.unique())
XL_tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

In [None]:
# The tag list is no longer needed once the tokenizer is extended.
del context_tokens

In [None]:
# The training frame was only needed for its section tags.
del XL_train

In [None]:
# Build the model input text for the test set.
XL_test = XL_process_data("test.csv")

In [None]:
# Drop the helper; both CSV files have been processed.
del XL_process_data

In [None]:
# Tokenize and batch the test set. The name is rebound from a DataFrame to the
# dataset returned by prepare_for_inference(), so this cell cannot be re-run on its own.
XL_test = prepare_for_inference(XL_test, XL_tokenizer, XL_config)

In [None]:
# The tokenizer is not used again after the test set has been tokenized.
del XL_tokenizer

In [None]:
def createXL(config):
    """Build and compile the XLNet-based scoring model.

    The transformer at ``config.base`` is followed by a small head on the last
    token's hidden state: dropout(0.1) -> dense (gelu) -> dropout(0.1) -> dense (sigmoid).

    Args:
        config: object exposing ``batch_size``, ``lr1``, ``lr2``, ``max_length``
            and ``base``.

    Returns:
        A compiled ``tf.keras`` model taking ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and producing one sigmoid score per example.
    """
    # 36473 is presumably the number of training rows — TODO confirm.
    steps_per_epoch = 36473 // config.batch_size

    def _cyclical_lr(peak):
        # Schedule moving between peak / 10 and peak, with a half-cycle of two epochs.
        return tfa.optimizers.CyclicalLearningRate(
            initial_learning_rate=peak / 10,
            maximal_learning_rate=peak,
            scale_fn=lambda x: 1/(2.**(x-1)),
            step_size=2 * steps_per_epoch,
        )

    transformer_lr = _cyclical_lr(config.lr1)
    head_lr = _cyclical_lr(config.lr2)

    # One int32 input of length max_length per tokenizer output column.
    input_ids, attention_masks, token_type_ids = (
        tf.keras.layers.Input(shape=(config.max_length,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "attention_mask", "token_type_ids")
    )

    base_model = transformers.TFAutoModel.from_pretrained(config.base)
    hidden_states = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    ).last_hidden_state

    # Hidden state of the LAST position (the BERT/ELECTRA builders use position 0).
    final_state = hidden_states[:, -1, :]

    head = tf.keras.layers.Dropout(0.1)(final_state)
    head = tf.keras.layers.Dense(base_model.config.hidden_size, activation = "gelu")(head)
    head = tf.keras.layers.Dropout(0.1)(head)
    score = tf.keras.layers.Dense(1, activation = "sigmoid")(head)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=score
    )

    transformer_optimizer = tf.keras.optimizers.Adam(learning_rate=transformer_lr)
    head_optimizer = tf.keras.optimizers.Adam(learning_rate=head_lr)

    # NOTE(review): the slices [0:4] and [3:] both contain model.layers[3], so that
    # layer is handed to both optimizers — confirm this overlap is intended.
    optimizer = tfa.optimizers.MultiOptimizer([
        (transformer_optimizer, model.layers[0:4]),
        (head_optimizer, model.layers[3:]),
    ])

    model.compile(optimizer=optimizer, loss=correlationLoss)
    return model

In [None]:
# Build the compiled XLNet model; fold weights are loaded later by get_inferences().
XL = createXL(XL_config)

In [None]:
# The factory is not needed once the model exists.
del createXL

In [None]:
# Predict the test set with the XLNet fold checkpoints; rows are appended to `inferences`.
# The fold count is passed explicitly. XL_config.num_folds (5) is assumed to match the
# number of saved XLNET checkpoints — TODO confirm against ../input/patent-weights.
get_inferences(XL, XL_test, XL_config, "XLNET", XL_config.num_folds)

In [None]:
# Drop the model, its settings and its dataset so the collection below can reclaim them.
del XL, XL_config, XL_test

In [None]:
# Force a garbage-collection pass now that the last model has been released.
gc.collect()

In [None]:
# Ensemble: average the stacked fold predictions over axis 0 (one row per fold).
final_predictions = np.asarray(inferences).mean(axis=0)

In [None]:
# Display the averaged predictions.
final_predictions

In [None]:
# Re-read the raw test file; its `id` column is used to build the submission.
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

In [None]:
# Snap each averaged prediction to the closest of the five score levels.
buckets = np.array([0, 0.25, 0.5, 0.75, 1])
# Distance of every prediction to every bucket, shape (n_predictions, 5);
# argmin keeps the first (lowest) bucket on ties.
nearest_bucket = np.abs(final_predictions[:, None] - buckets).argmin(axis=1)
processed_predictions = list(buckets[nearest_bucket])

In [None]:
# Display the bucketed predictions.
processed_predictions

In [None]:
# Submission frame: the test ids next to their bucketed score.
score_column = pd.Series(processed_predictions, name = "score")
output = pd.concat([test["id"], score_column], axis=1).reset_index(drop=True)

In [None]:
# Write the submission file to the working directory, without the index column.
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame.
output