In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input mount and echo one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)
    

import tensorflow as tf
import tensorflow_addons as tfa
import tqdm
import random
from tensorflow.keras import backend as K
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

from transformers import AutoTokenizer, TFAutoModel
import transformers

from datasets import Dataset
import tensorflow_probability as tfp
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import gc
!pip install coral-ordinal
import coral_ordinal as coral
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install -q chemparse
! pip install -q pyvalem
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from pyvalem.formula import Formula
import chemparse

In [None]:
# Load the pickled lookup table that atoms_to_str() reads from.
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted dataset file.
with open('../input/periodictable/periodic_table.p', 'rb') as table_file:
    per_table = pickle.load(table_file)

def atoms_to_str(atoms):
    """Look each symbol up in `per_table` and join the results with single spaces.

    Symbols are lower-cased before the lookup. A symbol missing from `per_table`
    contributes an empty string, so its separator is still emitted.
    """
    looked_up = (per_table.get(symbol.lower(), '') for symbol in atoms)
    return ' '.join(looked_up)
    
def parse_formula(text):
    """Rewrite formula-like tokens of a space-separated phrase as atom names.

    Each token is passed to chemparse; the resulting symbols are spelled out with
    atoms_to_str(). A token is kept verbatim when that spelled-out string is
    shorter than 2 characters or the token itself is shorter than 3 characters.
    Otherwise the upper-cased token is parsed with pyvalem's Formula and replaced
    by the lower-cased names of its atoms; if that parse fails, the
    chemparse-based string is used instead.

    Returns the processed tokens joined by single spaces.
    """
    pieces = []

    for token in text.split(' '):
        symbols = chemparse.parse_formula(token).keys()
        spelled_out = atoms_to_str(symbols)

        # Not enough signal to call this a formula: keep the raw token.
        if len(spelled_out) < 2 or len(token) < 3:
            pieces.append(token)
            continue

        try:
            spelled_out = ' '.join(
                atom.name.lower() for atom in Formula(token.upper()).atoms
            )
        except Exception:
            # Deliberate best effort: fall back to the chemparse-derived names.
            pass

        pieces.append(spelled_out)

    return ' '.join(pieces)
 
def parse_df_formulas(df):
    """Return a copy of `df` whose `target` column has been run through parse_formula.

    The frame passed in is left untouched.
    """
    converted = df.copy()
    rewritten_targets = converted.target.apply(parse_formula)
    converted.loc[:, 'target'] = rewritten_targets
    return converted

In [None]:
# Pick the tf.distribute strategy: a TPU strategy when a TPU can be resolved,
# otherwise whatever strategy TensorFlow uses by default.
try:
    # TPU config
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print(f'TPU: {tpu.master()}')
except Exception as tpu_error:
    # `except Exception` rather than a bare `except` so kernel interrupts still propagate.
    # `tpu` is always bound (None here) so later cells can test for a TPU
    # instead of failing with a NameError.
    tpu = None
    strategy = tf.distribute.get_strategy()
    print(f'TPU unavailable ({type(tpu_error).__name__}); using the default strategy')

auto = tf.data.experimental.AUTOTUNE
replicas = strategy.num_replicas_in_sync

# XLA acceleration
tf.config.optimizer.set_jit(True)
print(f'Replicas: {replicas}')

# BertLegal (Inspired by: https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu)


In [None]:
class Config:
    """Run settings; every class-level default can be overridden per instance by keyword."""

    # Reproducibility and schedule
    seed = 69
    epochs = 40
    num_folds = 5

    # Tokenisation and batching
    max_length = 412
    batch_size = 32
    shuffle = True

    # Learning rate for the transformer
    lr1 = 1e-10
    # Learning rate for the output
    lr2 = 1e-4

    # Name of the pretrained checkpoint to load
    base = "anferico/bert-for-patents"

    def __init__(self, **overrides):
        # Each keyword becomes an instance attribute that shadows the class default.
        for name, value in overrides.items():
            setattr(self, name, value)
            
def seed_everything(seed=69):
    """Seed the Python, NumPy and TensorFlow RNGs and set PYTHONHASHSEED to `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)
    
# Instantiate the default settings and seed every RNG from them (Config.seed is 69).
config = Config()
seed_everything(seed=config.seed)

In [None]:
# Load the tokenizer for the checkpoint named in config.base.
# NOTE(review): `special_tokens=True` does not look like a documented from_pretrained option — confirm it has any effect.
tokenizer = AutoTokenizer.from_pretrained(config.base, special_tokens=True)

In [None]:
# CPC titles table; its `code` and `title` columns are used to build `mapping`.
codes = pd.read_csv("../input/cpc-codes/titles.csv")

In [None]:
# code -> title lookup used by process_data() to attach a text description to each context code.
mapping = dict(zip(codes.code, codes.title))

In [None]:
def process_data(file_path):
    """Load one competition CSV and build the text that is fed to the tokenizer.

    Args:
        file_path: File name inside ../input/us-patent-phrase-to-phrase-matching/
            (for example "train.csv").

    Returns:
        DataFrame holding the CSV's columns (with ``target`` rewritten by
        parse_df_formulas) plus:
          * ``context_text``    - title found in ``mapping`` for the ``context``
            code, or "" when the code is unknown;
          * ``section_context`` - first character of ``context`` in brackets, e.g. "[A]";
          * ``input``           - "<section> <title> [SEP] <anchor> [SEP] <target>",
            with title, anchor and target lower-cased;
          * ``y``               - ``score * 4``. Only added when the CSV has a
            ``score`` column, so unlabeled files can be processed as well.
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")
    frame = parse_df_formulas(frame)

    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))
    frame["section_context"] = "[" + frame["context"].str[0] + "]"
    frame["input"] = (
        frame["section_context"]
        + " " + frame["context_text"].str.lower()
        + " [SEP] " + frame["anchor"].str.lower()
        + " [SEP] " + frame["target"].str.lower()
    )

    # Unlabeled files have no `score`; skip the label instead of raising.
    if "score" in frame.columns:
        frame["y"] = frame["score"] * 4
    return frame

In [None]:
# Load and preprocess train.csv (see process_data).
train = process_data("train.csv")

In [None]:
# Preview the first rows of the processed frame.
train.head()

In [None]:
# Register every distinct bracketed section token built by process_data()
# as an additional special token on the tokenizer.
context_tokens = train["section_context"].unique().tolist()
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

In [None]:
def prepare_for_model(data, train):
    """Tokenize ``data["input"]`` and wrap it in a batched, prefetched tf.data.Dataset.

    Args:
        data: DataFrame with an ``input`` text column and, when ``train`` is
            true, a ``y`` label column.
        train: True  -> elements are ``(features, label)`` pairs, shuffled when
                        ``config.shuffle`` is set.
               False -> elements are features only, in the original row order.

    Returns:
        tf.data.Dataset batched by ``config.batch_size``. Every text is padded or
        truncated to ``config.max_length`` tokens.
    """
    encoded = tokenizer(
        list(data["input"].values),
        padding="max_length",
        max_length=config.max_length,
        truncation=True,
    )

    if train:
        dataset = tf.data.Dataset.from_tensor_slices((encoded.data, data["y"].tolist()))
        if config.shuffle:
            # A buffer covering the whole split gives a uniform shuffle; a small
            # fixed buffer only reorders rows locally.
            dataset = dataset.shuffle(max(len(data), 1))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(encoded.data)

    return dataset.batch(config.batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
def createBERT(config, num_train_samples=36473):
    """Build and compile the CORAL ordinal model on top of ``config.base``.

    Architecture: transformer -> first-token hidden state -> dropout ->
    tanh Dense(hidden_size) -> dropout -> CoralOrdinal(num_classes=5).

    Args:
        config: Settings object providing ``batch_size``, ``max_length``,
            ``lr1`` (peak learning rate for the transformer), ``lr2`` (peak
            learning rate for the layers stacked on top) and ``base``.
        num_train_samples: Number of training rows, used only to size the
            cyclical learning-rate schedules. Defaults to the value that was
            previously hard-coded.

    Returns:
        A compiled tf.keras Model taking ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` and producing the CoralOrdinal output.
    """
    with strategy.scope():
        steps_per_epoch = num_train_samples // config.batch_size

        def cyclical_lr(peak):
            # Cycles between peak/10 and peak; the amplitude halves every cycle.
            return tfa.optimizers.CyclicalLearningRate(
                initial_learning_rate=peak / 10,
                maximal_learning_rate=peak,
                scale_fn=lambda x: 1 / (2. ** (x - 1)),
                step_size=2 * steps_per_epoch,
            )

        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )
        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_mask"
        )
        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )

        base_model = transformers.TFAutoModel.from_pretrained(config.base, from_pt=True)

        # Special tokens added to the tokenizer need embedding rows of their own;
        # otherwise their ids fall outside the pretrained embedding matrix.
        if len(tokenizer) > base_model.config.vocab_size:
            base_model.resize_token_embeddings(len(tokenizer))

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )

        # Hidden state of the first token, used as the sequence representation.
        first_token_state = base_model_output.last_hidden_state[:, 0, :]

        dropout_rate = base_model.config.hidden_dropout_prob
        dense_layer = tf.keras.layers.Dense(base_model.config.hidden_size, activation="tanh")
        coral_layer = coral.CoralOrdinal(num_classes=5)

        hidden = tf.keras.layers.Dropout(dropout_rate)(first_token_state)
        hidden = dense_layer(hidden)
        hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
        model_output = coral_layer(hidden)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=model_output
        )

        # One optimizer per parameter group. The groups must be disjoint: the
        # earlier slices model.layers[0:4] / model.layers[3:] both contained the
        # transformer, so it was also updated at the head's learning rate.
        optimizer = tfa.optimizers.MultiOptimizer([
            (tf.keras.optimizers.Adam(learning_rate=cyclical_lr(config.lr1)), [base_model]),
            (tf.keras.optimizers.Adam(learning_rate=cyclical_lr(config.lr2)), [dense_layer, coral_layer]),
        ])

        # One weight per binary threshold task (num_classes - 1 of them).
        importance_weights = [0.1, 0.2, 0.3, 0.4]
        loss_fn = coral.OrdinalCrossEntropy(importance_weights=importance_weights)

        model.compile(
            optimizer=optimizer,
            loss=loss_fn,
        )

    return model

In [None]:
# Build the model once with default settings to print its layer summary; the model itself is not kept.
config = Config()
createBERT(config).summary()

In [None]:
from scipy import special


class Pearsonr(tf.keras.callbacks.Callback):
    """Keras callback that reports Pearson r on held-out data after every epoch.

    Args:
        val_data: Input accepted by ``model.predict`` (for example a batched
            tf.data.Dataset of features), in the same row order as ``y_val``.
        y_val: Ground-truth ordinal labels, one per validation row.

    The score is printed and, when Keras supplies a ``logs`` dict, stored under
    ``logs["val_pearsonr"]``.
    """

    def __init__(self, val_data, y_val):
        # Initialise the Callback base class so Keras can attach the model.
        super().__init__()
        self.val_data = val_data
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        # self.model is the model being fitted; there is no global `model`.
        # batch_size is not passed because the dataset is already batched.
        logits = np.asarray(self.model.predict(self.val_data))

        # The ordinal layer emits one logit per threshold (rows = samples), so
        # the array must stay 2-D. sigmoid gives P(y > k); the predicted label
        # is the number of thresholds whose probability exceeds 0.5.
        cum_probs = special.expit(logits)
        y_hat = (cum_probs > 0.5).sum(axis=1)

        val_pearsonr = pearsonr(self.y_val, y_hat)[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        if logs is not None:
            logs["val_pearsonr"] = val_pearsonr

In [None]:
def train_folds(train, config):
    """Train one model per stratified fold and score it on the held-out part.

    Args:
        train: DataFrame with ``score`` (one of 0, 0.25, 0.5, 0.75, 1), ``y``
            and the columns prepare_for_model() needs. A ``score_map`` column
            with the stratification classes is added to it.
        config: Settings object providing ``num_folds``, ``seed``, ``epochs``
            and ``batch_size``.

    Returns:
        NumPy array of length ``config.num_folds`` holding the Pearson r between
        predicted and true labels of each validation fold, computed with the
        best (lowest ``val_loss``) checkpoint. The weights of fold *k* are
        written to ``model-<k>.h5``.
    """
    oof = np.zeros(config.num_folds)

    skf = StratifiedKFold(n_splits=config.num_folds,
                          shuffle=True,
                          random_state=config.seed)

    # Stratify on the five discrete score levels.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

    # `tpu` is only bound when the setup cell resolved a TPU; look it up
    # defensively so CPU/GPU runs do not fail with a NameError.
    tpu_resolver = globals().get('tpu')

    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["score_map"])):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        K.clear_session()
        if tpu_resolver is not None:
            tf.tpu.experimental.initialize_tpu_system(tpu_resolver)

        # split() yields positional indices, hence iloc rather than loc.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)
        val_labels = val_df["y"].tolist()

        train_set = prepare_for_model(train_df, train=True)
        # Features only, original row order: used for predict().
        val_set = prepare_for_model(val_df, train=False)
        # fit() needs labels to compute val_loss, which the checkpoint and early
        # stopping monitor. Pair the ordered features with their labels.
        val_fit_set = (
            tf.data.Dataset.zip(
                (val_set.unbatch(), tf.data.Dataset.from_tensor_slices(val_labels))
            )
            .batch(config.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        weights_path = f'model-{fold+1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=5,
                                                          verbose=1)

        metric = Pearsonr(val_set, val_labels)

        model = createBERT(config)
        model.fit(
            train_set,
            validation_data=val_fit_set,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping,
                       metric],
            verbose=1
        )

        # Score the fold with the best checkpoint rather than the last epoch.
        model.load_weights(weights_path)

        # One logit per threshold and sample: keep the array 2-D. The label is
        # the number of thresholds with sigmoid(logit) > 0.5.
        logits = np.asarray(model.predict(val_set))
        y_hat = (tf.math.sigmoid(logits).numpy() > 0.5).sum(axis=1)

        performance = pearsonr(y_hat, val_labels)[0]

        oof[fold] = performance
        del model
        gc.collect()
        print(f"PearsonR: {performance}")

    return oof

In [None]:
# Run the cross-validation loop with default settings; bert_results receives train_folds' return value.
config = Config()
bert_results = train_folds(train, config)

In [None]:
# Per-fold scores recorded from earlier runs, kept for comparison:
# RoBERTa + Bi-LSTM: np.mean(np.array([0.80018084, 0.80536421, 0.80386591, 0.81383098, 0.81148292]))
# RoBERTa + Linear: np.mean(np.array([0.8099835003914698, 0.789178461208096, 0.8024664431203516, 0.8100770823958504, 0.8085313492150906]))

In [None]:
# Display the per-fold values returned by train_folds.
bert_results

In [None]:
# Mean of the per-fold values.
np.mean(bert_results)