In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
    

import tensorflow as tf
import tensorflow_addons as tfa
import tqdm
import random
from tensorflow.keras import backend as K
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

from transformers import AutoTokenizer, TFAutoModel
import transformers

from datasets import Dataset
import tensorflow_probability as tfp
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import gc
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Pick a distribution strategy: a TPU strategy when a TPU can be resolved and
# initialised, otherwise TensorFlow's default strategy.
auto = tf.data.experimental.AUTOTUNE
try:
    # TPU config
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print(f'TPU: {tpu.master()}')
except Exception as tpu_error:
    # Report why the TPU path was abandoned instead of swallowing it silently,
    # and keep `tpu` defined (as None) because later cells refer to it by name.
    print(f'TPU unavailable ({tpu_error!r}); falling back to the default strategy')
    tpu = None
    strategy = tf.distribute.get_strategy()
replicas = strategy.num_replicas_in_sync

# XLA acceleration
tf.config.optimizer.set_jit(True)
print(f'Replicas: {replicas}')

# XLNet STSB-2 (Inspired by: https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu)


# Important note: The model I'm ensembling with was trained with xlnet-base-cased. This notebook was me trying to use the larger version of the model for training. Due to its size, it hasn't been stable so far...

# To see the model I used for ensembling, look at the commented-out lines in `Config` and at the functions `createXL` and `train_folds_normal_XL` (not the large variants).


# Key differences between large-XL and normal-XL
* Normal XL has two learning rates while large XL has only one learning rate with the XL-base being frozen
* Different seed out of desperation
* BCE loss for large vs Pearson loss for normal. The reasoning for using BCE was smoother gradients and the possibility of the model learning different features (the problem is that large XL gets stuck in a lot of local minima, even with one-cycle learning-rate scheduling and Adam!)

In [None]:
class Config():
    """Hyper-parameters read by the data pipeline, model builders and fold loops.

    The class attributes are the defaults. Any keyword argument passed to the
    constructor is stored on the instance under the same name, shadowing the
    class-level default (unknown names are accepted and stored as well).
    """

    # Seed handed to the fold splitter (the base-model run used 69).
    seed = 420
    # Upper bound on training epochs per fold.
    epochs = 40
    # Number of cross-validation folds.
    num_folds = 5
    # Token length every input is padded / truncated to.
    max_length = 256
    # Batch size of the tf.data pipelines.
    batch_size = 64

    # Learning rate for the transformer: 0 for the large model,
    # 1e-6 was used for the base model.
    lr1 = 0

    # Learning rate for the output head.
    lr2 = 2e-5

    # Pre-trained checkpoint; the base-model run used
    # "textattack/xlnet-base-cased-STS-B".
    base = "textattack/xlnet-large-cased-STS-B"
    shuffle = True

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
            
def seed_everything(seed=69):
    """Seed every random-number source this notebook touches.

    Sets the PYTHONHASHSEED environment variable and seeds Python's `random`
    module, NumPy's global generator and TensorFlow's global generator with
    the same value.

    Args:
        seed: integer seed applied to all of the above (default 69).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Build the config first so the global RNG seed and the fold-split seed
# (config.seed) are one and the same value rather than two unrelated constants.
config = Config()
seed_everything(seed=config.seed)

In [None]:
# Load the tokenizer published under the checkpoint name in config.base.
tokenizer = AutoTokenizer.from_pretrained(config.base, special_tokens=True)

In [None]:
# CPC titles table; its `code` and `title` columns are used to build the lookup below.
codes = pd.read_csv("../input/cpc-codes/titles.csv")

In [None]:
# Lookup table: code -> title.
mapping = dict(zip(codes.code, codes.title))

In [None]:
def process_data(file_path):
    """Load one competition CSV and add the text columns the model consumes.

    Args:
        file_path: file name inside ../input/us-patent-phrase-to-phrase-matching/.

    Returns:
        The loaded DataFrame with three extra columns:
        `context_text` (title looked up in `mapping`, "" when the code is
        missing), `section_context` (first character of `context` wrapped in
        angle brackets) and `input` (section tag, lower-cased title, anchor
        and target joined with " <sep> ").
    """
    frame = pd.read_csv(f"../input/us-patent-phrase-to-phrase-matching/{file_path}")

    frame["context_text"] = frame["context"].map(lambda code: mapping.get(code, ""))

    # Special formatting for XL-Net, a bit different from BERT
    frame["section_context"] = "<" + frame["context"].str[0] + ">"

    prefix = frame["section_context"] + " " + frame["context_text"].str.lower()
    anchor_text = frame["anchor"].str.lower()
    target_text = frame["target"].str.lower()
    frame["input"] = prefix + " <sep> " + anchor_text + " <sep> " + target_text

    return frame

In [None]:
# Training rows with the model input text columns added.
train = process_data("train.csv")

In [None]:
# Register every distinct "<X>" section tag found in the training data as an
# additional special token of the tokenizer.
# NOTE(review): no embedding resize of the loaded models is visible in this
# notebook — confirm the ids of these new tokens are valid for the checkpoints.
context_tokens = list(train.section_context.unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

In [None]:
# Display the special tokens currently registered on the tokenizer.
tokenizer.all_special_tokens

In [None]:
# Test rows, processed by the same function as the training rows.
test = process_data("test.csv")

In [None]:
def prepare_for_model(data, shuffle=True):
    """Turn a processed frame into a batched (features, score) tf.data pipeline.

    Args:
        data: DataFrame with an `input` text column and a `score` column.
        shuffle: when true, shuffle with a 1024-element buffer before batching.

    Returns:
        A tf.data.Dataset of (tokenizer output dict, score) batches of
        config.batch_size elements, with prefetching enabled.
    """
    texts = list(data["input"].values)
    encoded = tokenizer(texts, padding = "max_length", max_length = config.max_length, truncation = True)

    pipeline = tf.data.Dataset.from_tensor_slices((encoded.data, data['score'].tolist()))
    if shuffle:
        pipeline = pipeline.shuffle(1024)

    return pipeline.batch(config.batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
def prepare_for_prediction(data):
    """Turn a processed frame into a batched, label-free tf.data pipeline.

    Args:
        data: DataFrame with an `input` text column.

    Returns:
        A tf.data.Dataset of tokenizer output dicts in batches of
        config.batch_size elements, in the original row order, with
        prefetching enabled.
    """
    texts = list(data["input"].values)
    encoded = tokenizer(texts, padding = "max_length", max_length = config.max_length, truncation = True)

    batches = tf.data.Dataset.from_tensor_slices(encoded.data).batch(config.batch_size)
    return batches.prefetch(tf.data.AUTOTUNE)

In [None]:
def correlationLoss(y_actual, y_pred, axis=-2):
    """Loss equal to ``1 - r``, with ``r`` the Pearson correlation of the inputs along `axis`.

    Args:
        y_actual: target values; converted to a tensor whose dtype is used for
            the whole computation.
        y_pred: predicted values; cast to the dtype of `y_actual`.
        axis: axis along which the correlation is computed (default -2).

    Returns:
        Tensor holding ``1 - r`` with `axis` reduced away. When either input is
        constant along `axis` the correlation is treated as 0, so the loss is 1
        instead of NaN.
    """
    x = tf.convert_to_tensor(y_actual)
    y = tf.cast(y_pred, x.dtype)

    # keepdims=True keeps the means broadcastable against the inputs for any
    # choice of `axis`, not only the default.
    x_dev = x - tf.reduce_mean(x, axis=axis, keepdims=True)
    y_dev = y - tf.reduce_mean(y, axis=axis, keepdims=True)

    cov = tf.reduce_sum(x_dev * y_dev, axis=axis)
    x_ss = tf.reduce_sum(tf.square(x_dev), axis=axis)
    y_ss = tf.reduce_sum(tf.square(y_dev), axis=axis)

    # Floor the product of the sums of squares: with a constant input it is 0,
    # and 0/0 (plus the infinite gradient of sqrt at 0) would turn the loss and
    # its gradients into NaN.
    floor = tf.cast(1e-12, x.dtype)
    corr = cov / tf.sqrt(tf.maximum(x_ss * y_ss, floor))
    return tf.constant(1.0, dtype=x.dtype) - corr

In [None]:
def createLargeXL(config):
    """Build and compile the XLNet regressor with a frozen transformer backbone.

    The model takes `input_ids`, `attention_mask` and `token_type_ids`, runs
    them through the pre-trained transformer named by `config.base`, takes the
    hidden state at the final sequence position and feeds it through
    Dropout -> Dense(gelu) -> Dropout -> Dense(1, sigmoid). Only the head is
    trainable. It is compiled with binary cross-entropy and Adam driven by a
    cyclical learning-rate schedule that peaks at `config.lr2`.

    Args:
        config: object providing `batch_size`, `lr2`, `max_length` and `base`.

    Returns:
        The compiled tf.keras model, created inside the global `strategy` scope.
    """
    with strategy.scope():
        # NOTE(review): 36473 is presumably the row count of train.csv; each
        # fold trains on only part of it — confirm the intended cycle length.
        num_train_rows = 36473
        steps_per_epoch = num_train_rows // config.batch_size

        clr = tfa.optimizers.CyclicalLearningRate(
            initial_learning_rate=config.lr2 / 10,
            maximal_learning_rate=config.lr2,
            scale_fn=lambda x: 1/(2.**(x-1)),
            step_size=2 * steps_per_epoch
        )

        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )

        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_mask"
        )

        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )

        base_model = transformers.TFAutoModel.from_pretrained(config.base, from_pt=True)
        # Freeze the whole backbone through the public Keras switch; it
        # propagates to every nested layer, so no private attributes are touched.
        base_model.trainable = False

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )

        # Hidden state at the last sequence position.
        # NOTE(review): presumably the <cls> position because the XLNet
        # tokenizer pads on the left — confirm.
        last_hidden_state = base_model_output.last_hidden_state[:, -1, :]

        dropout = tf.keras.layers.Dropout(0.1)(last_hidden_state)

        dense = tf.keras.layers.Dense(base_model.config.hidden_size, activation = "gelu")(dropout)

        dropout = tf.keras.layers.Dropout(0.1)(dense)

        model_output = tf.keras.layers.Dense(1, activation = "sigmoid")(dropout)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=model_output
        )

        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits = False)
        # Pass the schedule to the optimizer: it was previously built but
        # unused, so training ran at a constant config.lr2.
        optimizer = tf.keras.optimizers.Adam(learning_rate=clr)

        model.compile(
            optimizer = optimizer,
            loss=loss_fn
        )

    return model

In [None]:
# Build the large model once and print its layer summary.
m = createLargeXL(config)
m.summary()

In [None]:
def createXL(config):
    """Build and compile the XLNet regressor with separate backbone / head learning rates.

    The model takes `input_ids`, `attention_mask` and `token_type_ids`, runs
    them through the pre-trained transformer named by `config.base`, takes the
    hidden state at the final sequence position and feeds it through
    Dropout -> Dense(gelu) -> Dropout -> Dense(1, sigmoid). The transformer is
    updated by an Adam optimizer whose cyclical schedule peaks at `config.lr1`,
    the head by a second Adam optimizer whose schedule peaks at `config.lr2`.
    The loss is `correlationLoss`.

    Args:
        config: object providing `batch_size`, `lr1`, `lr2`, `max_length` and `base`.

    Returns:
        The compiled tf.keras model, created inside the global `strategy` scope.
    """
    with strategy.scope():
        # NOTE(review): 36473 is presumably the row count of train.csv; each
        # fold trains on only part of it — confirm the intended cycle length.
        num_train_rows = 36473
        steps_per_epoch = num_train_rows // config.batch_size

        def cyclical_schedule(peak_lr):
            # Cyclical schedule from peak_lr / 10 up to peak_lr.
            return tfa.optimizers.CyclicalLearningRate(
                initial_learning_rate=peak_lr / 10,
                maximal_learning_rate=peak_lr,
                scale_fn=lambda x: 1/(2.**(x-1)),
                step_size=2 * steps_per_epoch
            )

        clr1 = cyclical_schedule(config.lr1)
        clr2 = cyclical_schedule(config.lr2)

        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )

        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_mask"
        )

        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )

        base_model = transformers.TFAutoModel.from_pretrained(config.base, from_pt=True)

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )

        # Hidden state at the last sequence position.
        # NOTE(review): presumably the <cls> position because the XLNet
        # tokenizer pads on the left — confirm.
        last_hidden_state = base_model_output.last_hidden_state[:, -1, :]

        dropout = tf.keras.layers.Dropout(0.1)(last_hidden_state)

        # Keep handles on the trainable head layers so they can be assigned to
        # their optimizer explicitly further down.
        hidden_layer = tf.keras.layers.Dense(base_model.config.hidden_size, activation = "gelu")
        dense = hidden_layer(dropout)

        dropout = tf.keras.layers.Dropout(0.1)(dense)

        output_layer = tf.keras.layers.Dense(1, activation = "sigmoid")
        model_output = output_layer(dropout)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=model_output
        )

        backbone_optimizer = tf.keras.optimizers.Adam(learning_rate=clr1)
        head_optimizer = tf.keras.optimizers.Adam(learning_rate=clr2)

        # Assign each optimizer a disjoint set of layers. The previous slices
        # model.layers[0:4] and model.layers[3:] shared index 3, so that layer
        # was handed to both optimizers.
        optimizers_and_layers = [
            (backbone_optimizer, base_model),
            (head_optimizer, [hidden_layer, output_layer]),
        ]
        optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

        loss_fn = correlationLoss

        model.compile(
            optimizer = optimizer,
            loss=loss_fn
        )

    return model

In [None]:
def train_folds_normal_XL(train, config):
    """Train the base-size model with stratified K-fold CV and score every fold.

    For each fold a fresh model from `createXL` is fitted with checkpointing
    and early stopping on `val_loss`; the best checkpoint is reloaded, the
    validation fold is predicted and the Pearson correlation with the true
    scores is recorded. The reloaded model is also saved as
    `XLNet-STSB-<fold>`.

    Args:
        train: DataFrame with `input` and `score` columns. Side effect: a
            `score_map` column (score level 0-4) is added to it.
        config: object providing `num_folds`, `seed` and `epochs`, plus what
            `createXL` reads.

    Returns:
        NumPy array of length `config.num_folds` with one Pearson r per fold.
    """
    fold_scores = np.zeros(config.num_folds)

    skf = StratifiedKFold(n_splits=config.num_folds,
                          shuffle=True,
                          random_state=config.seed)

    # Stratify on the discrete score level.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["score_map"])):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        K.clear_session()

        # split() yields positional indices, so rows are selected with iloc;
        # loc would only be right for a default 0..n-1 index.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)

        train_set = prepare_for_model(train_df)
        val_set = prepare_for_model(val_df, shuffle = False)

        weights_path = f'model-{fold+1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=3,
                                                          verbose=1)

        model = createXL(config)
        model.fit(
            train_set,
            validation_data=val_set,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping],
            verbose=1
        )

        # Score the fold with the best checkpoint, not the last epoch.
        model.load_weights(weights_path)
        # val_set is already batched, so no batch_size is passed to predict.
        y_hat = model.predict(val_set).reshape(-1)
        performance = pearsonr(y_hat, val_df["score"].tolist())[0]
        fold_scores[fold] = performance
        model.save(f'XLNet-STSB-{fold + 1}')
        del model
        gc.collect()
        print(f"PearsonR: {performance}")

    return fold_scores

In [1]:
def train_folds_large_XL(train, config):
    """Train the large model with stratified K-fold CV and score every fold.

    For each fold the Keras session is cleared, the TPU system is
    re-initialised when a TPU resolver exists, and a fresh model from
    `createLargeXL` is fitted with checkpointing and early stopping on
    `val_loss`. The best checkpoint is reloaded, the validation fold is
    predicted and the Pearson correlation with the true scores is recorded.

    Args:
        train: DataFrame with `input` and `score` columns. Side effect: a
            `score_map` column (score level 0-4) is added to it.
        config: object providing `num_folds`, `seed` and `epochs`, plus what
            `createLargeXL` reads.

    Returns:
        NumPy array of length `config.num_folds` with one Pearson r per fold.
    """
    fold_scores = np.zeros(config.num_folds)

    skf = StratifiedKFold(n_splits=config.num_folds,
                          shuffle=True,
                          random_state=config.seed)

    # Stratify on the discrete score level.
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["score_map"])):
        print("*" * 25)
        print(f"Training fold: {fold+1}")

        K.clear_session()
        # The global `tpu` is only bound when TPU setup succeeded; look it up
        # defensively so a CPU/GPU run does not die with a NameError here.
        tpu_resolver = globals().get("tpu")
        if tpu_resolver is not None:
            tf.tpu.experimental.initialize_tpu_system(tpu_resolver)

        # split() yields positional indices, so rows are selected with iloc;
        # loc would only be right for a default 0..n-1 index.
        train_df = train.iloc[train_idx].reset_index(drop=True)
        val_df = train.iloc[val_idx].reset_index(drop=True)

        train_set = prepare_for_model(train_df)
        val_set = prepare_for_model(val_df, shuffle = False)

        weights_path = f'model-{fold+1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=5,
                                                          verbose=1)

        model = createLargeXL(config)
        model.fit(
            train_set,
            validation_data=val_set,
            epochs=config.epochs,
            callbacks=[checkpoint,
                       early_stopping],
            verbose=1
        )

        # Score the fold with the best checkpoint, not the last epoch.
        model.load_weights(weights_path)
        y_hat = model.predict(val_set).reshape(-1)
        performance = pearsonr(y_hat, val_df["score"].tolist())[0]
        fold_scores[fold] = performance
        # Full-model export is left disabled for the large variant:
        #     model.save(f'XLNet-STSB-large-{fold + 1}')
        del model
        gc.collect()
        print(f"PearsonR: {performance}")

    return fold_scores

In [None]:
# Re-create the config and run the large-model cross-validation;
# xls_large receives one Pearson r per fold.
config = Config()
xls_large = train_folds_large_XL(train, config)

In [None]:
# Roberta with Bi-LSTM: np.mean(np.array([0.80018084, 0.80536421, 0.80386591, 0.81383098, 0.81148292]))
# Roberta + Linear: np.mean(np.array([0.8099835003914698, 0.789178461208096, 0.8024664431203516, 0.8100770823958504, 0.8085313492150906]))
# Roberta + Classification-Layering & Variable learning_rates & Special_Tokenization & 1cycle: 
# BERT + Classification-Layering & Variable learning_rates & Special_Tokenization & 1cycle: 

In [None]:
# Per-fold Pearson correlations of the large model.
xls_large

In [None]:
# Mean Pearson correlation across the folds.
np.mean(xls_large)