In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Dummy WANDB API key, set only to silence a warning.
os.environ.update(WANDB_API_KEY="0")

In [None]:
from transformers import BertTokenizer, TFBertModel
from transformers import AutoTokenizer, TFAutoModel
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm import tqdm

In [None]:
! pip install nlpaug

In [None]:
import nlpaug.augmenter.word as naw

In [None]:
# Use a TPU when one can be resolved; otherwise fall back to TensorFlow's
# default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on every path, not only after the fallback.
print('Number of replicas:', strategy.num_replicas_in_sync)

## Loading Data

In [None]:
# Load the labelled training split from the competition's input dataset.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

In [None]:
# Number of rows in the training set.
train.shape[0]

In [None]:
# Distinct values of the `language` column and the number of rows for each.
labels, frequencies = np.unique(train["language"].values, return_counts=True)

In [None]:
# Name of the pretrained checkpoint; it is used for the tokenizer here and for
# the encoder loaded in build_model.
model_name = 'distilbert-base-multilingual-cased'
# Alternative checkpoint, currently disabled:
# model_name = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Convert a sentence into token ids terminated by the ``[SEP]`` token id.

    Both the tokenization and the token-to-id lookup use the module-level
    ``tokenizer``.

    Args:
        s: Sentence text to encode.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids`` returns for the
        sentence's tokens followed by the literal ``'[SEP]'`` token.
    """
    return tokenizer.convert_tokens_to_ids([*tokenizer.tokenize(s), '[SEP]'])

In [None]:
def bert_encode(premises, hypotheses, tokenizer, split=0):
    """Encode premise/hypothesis pairs into padded id and mask tensors.

    Every example is laid out as ``[CLS] hypothesis [SEP] premise [SEP]``
    (the hypothesis comes first). Rows are right-padded with zeros up to the
    longest example passed in this call.

    Note: the sentences are encoded by ``encode_sentence``, which uses the
    module-level tokenizer; the ``tokenizer`` argument is only used to look
    up the ``[CLS]`` id, so both should refer to the same tokenizer.

    Args:
        premises: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings, aligned with ``premises``.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        split: Row index at which the encoded examples are cut in two
            (negative values count from the end). ``0`` disables splitting.

    Returns:
        When ``split`` is 0, a dict with keys ``'input_word_ids'`` and
        ``'input_mask'``. Otherwise a tuple ``(inputs, val)`` of two such
        dicts holding rows ``[:split]`` and ``[split:]`` respectively.
    """
    sentence1 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(premises)])

    # One [CLS] id per example, prepended to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    ragged_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Densify once and reuse: real tokens get mask 1, padding stays 0.
    # Token-type ids are not built because the returned dicts never carried them.
    word_ids = ragged_word_ids.to_tensor()
    input_mask = tf.ones_like(ragged_word_ids).to_tensor()

    if split != 0:
        inputs = {
            'input_word_ids': word_ids[:split],
            'input_mask': input_mask[:split]}
        val = {
            'input_word_ids': word_ids[split:],
            'input_mask': input_mask[split:]}
        return inputs, val

    return {
        'input_word_ids': word_ids,
        'input_mask': input_mask}

In [None]:
def bert_encode_wo_premises(premises, hypotheses, tokenizer, split=0):
    """Encode hypotheses only (premises are ignored) into padded tensors.

    Every example is laid out as ``[CLS] hypothesis [SEP]``. Rows are
    right-padded with zeros up to the longest example passed in this call.

    Note: the sentences are encoded by ``encode_sentence``, which uses the
    module-level tokenizer; the ``tokenizer`` argument is only used to look
    up the ``[CLS]`` id, so both should refer to the same tokenizer.

    Args:
        premises: Unused; kept so the signature matches ``bert_encode``.
        hypotheses: Sequence of hypothesis strings.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        split: Row index at which the encoded examples are cut in two
            (negative values count from the end). ``0`` disables splitting.

    Returns:
        When ``split`` is 0, a dict with keys ``'input_word_ids'`` and
        ``'input_mask'``. Otherwise a tuple ``(inputs, val)`` of two such
        dicts holding rows ``[:split]`` and ``[split:]`` respectively.
    """
    sentence1 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(hypotheses)])

    # One [CLS] id per example, prepended to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    ragged_word_ids = tf.concat([cls, sentence1], axis=-1)

    # Densify once and reuse: real tokens get mask 1, padding stays 0.
    # Token-type ids are not built because the returned dicts never carried them.
    word_ids = ragged_word_ids.to_tensor()
    input_mask = tf.ones_like(ragged_word_ids).to_tensor()

    if split != 0:
        inputs = {
            'input_word_ids': word_ids[:split],
            'input_mask': input_mask[:split]}
        val = {
            'input_word_ids': word_ids[split:],
            'input_mask': input_mask[split:]}
        return inputs, val

    return {
        'input_word_ids': word_ids,
        'input_mask': input_mask}

In [None]:
# aug = naw.WordEmbsAug(model_type='glove', model_path='../input/glove6b/glove.6B.50d.txt')

In [None]:
VAL = 2000    # trailing rows held out for validation
MAX_N = 6000  # leading rows set aside as augmentation candidates

premises = train.premise.values
hypothesis = train.hypothesis.values
labels = train.label.values

# First MAX_N rows: candidates for augmentation.
to_aug_premises, to_aug_hypothesis, to_aug_labels = (
    column[:MAX_N] for column in (premises, hypothesis, labels))

# Last VAL rows: validation examples.
val_premises, val_hypothesis, val_labels = (
    column[-VAL:] for column in (premises, hypothesis, labels))

# Everything in between is used as-is.
other_premises, other_hypothesis, other_labels = (
    column[MAX_N:-VAL] for column in (premises, hypothesis, labels))

In [None]:
def augment(sentence, i):
    """Return augmented variants of an English sentence, else the sentence itself.

    The language of row ``i`` is read from ``train.lang_abv``. English rows
    are passed to the module-level ``aug`` augmenter; every other row is
    returned unchanged inside a one-element list.

    NOTE(review): the only visible definition of ``aug`` is commented out, so
    it must be defined before this is called on an English row.

    Args:
        sentence: Text to augment.
        i: Positional index of the sentence's row in ``train``.

    Returns:
        The result of ``aug.augment(sentence, n=3, num_thread=4)`` for English
        rows, otherwise ``[sentence]``.
    """
    is_english = train.lang_abv.values[i] == 'en'
    if not is_english:
        return [sentence]
    return aug.augment(sentence, n=3, num_thread=4)

In [None]:
# English-only rows among the first MAX_N training examples.
train[:MAX_N].loc[lambda rows: rows.lang_abv == "en"]

In [None]:
#premises_augmented = np.concatenate([augment(p, i) for (i, p) in tqdm(enumerate(to_aug_premises))])
#hypothesis_augmented = np.concatenate([augment(h, i) for (i, h) in tqdm(enumerate(to_aug_hypothesis))])

In [None]:
#premises = np.concatenate([premises_augmented, other_premises, val_premises])
#hypothesis = np.concatenate([hypothesis_augmented, other_hypothesis, val_hypothesis])

In [None]:

#labels = np.array([])
#for (i, y) in tqdm(enumerate(train.label.values[:MAX_N])):
#    if train.lang_abv.values[i] == 'en':
#        labels = np.concatenate([labels, [y, y, y]])
#    else:
#        labels = np.concatenate([labels, [y]])

#labels = np.concatenate([[label] * 3 if train.lang_abv.values[i] == 'en' else [label] for (i, label) in tqdm(enumerate(to_aug_labels))])
#labels = np.concatenate([labels, other_labels, val_labels])

In [None]:
import pickle

In [None]:
# Persist the current arrays to the notebook's output directory.
# Each file is opened in a `with` block so it is flushed and closed even if
# pickle.dump raises.
with open('/kaggle/working/labels.pkl', 'wb') as fh:
    pickle.dump(labels, fh)
with open('/kaggle/working/premises.pkl', 'wb') as fh:
    pickle.dump(premises, fh)
with open('/kaggle/working/hypothesis.pkl', 'wb') as fh:
    pickle.dump(hypothesis, fh)

In [None]:
# Replace the in-memory arrays with the ones stored in the `augmented-watson`
# input dataset (presumably produced by the commented-out augmentation cells
# above; verify before relying on it).
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles from a trusted dataset.
# Each file is opened in a `with` block so the handle is always closed.
with open('../input/augmented-watson/premises.pkl', 'rb') as fh:
    premises = pickle.load(fh)
with open('../input/augmented-watson/hypothesis.pkl', 'rb') as fh:
    hypothesis = pickle.load(fh)
with open('../input/augmented-watson/labels.pkl', 'rb') as fh:
    labels = pickle.load(fh)

In [None]:
# Encode all pairs and cut off the last VAL rows as the validation inputs.
train_input, val = bert_encode(premises, hypothesis, tokenizer, split=-VAL)

In [None]:
class SGDRScheduler(tf.keras.callbacks.Callback):
    '''Cosine annealing learning rate scheduler with periodic restarts.

    The learning rate starts every cycle at `max_lr` and is annealed towards
    `min_lr` along a half cosine, updated after each batch. When a cycle
    ends, the batch counter is reset, `max_lr` is multiplied by `lr_decay`,
    the cycle length is multiplied by `mult_factor` (rounded up), and a
    snapshot of the model weights is stored. When training ends, the most
    recent snapshot, if one exists, is loaded back into the model.

    NOTE(review): because `on_train_end` overwrites the model weights, it can
    undo a restore performed by another callback at the end of training
    (e.g. early stopping with best-weight restoration); confirm which of the
    two is intended to win.

    # Usage
        ```python
            schedule = SGDRScheduler(min_lr=1e-5,
                                     max_lr=1e-2,
                                     lr_decay=0.9,
                                     cycle_length=5,
                                     mult_factor=1.5)
            model.fit(X_train, Y_train, epochs=100, callbacks=[schedule])
        ```

    # Arguments
        min_lr: The lower bound of the learning rate range for the experiment.
        max_lr: The upper bound of the learning rate range for the experiment.
        lr_decay: Reduce the max_lr after the completion of each cycle.
                  Ex. To reduce the max_lr by 20% after each cycle, set this value to 0.8.
        cycle_length: Initial number of epochs in a cycle.
        mult_factor: Scale epochs_to_restart after each full cycle completion.

    # References
        Original paper: http://arxiv.org/abs/1608.03983
    '''
    def __init__(self,
                 min_lr,
                 max_lr,
                 lr_decay=1,
                 cycle_length=10,
                 mult_factor=2):
        # Let the Keras base class set up its own attributes first.
        super().__init__()

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.lr_decay = lr_decay

        self.batch_since_restart = 0
        self.next_restart = cycle_length

        self.cycle_length = cycle_length
        self.mult_factor = mult_factor

        # Snapshot taken at the end of the latest completed cycle; stays None
        # until the first cycle finishes.
        self.best_weights = None

        self.history = {}

    def clr(self):
        '''Calculate the learning rate for the current position in the cycle.'''
        fraction_to_restart = self.batch_since_restart / (self.steps_per_epoch * self.cycle_length)
        lr = self.min_lr + 0.5 * (self.max_lr - self.min_lr) * (1 + np.cos(fraction_to_restart * np.pi))
        return lr

    def on_train_begin(self, logs=None):
        '''Determine the steps per epoch and start training at the maximum learning rate.'''
        # Prefer the step count reported by Keras; otherwise derive it from
        # the sample count and batch size.
        self.steps_per_epoch = self.params['steps'] if self.params['steps'] is not None else round(self.params['samples'] / self.params['batch_size'])
        tf.keras.backend.set_value(self.model.optimizer.lr, self.max_lr)

    def on_batch_end(self, batch, logs=None):
        '''Record previous batch statistics and update the learning rate.'''
        logs = logs or {}
        self.history.setdefault('lr', []).append(tf.keras.backend.get_value(self.model.optimizer.lr))
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        self.batch_since_restart += 1
        tf.keras.backend.set_value(self.model.optimizer.lr, self.clr())

    def on_epoch_end(self, epoch, logs=None):
        '''Check for end of current cycle, apply restarts when necessary.'''
        # `epoch` is zero-based, hence the + 1.
        if epoch + 1 == self.next_restart:
            self.batch_since_restart = 0
            self.cycle_length = np.ceil(self.cycle_length * self.mult_factor)
            self.next_restart += self.cycle_length
            self.max_lr *= self.lr_decay
            self.best_weights = self.model.get_weights()

    def on_train_end(self, logs=None):
        '''Restore the weights saved at the end of the most recent completed cycle.

        Does nothing when training stopped before the first cycle finished,
        since no snapshot exists yet in that case.
        '''
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [None]:
# Learning-rate schedule: 4-epoch cycles of constant length (mult_factor=1),
# with the peak rate multiplied by 0.9 after each cycle.
sgdr_settings = dict(
    min_lr=1e-5,
    max_lr=1e-2,
    lr_decay=0.9,
    cycle_length=4,
    mult_factor=1,
)
schedule = SGDRScheduler(**sgdr_settings)

## Creating & Training Model

In [None]:
max_len = 100
# max_len = 50

def build_model():
    """Build and compile a 3-class classifier on top of the pretrained encoder.

    The encoder named by ``model_name`` receives the ``input_word_ids`` and
    ``input_mask`` inputs (each declared with length ``max_len``). The vector
    at sequence position 0 of the encoder's first output feeds a 3-unit
    softmax layer. The model is compiled with an SGD optimizer wrapped in
    stochastic weight averaging, sparse categorical cross-entropy loss and
    an accuracy metric.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    encoder = TFAutoModel.from_pretrained(model_name)

    ids_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    mask_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # Keep only the vector at position 0, which is where bert_encode places [CLS].
    encoder_output = encoder([ids_in, mask_in])[0]
    first_token = encoder_output[:, 0, :]
    class_probs = tf.keras.layers.Dense(3, activation='softmax')(first_token)

    classifier = tf.keras.Model(inputs=[ids_in, mask_in], outputs=class_probs)

    optimizer = tfa.optimizers.SWA(tf.keras.optimizers.SGD(1e-5), average_period=4)
    classifier.compile(
        optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Build and compile the model under the chosen distribution strategy.
with strategy.scope():
    model = build_model()

# Printing the summary does not create variables, so it runs outside the scope.
model.summary()

In [None]:
# Stop when validation accuracy has not improved for 12 epochs and roll back
# to the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=12,
    restore_best_weights=True,
)

# The last VAL labels belong to the validation inputs produced by bert_encode.
train_labels, val_labels = labels[:(-VAL)], labels[-VAL:]

model.fit(
    train_input,
    train_labels,
    epochs=24,
    verbose=1,
    batch_size=64,
    validation_data=(val, val_labels),
    callbacks=[callback, schedule],
)

In [None]:
# Load the test split and encode every pair (default split=0: no hold-out).
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
test_input_all = bert_encode(
    test["premise"].values,
    test["hypothesis"].values,
    tokenizer,
)

In [None]:
# Preview the first rows of the test set.
test.head()

## Prediction

In [None]:
# Class scores for every test example (one row per example).
raw_predictions_all = model.predict(test_input_all)

# Index of the highest-scoring class in each row, kept as a list of numpy ints.
predictions_all = list(np.argmax(raw_predictions_all, axis=1))

In [None]:
# Two-column frame: the test ids next to the predicted class.
submission_all = test[['id']].assign(prediction=predictions_all)
submission_all

In [None]:
# Write the submission to submission.csv in the working directory, without the
# DataFrame index column.
submission_all.to_csv("submission.csv", index = False)