In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Make sure the scratch directory exists; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(os.path.dirname('/kaggle/temp/'), exist_ok=True)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

def glove():
    """Load the pickled GloVe 840B/300d embeddings from the Kaggle input dataset.

    Returns:
        The object stored in the pickle file. ``matrix`` queries it with
        ``.get(word)``, so it is presumably a word -> vector mapping.
    """
    print("loading embeddings")
    glove_path = '/kaggle/input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only use a trusted dataset here.
    with open(glove_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def matrix(corpus):
    """Fit a tokenizer on ``corpus`` and build the matching embedding matrix.

    The fitted tokenizer is also pickled to /kaggle/temp/tokenizer.pickle so it
    can be reloaded later without refitting.

    Args:
        corpus: Texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        Tuple ``(emb_matrix, tokenizer)``. ``emb_matrix`` has
        ``len(tokenizer.word_index) + 1`` rows of width 300; the row for a
        word that has no entry in the loaded embeddings stays all zeros.
    """
    pretrained = glove()

    print("training tokenizer")
    tokenizer = Tokenizer(num_words=50000)
    tokenizer.fit_on_texts(corpus)

    with open("/kaggle/temp/tokenizer.pickle", "wb") as out_file:
        pickle.dump(tokenizer, out_file)

    vocab = tokenizer.word_index
    emb_matrix = np.zeros((len(vocab) + 1, 300))

    print("building embeddings matrix")
    for token, row in vocab.items():
        vec = pretrained.get(token)
        if vec is None:
            # No pretrained vector for this word: leave its row as zeros.
            continue
        emb_matrix[row] = vec

    return emb_matrix, tokenizer

In [None]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
import pickle
from sklearn.model_selection import train_test_split


from tensorflow.keras.preprocessing.sequence import pad_sequences


def get_data(data: pd.DataFrame):
    """Split a competition frame into its columns and build the joined texts.

    Args:
        data: Frame with 'premise' and 'hypothesis' columns and, optionally,
            a 'label' column.

    Returns:
        Tuple ``([premises, hypothesis, labels], texts)``. ``premises`` and
        ``hypothesis`` are numpy arrays; ``labels`` is the 'label' column when
        present, otherwise an array of zeros with one entry per row. ``texts``
        is a list with "<premise> <hypothesis>" for every row.
    """
    premises = data['premise'].to_numpy()
    hypothesis = data['hypothesis'].to_numpy()

    if 'label' in data.columns:
        labels = data['label']
    else:
        # Unlabelled frame: use zero placeholders so the return shape is the same.
        labels = np.zeros(premises.shape[0])

    combined = [first + ' ' + second for first, second in zip(premises, hypothesis)]

    return [premises, hypothesis, labels], combined


def load_data():
    """Read the competition's train and test CSV files.

    Returns:
        Tuple ``(traindata, testdata)`` of DataFrames, read in that order.
        No train/validation split is made here.
    """
    train_path = "/kaggle/input/contradictory-my-dear-watson/train.csv"
    test_path = "/kaggle/input/contradictory-my-dear-watson/test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)


def process(data, tokenizer):
    """Turn the output of ``get_data`` into model-ready arrays.

    Args:
        data: Sequence ``[premises, hypothesis, labels]``.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        Tuple ``(premise_ids, hypothesis_ids, one_hot_labels)``: the two text
        columns converted to index sequences and passed through
        ``pad_sequences`` with maxlen 50, and the labels passed through
        ``to_categorical``.
    """
    premise_ids = pad_sequences(tokenizer.texts_to_sequences(data[0]), maxlen=50)
    hypothesis_ids = pad_sequences(tokenizer.texts_to_sequences(data[1]), maxlen=50)
    one_hot_labels = to_categorical(data[2])
    return premise_ids, hypothesis_ids, one_hot_labels


def preprocess_train(traindata):
    """Fit the tokenizer on the training frame and encode it.

    Args:
        traindata: Training DataFrame accepted by ``get_data``.

    Returns:
        Tuple ``(train, embeddings)``: the arrays produced by ``process`` and
        the embedding matrix produced by ``matrix``.
    """
    columns, corpus = get_data(traindata)

    # matrix() fits the tokenizer on the joined texts and pickles it as a side effect.
    emb_matrix, fitted_tokenizer = matrix(corpus)

    print("preprocessing train data")
    return process(columns, fitted_tokenizer), emb_matrix


def preprocess_test(testdata):
    """Encode a frame with the tokenizer pickled at /kaggle/temp/tokenizer.pickle.

    Args:
        testdata: DataFrame accepted by ``get_data``.

    Returns:
        The tuple produced by ``process`` for this frame.
    """
    columns = get_data(testdata)[0]

    with open("/kaggle/temp/tokenizer.pickle", "rb") as in_file:
        saved_tokenizer = pickle.load(in_file)

    print("preprocessing test data")
    return process(columns, saved_tokenizer)



In [None]:
import tensorflow_addons as tfa
# Select the distribution strategy: try to connect to a TPU first and fall
# back to TensorFlow's current default strategy otherwise.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError here is treated as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was chosen; previously this
# was printed only on the fallback path.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:

from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import Bidirectional, LSTM, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam



def BiLSTM(data):
    """Train a two-input BiLSTM classifier on premise/hypothesis pairs and save it.

    The premise and the hypothesis go through the same embedding layer and the
    same bidirectional LSTM; the two encodings are concatenated and fed to
    three dense layers followed by a 3-way softmax. Training uses SGD with
    early stopping on validation accuracy, and the model is written to
    /kaggle/temp/BiLSTM.h5.

    Args:
        data: Training DataFrame accepted by ``preprocess_train``.

    Returns:
        None. The trained model is only persisted to disk.
    """
    with strategy.scope():
        train, embeddings = preprocess_train(data)

        # Take the sequence length from the padded data so that the Input
        # layers, the Embedding layer and the arrays passed to fit() agree.
        # The inputs used to be declared with length 100 while the data and
        # the embedding used 50.
        seq_len = train[0].shape[1]

        # Both layers are created once and applied to both inputs, so the
        # premise and hypothesis branches share weights.
        embedding = Embedding(input_dim=embeddings.shape[0], output_dim=300, weights=[embeddings], input_length=seq_len)
        lstm = Bidirectional(LSTM(64))

        premise_x = Input(shape=(seq_len, ), dtype='int32')
        hypothesis_x = Input(shape=(seq_len, ), dtype='int32')

        premise, hypothesis = embedding(premise_x), embedding(hypothesis_x)
        premise, hypothesis = lstm(premise), lstm(hypothesis)

        train_input = concatenate([premise, hypothesis])
        train_input = Dropout(0.3)(train_input)

        # Three identical Dense + Dropout stages on top of the joint encoding.
        for _ in range(3):
            train_input = Dense(600, activation='relu')(train_input)
            train_input = Dropout(0.2)(train_input)

        pred = Dense(3, activation='softmax')(train_input)

        optimizer = SGD(0.01)
        model = Model(inputs=[premise_x, hypothesis_x], outputs=pred)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        model.summary()

        print("training")

        # Stop after 40 epochs without a val_accuracy improvement and restore
        # the best weights seen.
        callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=40, restore_best_weights=True)

        model.fit(x=[train[0], train[1]],
                  y=train[2],
                  batch_size=256,
                  epochs=600,
                  validation_split=0.2,
                  verbose=1,
                  callbacks=[callback])

        model.save("/kaggle/temp/BiLSTM.h5")


In [None]:
from tensorflow.keras.models import load_model, Model
import pandas as pd
import numpy as np


def test_model(data):
    """Predict classes for ``data`` with the saved model and write the submission CSV.

    Args:
        data: DataFrame accepted by ``preprocess_test``; its 'id' column is
            used as the index of the output file.

    Returns:
        None. Predictions are written to /kaggle/working/submission.csv.
    """
    # Encode the texts with the tokenizer saved during training.
    features = preprocess_test(data)
    classifier = load_model('/kaggle/temp/BiLSTM.h5')

    # Only the two text inputs are used; the predicted class is the arg-max
    # over the model's output for each row.
    class_scores = classifier.predict([features[0], features[1]])
    predicted = np.argmax(class_scores, axis=1)

    submission = pd.DataFrame({'prediction': predicted}, index=data['id'])
    submission.to_csv("/kaggle/working/submission.csv")

In [None]:
# Read the competition train and test CSV files.
train, test = load_data()

# Train first: this step writes the tokenizer pickle and the model file that
# test_model() loads back from /kaggle/temp/.
BiLSTM(train)

# Predict on the test frame and write /kaggle/working/submission.csv.
test_model(test)
