In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle

In [3]:
path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
path_test = '../input/us-patent-phrase-to-phrase-matching/test.csv'
df = pd.read_csv(path)
df_test = pd.read_csv(path_test)


df = df.drop(columns=['id', 'context'])
test_id = df_test['id']
df_test = df_test.drop(columns=['id', 'context'])
df = shuffle(df)
df = df.reset_index(drop=True)

In [4]:
df_test.shape

(36, 2)

In [5]:
x_data_1 = df['anchor']
x_data_2 = df['target']
score = df['score']

In [6]:
test_combined = df_test['anchor'] + ' ' + df_test['target']
x_combined = x_data_1 + " " + x_data_2
df_tokens = pd.concat([test_combined, x_combined])
df_tokens.shape

(36509,)

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_tokens)

In [8]:
anchor_tokenized = tokenizer.texts_to_sequences(x_data_1)
target_tokenized = tokenizer.texts_to_sequences(x_data_2)

In [9]:
padded_anchor = tf.keras.preprocessing.sequence.pad_sequences(anchor_tokenized, maxlen=7)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(target_tokenized, maxlen=17)

In [10]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
y_score = LE.fit_transform(score)

In [11]:
class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, vocab_size, output_dim, input_dim):
        super(PositionalEmbedding, self).__init__()
        self.word_embedding = layers.Embedding(vocab_size, output_dim=output_dim, input_length=input_dim)
        self.postional_embedding = layers.Embedding(input_dim, output_dim)
        
    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding(inputs)
        embedded_indices = self.postional_embedding(position_indices)
        return embedded_words + embedded_indices

In [12]:
class Transformer(keras.layers.Layer):
    def __init__(self,num_heads, embed_dim, ff_dim, rate=0.1):
        super(Transformer,self).__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
    def call(self, inputs, training):
        out1 = self.att(inputs, inputs)
        out1 = self.dropout1(out1, training=training)
        out1 = self.layernorm1(inputs + out1)
        out2 = self.ffn(out1)
        out2 = self.dropout2(out2, training=training)
        output = self.layernorm2(out1 + out2)
        
        return output

In [13]:
class AutoEncoderModel(keras.Model):
    def __init__(self, vocab_size, num_heads, embed_dim, ff_dim, output_dim, input_dim_1, input_dim_2):
        super(AutoEncoderModel, self).__init__()
        self.embed_layer1 = PositionalEmbedding(vocab_size, output_dim, input_dim_1)
        self.att1 = Transformer(num_heads, embed_dim, ff_dim)
        self.embed_layer2 = PositionalEmbedding(vocab_size, output_dim, input_dim_2)
        self.att2 = Transformer(num_heads, embed_dim, ff_dim)
        # self.drop_out1 = layers.Dropout(rate=0.1)
        # self.drop_out2 = layers.Dropout(rate=0.1)
        self.drop_out_clf = layers.Dropout(rate=0.2)
        self.global_avg1 = layers.GlobalAveragePooling1D()
        self.global_avg2 = layers.GlobalAveragePooling1D()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        self.dense3 = layers.Dense(64, activation='relu')
        self.dense4 = layers.Dense(32)
        self.dense5 = layers.Dense(16)
        self.dense_clf = layers.Dense(5, activation='softmax')
    def call(self, inputs):
        anchor, target = inputs
        out_anchor = self.embed_layer1(anchor)
        out_anchor = self.att1(out_anchor)
        out_anchor = self.global_avg1(out_anchor)
        # out_anchor = self.drop_out1(out_anchor)
        
        out_target = self.embed_layer2(target)
        out_target = self.att2(out_target)
        out_target = self.global_avg2(out_target)
        # out_target = self.drop_out2(out_target)
        
        output = layers.Concatenate(axis=1)([out_anchor, out_target])
        output = self.dense1(output)
        output = self.dense2(output)
        output = self.dense3(output)
        output = self.dense4(output)
        output = self.dense5(output)
        output = self.drop_out_clf(output)
        output = self.dense_clf(output)
        return output

In [14]:
vocab_size = len(tokenizer.word_index)
output_dim = 32
input_dim_1 = 7
input_dim_2 = 17
num_heads = 8
embed_dim = 32
ff_dim = 256

In [15]:
model = AutoEncoderModel(vocab_size, num_heads, embed_dim, ff_dim, output_dim, input_dim_1, input_dim_2)

In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
x_anchor = padded_anchor[:33000]
x_target = padded_target[:33000]
anchor_val = padded_anchor[33000:]
target_val = padded_target[33000:]
y_data = y_score[:33000]
y_val = y_score[33000:]

In [18]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
history = model.fit([x_anchor, x_target], y_data, epochs=100, batch_size=128, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
#model.evaluate([anchor_val, target_val], y_val)

In [20]:


model.summary()



Model: "auto_encoder_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 positional_embedding (Posit  multiple                 285984    
 ionalEmbedding)                                                 
                                                                 
 transformer (Transformer)   multiple                  50368     
                                                                 
 positional_embedding_1 (Pos  multiple                 286304    
 itionalEmbedding)                                               
                                                                 
 transformer_1 (Transformer)  multiple                 50368     
                                                                 
 dropout_4 (Dropout)         multiple                  0         
                                                                 
 global_average_pooling1d (G  multiple          

In [21]:
pre = model.predict([anchor_val[:20], target_val[:20]])
predicted = []
for x in pre:
    predicted.append(np.argmax(x))
predicted = LE.inverse_transform(predicted)
predicted



array([0.5 , 0.25, 0.25, 0.5 , 0.5 , 0.25, 0.5 , 0.5 , 0.25, 0.5 , 0.75,
       0.5 , 0.75, 0.25, 0.5 , 0.  , 0.5 , 0.25, 0.  , 0.5 ])

In [22]:
True_values = LE.inverse_transform(y_val[:20])
True_values

array([0.  , 0.75, 0.25, 0.5 , 0.25, 0.75, 0.5 , 0.5 , 0.5 , 0.5 , 1.  ,
       0.5 , 0.25, 0.25, 0.  , 1.  , 0.  , 0.25, 0.  , 0.75])

In [23]:
anchor_test= tokenizer.texts_to_sequences(df_test['anchor'])
target_test = tokenizer.texts_to_sequences(df_test['target'])

In [24]:
padded_anchor_test = tf.keras.preprocessing.sequence.pad_sequences(anchor_test, maxlen=7)
padded_target_test = tf.keras.preprocessing.sequence.pad_sequences(target_test, maxlen=17)

In [25]:
test_predicted = model.predict([padded_anchor_test[:], padded_target_test[:]])



In [26]:
predicted_arr = []
for x in test_predicted:
    predicted_arr.append(np.argmax(x))

In [27]:
predicted_arr = LE.inverse_transform(predicted_arr)

In [28]:
test_id_1 = np.array(test_id)
predicted_arr_1 = np.array(predicted_arr)
print(test_id_1.shape, predicted_arr_1.shape)

(36,) (36,)


In [29]:
Submission = pd.DataFrame({'id': test_id_1, 'score': predicted_arr_1})

In [30]:
# import os
# os.makedirs('Submissions')
filename = 'submission.csv'
Submission.to_csv(filename, index=False)