In [1]:
#!nvidia-smi

In [1]:
import os
import sys

# Make the local helper directory importable (presumably where the
# elmo_tokenizer / elmo_layer modules imported further down live).
sys.path.append('keras_elmo_bert/')

# Expose only CUDA device index 1 to this process. This is set before
# TensorFlow is imported in the next cell.
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [17]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import itertools
import string
import numpy as np 
import pandas as pd 
import tensorflow as tf
import tensorflow_hub as hub
import re

from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.layers import Layer
from keras.utils import Sequence
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, Flatten, GlobalMaxPool1D, LSTM, Dot, Lambda
from keras.layers import Input, MaxPooling1D, GlobalAveragePooling1D, multiply, concatenate, Reshape
from keras.layers import Bidirectional
from keras.optimizers import Adam
import keras.initializers as initializers
import keras.regularizers as regularizers
import keras.constraints as constraints
from keras.callbacks import EarlyStopping


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.utils import shuffle
import pickle

# Explicit TF1 session. initialize_vars() later runs the variable/table
# initializers in it and registers it as the Keras backend session.
sess = tf.Session()

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
# Sequence length handed to the tokenizer and used as the model's input width.
max_seq_length = 64
# NOTE(review): bert_path is not referenced anywhere in the visible notebook.
bert_path = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"
# TF-Hub handle passed to ElmoLayer (tf_hub=...) in build_model_elmo_siamese.
elmo_path = "https://tfhub.dev/google/elmo/2"

# Create train test split

Чтобы оценить обобщающую способность модели, необходимо валидироваться на тех объектах, которые не участвовали в обучении. Соответственно, мы выбираем некоторое количество индексов вопросов для обучения и следим, чтобы они не встречались в валидации ни в одном из полей qid1 или qid2.

In [4]:
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
# all_ques_df: one row per question (its 'qid', 'questions' and 'new_qid' columns are used below).
all_ques_df = pd.read_pickle('data/all_ques_df.pickle')
# df: question pairs (columns used below: question1, question2, is_duplicate,
# qid1_new, qid2_new).
df = pd.read_pickle('data/df.pickle')

In [5]:
# Distinct question ids (sorted, as np.unique returns them); the split below is
# done over ids rather than over pairs.
qid_arr = np.unique(all_ques_df['new_qid'].values)

In [6]:
# 70/30 split of question ids with a fixed seed.
# NOTE(review): val_ind is not used afterwards; the held-out pairs are defined
# purely as "no id in train_ind".
train_ind, val_ind =  train_test_split(qid_arr, test_size=0.3, random_state=42)

In [7]:
# Training pairs: at least one of the two questions has a training id.
train_df = df.loc[df['qid1_new'].isin(train_ind) | df['qid2_new'].isin(train_ind)]

In [8]:
# Held-out pairs: neither question has a training id (the exact complement of
# the "qid1 or qid2 in train_ind" mask).
test_df = df.loc[~(df['qid1_new'].isin(train_ind) | df['qid2_new'].isin(train_ind))]

In [9]:
# Fraction of all pairs in each part. The two selection masks are complementary,
# so the fractions add up to 1.
print(f'Train size is {len(train_df)/len(df)}')
print(f'Test size is {len(test_df)/len(df)}')

Train size is 0.9112964518580743
Test size is 0.08870354814192567


# ELMO pipeline

## Elmo Tokenizer

In [10]:
from elmo_tokenizer import ELMO_tokenizer

In [11]:
# Project-local tokenizer, constructed with the same max_seq_length that is
# used as the model's input width.
tokenizer = ELMO_tokenizer(max_seq_length)

In [12]:
# Each *_text is a two-element list [question1 tokens, question2 tokens], in the
# same order as the two inputs of the siamese model.
train_text = [
    tokenizer.predict(train_df[column].tolist())
    for column in ('question1', 'question2')
]
train_label = train_df['is_duplicate'].values

test_text = [
    tokenizer.predict(test_df[column].tolist())
    for column in ('question1', 'question2')
]
test_label = test_df['is_duplicate'].values

HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=91126, style=ProgressStyl…




HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=91126, style=ProgressStyl…




HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=8870, style=ProgressStyle…




HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=8870, style=ProgressStyle…




## Elmo Model

In [13]:
from elmo_layer import ElmoLayer

In [14]:
def RocAuc(y_true, y_pred):
    """Keras-compatible metric wrapping sklearn's ``roc_auc_score``.

    The score is computed in Python through ``tf.py_func`` on whatever tensors
    Keras hands to the metric, and comes back as a float64 tensor.

    NOTE(review): Keras presumably evaluates metrics per batch and averages
    them, so the reported value would be a mean of per-batch AUCs rather than
    the AUC over the whole dataset. ``roc_auc_score`` also raises if a batch
    contains a single class - confirm this cannot happen with the batch sizes
    in use.
    """
    auc_tensor = tf.py_func(roc_auc_score, [y_true, y_pred], tf.float64)
    return auc_tensor

In [18]:
def build_model_elmo_siamese(max_seq_length):
    """Build and compile the siamese ELMo duplicate-question classifier.

    Architecture:
      * two string inputs of shape ``(max_seq_length,)``;
      * one shared encoder (ElmoLayer loaded from the module-level
        ``elmo_path`` followed by a 256-unit sigmoid Dense) applied to both
        inputs, so both branches use the same weights;
      * element-wise absolute difference of the two 256-d encodings;
      * a single sigmoid unit producing the prediction.

    The model is compiled with binary cross-entropy, Adam, and the metrics
    ``accuracy`` and ``RocAuc``; its summary is printed as a side effect.

    Args:
        max_seq_length: number of tokens per input sequence.

    Returns:
        The compiled Keras ``Model`` taking ``[tokens_1, tokens_2]``.
    """

    def make_shared_encoder():
        # ELMo embedding + projection head, wrapped as a Model so that it can
        # be applied to both branches with shared weights.
        tokens = Input(shape=(max_seq_length,), dtype="string")
        embedded = ElmoLayer(
            trainable=True,
            tf_hub=elmo_path,
            output_representation='default',
        )(tokens)
        projected = Dense(256, activation='sigmoid')(embedded)
        return Model(inputs=tokens, outputs=projected)

    # Layers are created in the same order as before (two outer inputs first,
    # then the encoder) so auto-generated layer names and ordering are unchanged.
    left_tokens = Input(shape=(max_seq_length,), dtype="string")
    right_tokens = Input(shape=(max_seq_length,), dtype="string")

    encoder = make_shared_encoder()
    left_encoding = encoder(left_tokens)
    right_encoding = encoder(right_tokens)

    # |a - b| per dimension.
    abs_difference = Lambda(lambda pair: K.abs(pair[0] - pair[1]))(
        [left_encoding, right_encoding]
    )

    probability = Dense(1, activation='sigmoid')(abs_difference)

    siamese = Model(inputs=[left_tokens, right_tokens], outputs=probability)
    siamese.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', RocAuc],
    )
    siamese.summary()

    return siamese

def initialize_vars(sess):
    """Run the TF initializers in ``sess`` and hand the session to Keras.

    Runs, in order, the local-variable, global-variable and table
    initializers, then registers ``sess`` as the Keras backend session.

    Args:
        sess: the ``tf.Session`` to initialise and register.
    """
    initializer_ops = (
        tf.local_variables_initializer(),
        tf.global_variables_initializer(),
        tf.tables_initializer(),
    )
    for init_op in initializer_ops:
        sess.run(init_op)
    K.set_session(sess)

In [19]:
# Build the graph first, then initialise: the initializer ops created inside
# initialize_vars() only cover variables/tables that already exist.
elmo_model = build_model_elmo_siamese(max_seq_length)
initialize_vars(sess)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 64)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 64)           0                                            
__________________________________________________________________________________________________
model_3 (Model)                 (None, 256)          93863252    input_4[0][0]                    
                                                                 input_5[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 256)          0           model_3[1][0]                    
          

In [20]:
elmo_model.trainable_weights

[<tf.Variable 'elmo_layer_2_module/aggregation/weights:0' shape=(3,) dtype=float32>,
 <tf.Variable 'elmo_layer_2_module/aggregation/scaling:0' shape=() dtype=float32>,
 <tf.Variable 'dense_3/kernel:0' shape=(1024, 256) dtype=float32_ref>,
 <tf.Variable 'dense_3/bias:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'dense_4/kernel:0' shape=(256, 1) dtype=float32_ref>,
 <tf.Variable 'dense_4/bias:0' shape=(1,) dtype=float32_ref>]

In [21]:
# Stop when the monitored 'val_acc' has not improved for 2 epochs and roll back
# to the best weights seen. The held-out set passed as validation_data to fit()
# therefore also drives model selection.
callbacks = [
    EarlyStopping(monitor = 'val_acc', restore_best_weights = True, patience = 2)
]

In [None]:
# Train the siamese model; the held-out pairs are used as validation data and
# drive early stopping through `callbacks`.
elmo_model.fit(
    train_text,
    train_label,
    validation_data=(test_text, test_label),
    epochs=5,
    verbose = 1,
    batch_size=32,
    callbacks=callbacks
)

# Persist the trained weights: the "Load ELMO model" section reads
# 'models/ELMOModel.h5', which no other cell in this notebook produces.
# NOTE: this overwrites any existing file at that path.
os.makedirs('models', exist_ok=True)
elmo_model.save_weights('models/ELMOModel.h5')


Train on 91126 samples, validate on 8870 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

## Load ELMO model

In [43]:
# Rebuild the architecture so the saved weights can be loaded into it.
elmo_model = build_model_elmo_siamese(max_seq_length)
# The freshly built model creates a new hub module whose variables and lookup
# tables have not been initialised yet. Initialise them here, as is done after
# the first build, and BEFORE load_weights - running the initializers after
# loading would overwrite the loaded weights.
initialize_vars(sess)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 64)           0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 64)           0                                            
__________________________________________________________________________________________________
model_7 (Model)                 (None, 256)          93863252    input_10[0][0]                   
                                                                 input_11[0][0]                   
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 256)          0           model_7[1][0]                    
          

In [44]:
# Load previously trained weights into the freshly built architecture; the file
# must come from a model built by build_model_elmo_siamese with the same settings.
elmo_model.load_weights('models/ELMOModel.h5')

In [56]:
# Returns [loss, accuracy, RocAuc], following the loss/metrics order given to
# compile(). RocAuc is computed through the custom py_func metric, not by a
# single roc_auc_score call over all predictions.
elmo_model.evaluate(test_text, test_label)



[0.4514144848876768, 0.7816234498443303, 0.8619958389695731]

## Find 100 most similar objects for qid =1

In [57]:
all_ques_df.head()

Unnamed: 0,qid,questions,num_of_words,new_qid
0,1,What is the step by step guide to invest in share market in india?,14,1
1,3,What is the story of Kohinoor (Koh-i-Noor) Diamond?,8,3
2,5,How can I increase the speed of my internet connection while using a VPN?,14,5
3,7,Why am I mentally very lonely? How can I solve it?,11,7
4,9,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",13,9


In [58]:
# Candidate pool for the similarity search: questions with qid below 121424.
# NOTE(review): the reason for this cutoff is not stated - TODO document it or
# move it to a named constant.
search_df = all_ques_df.query('qid < 121424')

In [59]:
# Keep one row per new_qid (drop_duplicates keeps the first occurrence by default).
search_df = search_df.drop_duplicates('new_qid')

In [60]:
# Text of the query question (qid == 1); raises IndexError if it is absent.
qid_1_question = search_df.loc[search_df['qid'] == 1, 'questions'].values[0]

In [61]:
# Keep only the columns needed for the search. Take an explicit copy: later
# cells add columns to search_df, and assigning into a frame derived by
# slicing another one triggers pandas' SettingWithCopyWarning.
search_df = search_df[['questions','qid']].copy()

In [62]:
search_df.head()

Unnamed: 0,questions,qid
0,What is the step by step guide to invest in share market in india?,1
1,What is the story of Kohinoor (Koh-i-Noor) Diamond?,3
2,How can I increase the speed of my internet connection while using a VPN?,5
3,Why am I mentally very lonely? How can I solve it?,7
4,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",9


In [63]:
# Broadcast the query question to every row so each candidate forms a
# (query, candidate) pair for the siamese model.
search_df['qid_1_question'] = qid_1_question

In [64]:
search_df.head()

Unnamed: 0,questions,qid,qid_1_question
0,What is the step by step guide to invest in share market in india?,1,What is the step by step guide to invest in share market in india?
1,What is the story of Kohinoor (Koh-i-Noor) Diamond?,3,What is the step by step guide to invest in share market in india?
2,How can I increase the speed of my internet connection while using a VPN?,5,What is the step by step guide to invest in share market in india?
3,Why am I mentally very lonely? How can I solve it?,7,What is the step by step guide to invest in share market in india?
4,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",9,What is the step by step guide to invest in share market in india?


In [65]:
# Model inputs in [first question, second question] order: the repeated query
# question first, the candidate questions second.
# NOTE(review): the identical query text is tokenised once per candidate row;
# tokenising it once and repeating the result would likely be cheaper, depending
# on what tokenizer.predict returns - confirm before changing.
search_text = [tokenizer.predict(search_df['qid_1_question'].tolist()), tokenizer.predict(search_df['questions'].tolist())]

HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=106103, style=ProgressSty…




HBox(children=(IntProgress(value=0, description='Converting examples to tokens', max=106103, style=ProgressSty…




In [66]:
# One sigmoid score per (query, candidate) pair, in search_df row order.
prediction = elmo_model.predict(search_text, verbose=1, batch_size=128)



In [67]:
# Positional indices of the 100 highest scores: argsort ascending along the
# sample axis, reverse to descending, take the first 100 rows of column 0.
# NOTE: the query question itself (qid == 1) is part of the candidate pool.
top100_ind = prediction.argsort(axis=0)[::-1][:100,0]

In [68]:
# Attach the scores; prediction rows are in search_df row order because
# search_text was built from search_df's columns.
search_df['prediction'] = prediction.flatten()

In [69]:
pd.set_option('max_colwidth', 200)
# Make sure the output directory exists so the export does not fail after the
# long prediction step.
os.makedirs('results', exist_ok=True)
# top100_ind holds positional indices, hence iloc.
search_df.iloc[top100_ind].to_csv('results/top100_prediction_elmo.csv', index = False)

In [71]:
# Full candidate table with scores (unsorted, in search_df row order).
search_df.to_csv('results/all_prediction_elmo.csv', index = False)