In [8]:
from gensim.models import Word2Vec

In [9]:
import numpy as np
import re

In [10]:
from keras.models import Model, load_model, save_model, Sequential
from keras.layers import LSTM, Dense, Input, Dropout, BatchNormalization, concatenate
from sklearn.preprocessing import LabelBinarizer

In [11]:
# Load a previously saved gensim Word2Vec model from disk. It is bound as the
# default `embeddings` argument of preprocess_description below, so it must be
# loaded before that function is defined.
w2v = Word2Vec.load("ars_w2vnew1.bin")

In [12]:
def text_cleanup(desc, autocomplete_flg=False):
    """Normalise a free-text description into lowercase, space-separated tokens.

    Steps, in order:
      1. ``str(desc).lower()``.
      2. Remove everything from a URL (``http://`` / ``https://``, optionally
         preceded by characters from ``href:``) up to the end of that line.
      3. Replace every character other than ``a-z``, ``0-9``, ``-`` and space
         with a space.
      4. Insert a space between a run of digits and a following non-digit
         (``12oz`` -> ``12 oz``).
      5. Collapse runs of whitespace into a single space.

    Args:
        desc: Value to clean. It is passed through ``str()``, so non-string
            input is cleaned as its string form (``None`` -> ``"none"``).
        autocomplete_flg: When true, try to complete a truncated final word
            through the ``autocomplete`` object, which is not defined in this
            function (presumably provided elsewhere -- confirm). Any failure
            in that step is ignored and the text is returned uncompleted.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    urls = r"[href:]*http[s]*://.+"
    s = str(desc).lower()
    s = re.sub(urls, '', s)
    # Raw strings: "\-", "\d" and "\s" are invalid escapes in plain literals.
    s = re.sub(r"[^a-z0-9\- ]", " ", s)
    s = re.sub(r"(\d+)(\D)", r"\1 \2", s)
    s = re.sub(r"\s+", " ", s)
    if autocomplete_flg:
        # NOTE(review): every "." was already replaced by a space above, so
        # `s.endswith("..")` cannot be true here and `autocomplete.predict` is
        # never reached. Left as-is to preserve existing output; if completion
        # is wanted, the ".." check must run before punctuation is removed.
        try:
            f = s.split(" ")
            f[-1] = autocomplete.predict(f[-2],f[-1].replace(".",""),1)[0][0] if s.endswith("..")  else f[-1].strip(".")
            s = " ".join(f)
        except Exception:
            # Best-effort step: keep the un-completed text on any failure, but
            # let KeyboardInterrupt / SystemExit propagate (a bare `except:`
            # would swallow them).
            pass
    s = s.strip(".")
    return s

def preprocess_description(description, embeddings=w2v, max_len=40, dim=50):
    """Convert a description into a fixed-size ``(max_len, dim)`` matrix of word vectors.

    The text is cleaned with ``text_cleanup`` and split on single spaces. Each
    token found in the embedding vocabulary contributes its vector; unknown
    tokens and padding positions contribute an all-zero row.

    Args:
        description: Raw description; anything ``text_cleanup`` accepts.
        embeddings: A gensim ``Word2Vec`` model (vectors are read from its
            ``.wv`` attribute) or a bare keyed-vectors object. Defaults to the
            module-level ``w2v``.
        max_len: Number of rows in the result. Defaults to 40.
        dim: Width of each row; must equal the embedding vector size.
            Defaults to 50.

    Returns:
        ``numpy.ndarray`` of shape ``(max_len, dim)``.
    """
    # Look tokens up through the keyed-vectors object for both the membership
    # test and the fetch. `model[word]` and `wv.vocab` are deprecated gensim 3
    # spellings that gensim 4 removed; `in` / `[]` on the keyed vectors work
    # on both major versions.
    vectors = getattr(embeddings, "wv", embeddings)

    # The slice is applied before empty tokens are dropped (as before), so a
    # cleaned text that starts with a space spends one of the `max_len` slots
    # on the empty leading token.
    cleaned = text_cleanup(description, autocomplete_flg=False)
    tokens = [tok for tok in cleaned.split(" ")[:max_len] if tok != '']

    zero_row = np.zeros((dim,))
    rows = [vectors[tok] if tok in vectors else zero_row for tok in tokens]
    # Pad with zero rows directly instead of appending integer 0 "tokens" and
    # relying on them being absent from the vocabulary.
    rows.extend([zero_row] * (max_len - len(tokens)))
    return np.array(rows).reshape(max_len, dim)


In [13]:
import pandas as pd
import dask.dataframe as dd

In [46]:
# Read the first 500000 rows of the '^'-separated pair file into a DataFrame.
siam_data = pd.read_csv(
    "final_out.csv",
    sep='^',
    nrows=500000,
    low_memory=True,
)

In [47]:
# Coerce each similarity score to a Python float and round it to one decimal
# place using the built-in round(). Missing values pass through as NaN.
siam_data['SIM_SCORE'] = siam_data['SIM_SCORE'].map(
    lambda score: round(float(score), 1)
)

In [48]:
# Treat a missing similarity score as 0 so that the later `apply(round)` step
# (built-in round() raises on NaN) and model training see only numeric values.
siam_data['SIM_SCORE'] = siam_data['SIM_SCORE'].fillna(0)

In [49]:
# Load the previously saved Keras model from its HDF5 file. Per the summary
# printed further down, it takes two (None, 40, 50) inputs that feed one shared layer.
sia_ars = load_model('ARS_Siamese_model_new.hdf5')

In [50]:
# Reset the model's layer states before further training.
# NOTE(review): in Keras this presumably only matters for stateful recurrent
# layers -- confirm the loaded model actually has any.
sia_ars.reset_states()

In [51]:
# (Re)compile the loaded model with RMSprop and a mean-squared-logarithmic-error loss.
# NOTE(review): 'accuracy' is a classification metric; paired with an MSLE
# loss its value may not be meaningful -- confirm this is the intended metric.
# NOTE(review): recompiling presumably discards any optimizer state restored
# by load_model -- confirm that is wanted for this fine-tuning run.
sia_ars.compile(optimizer='RMSprop',
              loss='mean_squared_logarithmic_error',
              metrics=['accuracy'])

In [52]:
# Print the layer-by-layer architecture (output shapes, parameter counts, connections).
sia_ars.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequential_2_input (InputLayer) (None, 40, 50)       0                                            
__________________________________________________________________________________________________
sequential_3_input (InputLayer) (None, 40, 50)       0                                            
__________________________________________________________________________________________________
head2_head1_head2_head1_head2_h (None, 200)          200800      sequential_2_input[0][0]         
                                                                 sequential_3_input[0][0]         
__________________________________________________________________________________________________
head2_head1_head2_head1_head2_h (None, 158)          31758       head2_head1_head2_head1_head2_hea
          

In [53]:
# Build the (n_rows, 40, 50) input tensor for the first description column.
# Each row is written straight into a pre-allocated float32 array rather than
# collected in a Python list and copied by np.array(): at 500000 rows the
# list of float64 (40, 50) matrices is up to ~8 GB, plus a second copy of the
# same size for the final array. The (40, 50) row shape is what
# preprocess_description returns by default.
# NOTE(review): float32 assumes the default Keras float type -- confirm.
features1 = np.empty((len(siam_data), 40, 50), dtype=np.float32)
for row, description in enumerate(siam_data.ITM_DESC1.values):
    features1[row] = preprocess_description(description)



In [54]:
# Build the (n_rows, 40, 50) input tensor for the second description column.
# Each row is written straight into a pre-allocated float32 array rather than
# collected in a Python list and copied by np.array(): at 500000 rows the
# list of float64 (40, 50) matrices is up to ~8 GB, plus a second copy of the
# same size for the final array. The (40, 50) row shape is what
# preprocess_description returns by default.
# NOTE(review): float32 assumes the default Keras float type -- confirm.
features2 = np.empty((len(siam_data), 40, 50), dtype=np.float32)
for row, description in enumerate(siam_data.ITM_DESC2.values):
    features2[row] = preprocess_description(description)



In [55]:
# Training target: each similarity score rounded to the nearest integer with
# the built-in round() (ties go to the even integer, so 0.5 -> 0).
bin_otpt = siam_data['SIM_SCORE'].map(round)

In [57]:
# Train for one epoch on the two description tensors against the rounded
# similarity labels, in batches of 500, holding out 10% of the rows for
# validation (450000 train / 50000 validation in the recorded run).
sia_ars.fit([features1,features2], bin_otpt.values, epochs = 1, shuffle=True, batch_size=500, validation_split=0.1)

Train on 450000 samples, validate on 50000 samples
Epoch 1/1

KeyboardInterrupt: 

In [63]:
# Save the model under a new file name, leaving the checkpoint it was loaded from untouched.
# NOTE(review): the recorded fit run ended in KeyboardInterrupt, and this cell's
# execution count (63) is later than the prediction cell's (62), so the saved
# weights presumably come from a partially completed epoch -- confirm before reuse.
sia_ars.save("ARS_Siamese_model_new_1apr.hdf5")

In [62]:
# Smoke test: score one pair of near-identical descriptions. Each (40, 50)
# matrix gets a leading batch axis of size 1 before being passed to the model.
a = preprocess_description('Cheese - Natural Milk fixed weight')[np.newaxis, :, :]
b = preprocess_description('Cheese - Natural Mlk fixed')[np.newaxis, :, :]
similarity = sia_ars.predict([a, b])
print(similarity)

[[0.6875293]]


