In [2]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from gensim.models import KeyedVectors
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

# Load the Colab Drive helper and mount Google Drive at /content/drive so that
# files stored there can be opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on (stop-word list,
# WordNet and the punkt tokenizer data), in the same order as before.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Single shared lemmatizer instance, used by wordmoverdistance().
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
import pickle

# Load the pre-trained classifier from the working directory. The context
# manager guarantees the file handle is closed even if unpickling fails
# (the previous bare open() left it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('ANN.model', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [6]:
# Print the layer-by-layer architecture and parameter counts of the loaded classifier.
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_9 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_10 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_11 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_12 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_13 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 9         
Total para

In [0]:
def wordmoverdistance(s1, s2):
    """Return the Word Mover's Distance between two sentences.

    Each input is converted to ``str``, lower-cased and split on whitespace;
    English stop words are dropped and the remaining tokens are lemmatized
    before being passed to ``model.wmdistance``.

    Relies on the module-level ``model`` (word-embedding model),
    ``lemmatizer`` and ``stopwords`` objects, which must be defined before
    this function is called.

    Args:
        s1: First sentence (any object; ``str()`` is applied).
        s2: Second sentence (any object; ``str()`` is applied).

    Returns:
        Whatever ``model.wmdistance`` returns for the two token lists.
    """
    # Build the stop-word collection once per call: the original re-evaluated
    # stopwords.words('english') for every single token. A set also makes the
    # membership test constant-time.
    stop_words = set(stopwords.words('english'))
    tokens1 = [lemmatizer.lemmatize(w) for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [lemmatizer.lemmatize(w) for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(tokens1, tokens2)

In [0]:
def sent2vec(s):
    """Return an L2-normalised sentence vector built from word embeddings.

    The input is converted to ``str``, lower-cased and tokenized with
    ``word_tokenize``. English stop words and non-alphabetic tokens are
    discarded; the embeddings of the remaining tokens found in the
    module-level ``model`` are summed and the sum is divided by its
    Euclidean norm.

    Args:
        s: Sentence to embed (any object; ``str()`` is applied).

    Returns:
        The normalised sum of the word vectors. If no token has an embedding
        the sum is the scalar 0 and the division yields ``nan`` (NumPy emits
        a RuntimeWarning); callers should be prepared for that case.
    """
    # Evaluate the stop-word list once instead of once per token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary word: skip it. Only KeyError is caught so that
            # unrelated failures (and KeyboardInterrupt) are no longer hidden,
            # as they were by the previous bare ``except``.
            continue

    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [0]:
# Load the fitted feature scaler. Using a context manager ensures the file
# handle is closed promptly instead of being leaked by a bare open().
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('XScaler', 'rb') as scaler_file:
    X_Scaler = pickle.load(scaler_file)

In [0]:
# Load the pre-trained GoogleNews word2vec embeddings (binary format) from the
# mounted Drive folder. `model` is the global that wordmoverdistance() and
# sent2vec() read, so this cell must run before either function is called.
model = KeyedVectors.load_word2vec_format('drive/My Drive/semantic-question-matching/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [0]:
# Alternative example pair, kept for reference (uncomment to try it):
#question1 = 'What practical applications might evolve from the discovery of the Higgs Boson ?'
#question2 = 'What are some practical benefits of discovery of the Higgs Boson ?'

# The two questions whose similarity features are computed and scored below.
question1 = 'How can I start an online shopping (e-commerce) website ?'
question2 = 'Which web technology is best suitable for building a big E-Commerce website ?'



In [0]:
# Text form of each question, computed once and shared by the features below.
q1_text = str(question1)
q2_text = str(question2)

# Character-length difference and number of distinct lower-cased
# whitespace-separated tokens the two questions have in common.
diff_len = len(q1_text) - len(q2_text)
q1_words = set(q1_text.lower().split())
q2_words = set(q2_text.lower().split())
common_words = len(q1_words & q2_words)

# fuzzywuzzy string-similarity scores.
fuzz_qratio = fuzz.QRatio(q1_text, q2_text)
fuzz_WRatio = fuzz.WRatio(q1_text, q2_text)
fuzz_partial_ratio = fuzz.partial_ratio(q1_text, q2_text)
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(q1_text, q2_text)
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(q1_text, q2_text)
fuzz_token_set_ratio = fuzz.token_set_ratio(q1_text, q2_text)
fuzz_token_sort_ratio = fuzz.token_sort_ratio(q1_text, q2_text)

# Word Mover's Distance on the stop-word-filtered, lemmatized tokens.
wmd = wordmoverdistance(question1, question2)

In [0]:
# Normalised sentence embeddings for the two questions.
question1_vectors = sent2vec(question1)
question2_vectors = sent2vec(question2)

# Bind the pair once and unpack it into each SciPy distance function.
embedding_pair = (question1_vectors, question2_vectors)
cosine_distance = cosine(*embedding_pair)
cityblock_distance = cityblock(*embedding_pair)
canberra_distance = canberra(*embedding_pair)
euclidean_distance = euclidean(*embedding_pair)
minkowski_distance = minkowski(*embedding_pair, 3)  # Minkowski distance of order 3
braycurtis_distance = braycurtis(*embedding_pair)

In [23]:
# Collect the 16 features into one flat vector.
# NOTE(review): the order presumably has to match the column order used when
# X_Scaler and the classifier were fitted -- confirm before reordering.
feature_values = [
    diff_len,
    common_words,
    fuzz_qratio,
    fuzz_WRatio,
    fuzz_partial_ratio,
    fuzz_partial_token_set_ratio,
    fuzz_partial_token_sort_ratio,
    fuzz_token_set_ratio,
    fuzz_token_sort_ratio,
    wmd,
    cosine_distance,
    cityblock_distance,
    canberra_distance,
    euclidean_distance,
    minkowski_distance,
    braycurtis_distance,
]
X = np.array(feature_values)
print(X)

[-20.           2.          60.          60.          58.
 100.          60.          55.          53.           2.59372402
   0.38819116  12.29121685 156.39157534   0.88112557   0.39143173
   0.49061381]


In [0]:
# Apply the previously loaded scaler to the feature vector; reshape(1, -1)
# turns the 1-D array into a single-row 2-D array before transforming.
# NOTE(review): this overwrites X, so the cell is not idempotent -- running it
# a second time without rebuilding X would transform already-scaled values.
X = X_Scaler.transform(X.reshape(1,-1))

In [18]:
# Display the scaled feature row.
X

array([[ 0.44513407,  1.09804822,  0.44057859,  0.16956127,  0.00825133,
         0.20255278,  0.18230737,  0.54504029,  0.47123678, -0.31154066,
        -0.542105  , -0.30564021, -0.0391188 , -0.33920903, -0.34273853,
        -0.35185591]])

In [27]:
# Score the scaled features with the classifier, then show the raw output and
# whether it exceeds the 0.5 cut-off.
# NOTE(review): True presumably means the questions are judged to match -- confirm
# against the label convention used at training time.
y_pred = classifier.predict(X)
print(y_pred)
print(y_pred > 0.5)

[[0.27985862]]
[[False]]
