## Word Vec Similarity

In [1]:
import utils
import pandas as pd
import numpy as np

from sklearn import metrics

In [2]:
X_train = utils.load('X_train')
X_train_lemma = utils.load('X_train_lemma')

In [3]:
X_train_stack = utils.stack_questions(X_train)

In [4]:

vecs = []
cos_sim = []
counter = 1
for doc in utils.nlp.pipe(X_train_stack[:10], disable=['parser', 'ner']):
    vecs.append(np.array(doc.vector).reshape(1, -1))
    if counter % 2 == 0:
        cos_sim.append(metrics.pairwise.cosine_similarity(vecs[0], vecs[1])[0,0])
        vecs = []
    counter += 1
cos_sim

[0.98668426, 0.9249996, 0.9290682, 0.73074234, 0.846616]

In [6]:
X_train_lemma = [str(x) for x in X_train_lemma]

In [7]:
vecs = []
cos_sim = []
counter = 1
for doc in utils.nlp.pipe(X_train_lemma[:10], disable=['parser', 'ner']):
    vecs.append(np.array(doc.vector).reshape(1, -1))
    if counter % 2 == 0:
        cos_sim.append(metrics.pairwise.cosine_similarity(vecs[0], vecs[1])[0,0])
        vecs = []
    counter += 1
cos_sim

[0.9682379, 0.8167546, 0.8931898, 0.55023444, 0.7400342]

In [8]:
X_train_lemma[0]

'step step guide invest share market india'

In [9]:
from scipy.spatial.distance import pdist

## Generate min/max/avg distance features between words.

Can use the below to find the min/max/average distance between words for each sentence.

In [10]:
def calc_min_max_avg_distance(v, metric):
    dist = pdist(v, metric=metric)
    return [np.min(dist), np.max(dist), np.mean(dist)]

In [11]:
dist = []
for doc in utils.nlp.pipe(X_train_stack[7:8], disable=['parser', 'ner']):
    print([tok for tok in doc])
    vecs = [tok.vector for tok in doc if tok.vector.sum() != 0] # accounts for white space vector of 0
#     print(vecs[4])
    dist.append(calc_min_max_avg_distance(vecs, 'euclidean'))
    dist.append(calc_min_max_avg_distance(vecs, 'cosine'))
    dist.append(calc_min_max_avg_distance(vecs, 'cityblock'))

    
dist

[Find, the, remainder, when, [, math]23^{24}[/math, ], is, divided, by, 24,23, ?]


[[2.4431328502651244, 8.243298300698449, 6.451411427066599],
 [0.0811163002913935, 0.9673326969514745, 0.7217991310274229],
 [33.83734008832835, 110.19580763683189, 86.96829082302811]]

In [13]:
# latex tags should be stripped!!
import re

# replace math tags with blank spaces
math_re = re.compile('\[math.*math\]')
X_train_stack_math = [math_re.sub('', x) for x in X_train_stack]

# remove punctuations
import string
punctuations = re.compile(f'[{re.escape(string.punctuation)}]')

num_re = re.compile('[0-9]')

# X_train_stack_math = [num_re.sub('', x) for x in X_train_stack_math]
X_train_stack_math = [punctuations.sub('',x) for x in X_train_stack_math]
X_train_stack_math = [re.sub(r"""\w*\d\w*""", '', x) for x in X_train_stack_math]

X_train_stack_math[7]

'Find the remainder when  is divided by '

In [None]:
alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [123]:
for x in X_train_stack_math:
    if '[' in x:
        print(x)
#         print(math_re.sub('', x))
        print()

How do I get my website on the first page on Google [Search for specific Keyword] ?

[Late 2015 / Early 2016] Why does the United Kingdom want to leave the European Union? Why do some people there want to quit the EU?

[Before Brexit vote] What are the pros and cons of Britain exiting/staying in the EU?

[SPOILER] How big of a shock was it that Darth Vader was Luke's father when it was first revealed?

[Force Awakens Spoilers]: When Kylo Ren killed his father, why did it take around 25 seconds for Chewbacca and co to react violently?

What is the solution of this:- [{√ (√5+2) +√ (√5-2)} /√ (√5+1)]-√ (3-2√2)?

[Late 2015 / Early 2016] Why does the United Kingdom want to leave the European Union? Why do some people there want to quit the EU?

At the end of Rogue One, how did the rebels feel when they saw [SPOILER]?

[SPOILERS] Why did Mary save Sherlock from the bullet in the first episode of season four of Sherlock?

How do I solve √ [5+2(√6)]?

Game of Thrones Season 5 Episode 8 (Hardh

In [81]:
doc = utils.nlp(X_train_lemma[0], disable=['parser', 'ner'])
len(doc.i)

array([[ 4.79147255e-01, -2.58618927e+00,  1.56436813e+00,
        -5.49367666e-01, -4.39957237e+00,  2.93904471e+00,
        -2.45051503e-01, -1.13136673e+00, -2.08668995e+00,
        -2.78407395e-01,  1.14885020e+00, -2.38947153e+00,
         1.88138735e+00, -3.44085515e-01, -1.37388539e+00,
         6.10929549e-01,  7.64436662e-01,  1.71793485e+00,
         1.87052262e+00,  1.18563247e+00,  2.43010592e+00,
         2.94689178e-01, -1.64125192e+00,  9.06525135e-01,
        -1.02346873e+00, -4.37238598e+00, -2.83000779e+00,
         3.38336992e+00,  1.45144224e+00, -1.69031262e+00,
        -4.59301519e+00, -6.91419363e-01, -1.50986671e-01,
        -5.45370638e-01,  2.07257676e+00, -2.92190456e+00,
         2.66925383e+00, -5.71201384e-01,  2.50380182e+00,
        -2.12484449e-01, -3.41648030e+00, -3.39322615e+00,
         5.64731956e-02, -1.27080798e+00, -1.60847437e+00,
         2.19853687e+00, -1.79510927e+00,  3.86635351e+00,
         1.22364032e+00,  3.21989179e+00,  2.51179123e+0

In [57]:
vecs

[array([ 2.0800e-01,  5.8627e-01, -4.4369e-02,  2.9184e-01,  2.9278e-01,
         3.1826e-02, -2.1332e-01,  4.8224e-02,  2.8272e-01,  2.0181e+00,
        -1.5303e-01,  2.3402e-03,  3.5969e-01,  2.6506e-01, -9.9319e-02,
         1.8374e-01,  2.8495e-02,  2.0117e+00, -1.7409e-01,  1.6781e-01,
        -9.9691e-02, -7.9988e-02, -1.0375e-01,  3.0586e-01, -2.8488e-01,
         3.0294e-01,  2.0665e-01, -7.4481e-01, -4.2651e-01, -4.2368e-01,
        -2.1342e-01,  1.7955e-01,  6.8568e-01, -3.0493e-01,  6.2961e-01,
        -1.7612e-01,  3.4853e-01, -3.9878e-02,  6.6102e-01, -7.4104e-01,
        -1.9306e-01, -1.1007e-01,  4.0026e-02, -2.5655e-01, -4.6562e-01,
        -6.6866e-03, -9.1500e-03, -4.1531e-02, -4.8741e-01, -3.5074e-01,
         9.9260e-02, -1.6402e-03,  1.3336e-01,  1.5092e-01, -2.0644e-01,
        -1.5461e-01,  2.8522e-01,  1.4320e-01,  7.4280e-03, -5.0149e-02,
        -1.7733e-02,  4.4678e-01, -7.8900e-01,  2.0365e-03,  2.6537e-01,
        -6.1249e-02, -4.2717e-02,  2.8436e-01,  1.9

In [41]:
tot = 0
for i in range(1,15):
    tot+=i
tot

105