## Word Vector Similarity

In [1]:
import utils
import pandas as pd
import numpy as np

from sklearn import metrics

In [2]:
# Question-pair table (id, question1, question2 columns).
X_train = utils.load('X_train')
# Lemma version of the questions. Later cells read X_train_lemma, so it must
# be loaded here for the notebook to run top to bottom on a fresh kernel.
X_train_lemma = utils.load('X_train_lemma')

In [3]:
# Flatten the pair table into one sequence of questions. The pairwise loops
# further down treat consecutive items as a pair, so this presumably
# interleaves question1/question2 -- TODO confirm against utils.stack_questions.
X_train_stack = utils.stack_questions(X_train)

In [4]:

# Embed the first ten stacked questions (five pairs) as (1, dim) document
# vectors, then score each consecutive pair by cosine similarity.
doc_vectors = [
    np.array(doc.vector).reshape(1, -1)
    for doc in utils.nlp.pipe(X_train_stack[:10], disable=['parser', 'ner'])
]
cos_sim = [
    metrics.pairwise.cosine_similarity(first, second)[0, 0]
    for first, second in zip(doc_vectors[0::2], doc_vectors[1::2])
]
cos_sim

[0.98668426, 0.9249996, 0.9290682, 0.73074234, 0.846616]

In [6]:
# Coerce every entry to str so the spaCy pipeline only ever receives text.
X_train_lemma = list(map(str, X_train_lemma))

In [7]:
# Same pairwise cosine similarity as for the raw text, this time on the first
# ten lemma questions (five pairs).
lemma_docs = utils.nlp.pipe(X_train_lemma[:10], disable=['parser', 'ner'])
lemma_vectors = [np.array(doc.vector).reshape(1, -1) for doc in lemma_docs]
cos_sim = [
    metrics.pairwise.cosine_similarity(lemma_vectors[i], lemma_vectors[i + 1])[0, 0]
    for i in range(0, len(lemma_vectors) - 1, 2)
]
cos_sim

[0.9682379, 0.8167546, 0.8931898, 0.55023444, 0.7400342]

In [8]:
# Peek at the first entry to see what the lemma text looks like.
X_train_lemma[0]

'step step guide invest share market india'

In [9]:
from scipy.spatial.distance import pdist

## Generate min/max/avg distance features between words.

The functions below compute the minimum, maximum, and average pairwise distance between the word vectors of each question.

In [15]:
def calc_min_max_avg_distance(v, metric):
    ''' Calculates the min / max / avg pairwise distance of vectors.

    v: array-like (n_vectors, n_dims)
    Array of vectors.

    metric: string
    Any valid string metric for scipy.spatial.distance.pdist

    returns: [min, max, avg] list of float
    Summary of the distances between every pair of rows in v. When v holds
    fewer than two vectors there are no pairs to measure, so
    [0.0, 0.0, 0.0] is returned.

    '''
    # pdist rejects non-2-D input and yields an empty result for a single row;
    # np.min / np.max raise on an empty array, so bail out before calling it.
    if len(v) < 2:
        return [0.0, 0.0, 0.0]

    dist = pdist(v, metric=metric)
    return [np.min(dist), np.max(dist), np.mean(dist)]

In [14]:
def add_min_max_avg_distance_features(X):
    ''' Engineers min/max/avg word vector distance features for each question.

    X: array
    Array of questions

    return: list (n_questions, 9)
    One row per question holding the min, max, and avg pairwise word vector
    distance for the euclidean, cosine, and cityblock metrics, in that order.
    A question with fewer than two usable word vectors has no pairs to
    measure and gets a row of zeros.
    '''
    distance_metrics = ('euclidean', 'cosine', 'cityblock')
    dist = []
    for doc in utils.nlp.pipe(X, disable=['parser', 'ner']):
        # Drop all-zero vectors (e.g. whitespace tokens). any() is used rather
        # than sum() != 0 because a non-zero vector's components can sum to 0.
        vecs = [tok.vector for tok in doc if tok.vector.any()]

        # Build ONE row per question so output rows stay aligned with X.
        row = []
        for metric in distance_metrics:
            if len(vecs) < 2:
                row.extend([0.0, 0.0, 0.0])
            else:
                row.extend(calc_min_max_avg_distance(vecs, metric))
        dist.append(row)

    return dist

In [13]:
# latex tags should be stripped!!
import re
import string

# Remove [math ... math] spans entirely. The match is non-greedy so that a
# question with several math spans loses only the spans themselves, not the
# ordinary text sitting between them.
math_re = re.compile(r'\[math.*?math\]')

# Any single punctuation character.
punctuations = re.compile(f'[{re.escape(string.punctuation)}]')

# A single digit. Not applied below; kept available for experimentation.
num_re = re.compile('[0-9]')

# A whole word that contains at least one digit.
digit_word_re = re.compile(r'\w*\d\w*')

# Order matters: math spans are removed first, while their square brackets
# are still present; punctuation goes next, then digit-bearing words.
X_train_stack_math = [
    digit_word_re.sub('', punctuations.sub('', math_re.sub('', x)))
    for x in X_train_stack
]

X_train_stack_math[7]

'Find the remainder when  is divided by '

In [None]:
# Compiled once instead of on every call.
_WORD_WITH_DIGIT_RE = re.compile(r"""\w*\d\w*""")
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    '''Replace every word that contains at least one digit with a single space.

    x: string

    returns: string
    '''
    return _WORD_WITH_DIGIT_RE.sub(' ', x)


def punc_lower(x):
    '''Lower-case x and replace each punctuation character with a space.

    x: string

    returns: string
    '''
    return _PUNCTUATION_RE.sub(' ', x.lower())

In [123]:
# Print every question that still contains an opening square bracket, each
# followed by a blank line.
for question in (q for q in X_train_stack_math if '[' in q):
    print(question, end='\n\n')

How do I get my website on the first page on Google [Search for specific Keyword] ?

[Late 2015 / Early 2016] Why does the United Kingdom want to leave the European Union? Why do some people there want to quit the EU?

[Before Brexit vote] What are the pros and cons of Britain exiting/staying in the EU?

[SPOILER] How big of a shock was it that Darth Vader was Luke's father when it was first revealed?

[Force Awakens Spoilers]: When Kylo Ren killed his father, why did it take around 25 seconds for Chewbacca and co to react violently?

What is the solution of this:- [{√ (√5+2) +√ (√5-2)} /√ (√5+1)]-√ (3-2√2)?

[Late 2015 / Early 2016] Why does the United Kingdom want to leave the European Union? Why do some people there want to quit the EU?

At the end of Rogue One, how did the rebels feel when they saw [SPOILER]?

[SPOILERS] Why did Mary save Sherlock from the bullet in the first episode of season four of Sherlock?

How do I solve √ [5+2(√6)]?

Game of Thrones Season 5 Episode 8 (Hardh

## n-gram similarity between two questions

In [18]:
from nltk import ngrams

In [95]:
from spacy.attrs import LOWER

def calc_ngram_similarity(X, n_grams):
    ''' Calculates the ngram similarity between a pair of questions. Similarity is defined as,
            2 · ( |S1| / |S1 ∩ S2| + |S2| / |S1 ∩ S2|)^−1
        where S_i is the set of ngrams for question i. This is algebraically
        2·|S1 ∩ S2| / (|S1| + |S2|); it is defined as 0 when the two
        questions share no ngram.

        N-grams are built over the LOWER attribute of each token and are
        right-padded, so grams at the end of a question (and grams of a
        question shorter than n) contain None padding.

        X: array-like (n_pairs*2,)
        Array of questions with pairs in sequential order. A trailing
        unpaired question is ignored.

        n_grams: list
        List of n-grams to calculate, i.e. [1, 2, 3]

        return: list (n_pairs, len(n_grams))
        One row per pair holding the similarity for each requested n.

    '''
    # Pulling from the same iterator twice per step yields (q1, q2) pairs
    # without a counter or state carried across iterations.
    docs = iter(utils.nlp.pipe(X, disable=['parser', 'ner'], batch_size=10000))

    ngram_sim = []
    for doc_q1, doc_q2 in zip(docs, docs):
        tokens_q1 = doc_q1.to_array([LOWER])
        tokens_q2 = doc_q2.to_array([LOWER])

        pair_sim = []
        for n in n_grams:
            grams_q1 = set(ngrams(tokens_q1, n, pad_right=True))
            grams_q2 = set(ngrams(tokens_q2, n, pad_right=True))
            n_shared = len(grams_q1 & grams_q2)

            # An explicit check replaces catching ZeroDivisionError with a
            # bare except, which would also hide unrelated failures.
            if n_shared == 0:
                pair_sim.append(0)
            else:
                s1 = len(grams_q1) / n_shared
                s2 = len(grams_q2) / n_shared
                pair_sim.append(2 * (s1 + s2) ** -1)
        ngram_sim.append(pair_sim)

    return ngram_sim

In [96]:
# 1-, 2-, and 3-gram similarity for the first 1000 lemma questions (500 consecutive pairs).
calc_ngram_similarity(X_train_lemma[:1000], [1,2,3])

[[0.9090909090909091, 0.7692307692307694, 0.6153846153846154],
 [0.6153846153846154, 0.6153846153846154, 0.6153846153846154],
 [0.5454545454545454, 0, 0],
 [0, 0, 0],
 [0.3076923076923077, 0, 0],
 [0.5, 0.14285714285714285, 0],
 [0.6666666666666666, 0, 0],
 [0.5454545454545454, 0, 0],
 [0, 0, 0],
 [0.6666666666666666, 0.6666666666666666, 0.6666666666666666],
 [1.0, 1.0, 1.0],
 [0.8, 0.4, 0],
 [1.0, 1.0, 1.0],
 [0.5714285714285714, 0.14285714285714285, 0.14285714285714285],
 [0.6, 0.4, 0.2],
 [0.5714285714285714, 0.2857142857142857, 0.2857142857142857],
 [0.5, 0.5, 0.5],
 [0, 0, 0],
 [0.8, 0.4, 0],
 [0.5, 0.25, 0],
 [0.5714285714285714, 0.2857142857142857, 0.2857142857142857],
 [0.8, 0.6, 0.4],
 [0, 0, 0],
 [0.8571428571428572, 0.5714285714285714, 0.2857142857142857],
 [0.5454545454545454, 0, 0],
 [0.3333333333333333, 0.15384615384615385, 0.15384615384615385],
 [0.4285714285714286, 0, 0],
 [0.4, 0, 0],
 [0.75, 0.75, 0.75],
 [0.4444444444444444, 0.2222222222222222, 0],
 [0.8, 0.666666666

In [94]:
# Right-padding demo: a one-item sequence still yields a trigram, filled out
# with None.
padded_trigrams = list(ngrams([1], 3, pad_right=True))
for gram in padded_trigrams:
    print(gram)

(1, None, None)


## Stopwords testing

In [12]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

In [16]:
def _fn_step(func, **kw_args):
    '''Wrap func in a FunctionTransformer with input validation turned off.

    Keyword arguments, if any, are forwarded to func on every call.
    '''
    return FunctionTransformer(func, validate=False, kw_args=kw_args or None)


# feature engineering pipes
single_question_pipe = Pipeline(steps=[
    ('dist', _fn_step(utils.add_min_max_avg_distance_features)),
    ('unstack', _fn_step(utils.unstack_questions)),
])

pair_question_pipe = Pipeline(steps=[
    ('ngram_sim', _fn_step(utils.calc_ngram_similarity, n_grams=[1, 2, 3])),
])


def _pair_and_single_feats():
    '''Build a new union of the shared pair-level and single-question pipes.'''
    return FeatureUnion(transformer_list=[
        ('pair', pair_question_pipe),
        ('single', single_question_pipe),
    ])


# clean text pipe
clean_text_pipe = Pipeline(steps=[
    ('stack', _fn_step(utils.stack_questions)),
    ('clean', _fn_step(utils.clean_questions)),
    ('feats', _pair_and_single_feats()),
])

# lemma pipe: same as the clean text pipe with a lemma step before the features
lemma_pipe = Pipeline(steps=[
    ('stack', _fn_step(utils.stack_questions)),
    ('clean', _fn_step(utils.clean_questions)),
    ('lemma', _fn_step(utils.apply_lemma, incl_stop_words=True)),
    ('feats', _pair_and_single_feats()),
])

# pre-process pipe: features from both branches side by side
pre_process_pipe = Pipeline(steps=[
    ('feats', FeatureUnion(transformer_list=[
        ('clean_features', clean_text_pipe),
        ('lemma_pipe', lemma_pipe),
    ])),
])

# Single known pair used as a smoke test for the pipeline.
test = X_train.loc[X_train['question1'] == 'What is the best way to invest $1,000?']
test

Unnamed: 0,id,question1,question2
264574,264574,"What is the best way to invest $1,000?","How should I invest $3,000?"


In [18]:
# Run the single test pair through both branches (no fit call is made first).
# The stored output is one row of 42 values: 21 from the clean-text branch
# followed by 21 from the lemma branch.
pre_process_pipe.transform(test)

array([[  0.18181818,   0.18181818,   0.18181818,   4.4516778 ,
          7.68992106,   6.03018017,   0.36869252,   0.74090815,
          0.54074633,  59.87225566, 102.96853832,  81.7893619 ,
          3.32768196,   6.91181574,   5.05989927,   0.22358162,
          0.77860875,   0.48962364,  44.39012259,  96.74042761,
         69.17308505,   0.18181818,   0.18181818,   0.18181818,
          4.4516778 ,   7.68992106,   6.03018017,   0.36869252,
          0.74090815,   0.54074633,  59.87225566, 102.96853832,
         81.7893619 ,   3.32768196,   6.91181574,   4.91637156,
          0.22358162,   0.77860875,   0.45924009,  44.39012259,
         96.74042761,  67.18560721]])