In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Load the spaCy pipeline once; the cells below call it on raw strings to get
# token sequences together with a vector for each token.
nlp = spacy.load("en_core_web_sm")


Using TensorFlow backend.


In [7]:
# Two toy sentences, lower-cased up front because the idf lookup further down
# is keyed by the exact text of each token.
q1, q2 = (text.lower() for text in ("I love nlp", "I love to make videos"))

In [28]:
# Learn the vocabulary and idf weights from the two sentences. The
# fit_transform call is kept as the last expression so the notebook displays
# the resulting document-term matrix.
tfidf = TfidfVectorizer()
tfidf.fit_transform((q1, q2))

<2x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [29]:
# Map every vocabulary term to its idf weight.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back to the old name
# so the cell still runs on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Feature names and idf_ are in the same order, so zip pairs term -> weight.
word2tfidf = dict(zip(feature_names, tfidf.idf_))
word2tfidf

{'love': 1.0,
 'make': 1.4054651081081644,
 'nlp': 1.4054651081081644,
 'to': 1.4054651081081644,
 'videos': 1.4054651081081644}

In [10]:
# Run each sentence through the spaCy pipeline to obtain token sequences.
doc1, doc2 = nlp(q1), nlp(q2)

In [13]:
# Check what kind of object the pipeline returned for the second sentence.
type(doc2)

spacy.tokens.doc.Doc

In [35]:
# Zero-filled accumulator: one row per token of doc1, one column per component
# of a token vector (the width is taken from the first token).
n_tokens, vec_dim = len(doc1), len(doc1[0].vector)
mean_vec1 = np.zeros((n_tokens, vec_dim))

In [36]:
# Inspect the freshly created accumulator (all zeros at this point).
mean_vec1

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [37]:
# Start from a zeroed accumulator inside this cell so that re-running it gives
# the same result instead of adding onto the totals of a previous run.
mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])

for word1 in doc1:
    print(word1)

    # Vector representation of this token (96 components with this model).
    vec1 = word1.vector

    # Tokens that are missing from word2tfidf (for example "i", which is not
    # in the fitted vocabulary) get weight 0. Using dict.get instead of a bare
    # `except:` means only a missing key is tolerated; any other problem, such
    # as word2tfidf not being defined, still surfaces as an error.
    idf = word2tfidf.get(str(word1), 0)

    # vec1 * idf is 1-D, so += broadcasts it onto every row of mean_vec1.
    # After the loop each row therefore holds the same idf-weighted sum of
    # all token vectors.
    mean_vec1 += vec1 * idf

i
love
nlp


In [38]:
# Inspect the accumulator after the loop. The 1-D update was broadcast across
# rows, so every row carries the same idf-weighted sum.
mean_vec1

array([[-2.40533312, -1.05059409, -2.16031912, -0.1682868 , -2.72275931,
         6.39793825, -0.58368123, -3.26152587,  1.49267551,  1.82573377,
        -6.07416534, -1.37627679,  0.06876779,  1.91047823, -3.87389851,
        -2.66136277,  2.53714967, -0.13287628, -3.02732217,  4.40111685,
        -1.89625588,  6.10395336, -2.46037412, -0.93231577, -1.02036343,
         1.684973  ,  2.20621514,  2.40115649,  3.24768329, -3.27569175,
         0.42884159, -1.65610921, -0.21401179, -1.7018609 , -5.11644816,
         0.85371566,  6.49861884, -7.08591723,  5.09042001, -2.18472046,
         4.02112889,  3.21094513,  2.30548501,  3.46452993, -2.35760111,
        -3.0578981 , -4.23516262,  0.95795107,  0.23030877,  0.15563512,
        10.59127045, -2.1550287 ,  2.55355644, -0.41226816, -6.19214511,
         2.19018841,  2.50945437,  1.07289028, -4.09613264,  0.21933989,
        -2.79309225, -2.50824076, -0.37317985, -3.0361315 ,  1.5452825 ,
         0.81818086,  4.61988199, -5.70382738, -4.0

In [39]:
# Collapse the accumulator to a single vector by averaging over its rows.
# The rows are identical, so the result equals any one of them, i.e. the
# idf-weighted sum of the token vectors.
mean_vec2 = np.mean(mean_vec1, axis=0)
mean_vec2

array([-2.40533312, -1.05059409, -2.16031912, -0.1682868 , -2.72275931,
        6.39793825, -0.58368123, -3.26152587,  1.49267551,  1.82573377,
       -6.07416534, -1.37627679,  0.06876779,  1.91047823, -3.87389851,
       -2.66136277,  2.53714967, -0.13287628, -3.02732217,  4.40111685,
       -1.89625588,  6.10395336, -2.46037412, -0.93231577, -1.02036343,
        1.684973  ,  2.20621514,  2.40115649,  3.24768329, -3.27569175,
        0.42884159, -1.65610921, -0.21401179, -1.7018609 , -5.11644816,
        0.85371566,  6.49861884, -7.08591723,  5.09042001, -2.18472046,
        4.02112889,  3.21094513,  2.30548501,  3.46452993, -2.35760111,
       -3.0578981 , -4.23516262,  0.95795107,  0.23030877,  0.15563512,
       10.59127045, -2.1550287 ,  2.55355644, -0.41226816, -6.19214511,
        2.19018841,  2.50945437,  1.07289028, -4.09613264,  0.21933989,
       -2.79309225, -2.50824076, -0.37317985, -3.0361315 ,  1.5452825 ,
        0.81818086,  4.61988199, -5.70382738, -4.0865171 ,  3.99