In [167]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
from scipy.sparse.linalg import svds
import numpy as np

In [2]:
# Load the word vectors from the word2vec-format binary file; every
# embedding function below looks words up in this module-level `model`.
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [221]:
def simple_average(sent):
    """Embed each sentence as the L2-normalised sum of its word vectors.

    Every sentence is lower-cased, split on whitespace and filtered against
    gensim's ``STOPWORDS``.  The remaining words are looked up in the
    module-level ``model``; words it does not contain are skipped.

    Args:
        sent: iterable of sentences (each one is passed through ``str``).

    Returns:
        A list with one 1-D numpy array per input sentence.  A sentence
        with no usable word vectors, or whose vectors sum to zero, yields
        an all-zero vector (instead of the NaN a 0/0 division would give).
    """
    sents_emd = []
    for s in sent:
        words = [w for w in str(s).lower().split() if w not in STOPWORDS]
        vectors = []
        for w in words:
            try:
                vectors.append(model[w])
            except KeyError:
                # Out-of-vocabulary word: it simply contributes nothing.
                continue
        if not vectors:
            # Keep the output shape uniform even when nothing was embedded.
            sents_emd.append(np.zeros(model.vector_size))
            continue
        sum_ = np.sum(vectors, axis=0)
        norm = np.sqrt((sum_ ** 2).sum())
        if norm > 0:
            sents_emd.append(sum_ / norm)
        else:
            sents_emd.append(np.zeros_like(sum_))
    return sents_emd

In [235]:
def tf_idf(sent):
    """Embed each sentence as the L2-normalised TF-IDF weighted vector sum.

    Every sentence is lower-cased, split on whitespace and filtered against
    gensim's ``STOPWORDS``.  For a word ``w`` in sentence ``s``:

    * ``tf``  = (occurrences of ``w`` in ``s``) / (number of words in ``s``)
    * ``idf`` = log(N / (1 + df(w))), where ``N`` is the number of sentences
      and ``df(w)`` is the number of sentences that contain ``w``.

    Word vectors come from the module-level ``model``; words it does not
    contain are skipped.

    NOTE: with this idf smoothing a word that occurs in every sentence gets
    a negative weight, and one that occurs in all but one sentence gets a
    weight of zero.

    Args:
        sent: iterable of sentences (each one is passed through ``str``).

    Returns:
        A list with one 1-D numpy array per input sentence.  A sentence
        with no usable word vectors, or whose weighted sum is zero, yields
        an all-zero vector (instead of the NaN a 0/0 division would give).
    """
    sentences = []
    doc_freq = {}
    for s in sent:
        words = [w for w in str(s).lower().split() if w not in STOPWORDS]
        sentences.append(words)
        # Document frequency: count each word once per sentence.
        for w in set(words):
            doc_freq[w] = doc_freq.get(w, 0) + 1
    no_of_sentences = len(sentences)

    sents_emd = []
    for words in sentences:
        weighted = []
        for word in words:
            try:
                vector = model[word]
            except KeyError:
                # Out-of-vocabulary word: it simply contributes nothing.
                continue
            idf = np.log(no_of_sentences / float(1 + doc_freq[word]))
            # Each occurrence adds idf / len(words); summed over all the
            # occurrences of a word this equals tf * idf for that word.
            weighted.append((idf / float(len(words))) * vector)
        if not weighted:
            # Keep the output shape uniform even when nothing was embedded.
            sents_emd.append(np.zeros(model.vector_size))
            continue
        sum_ = np.sum(weighted, axis=0)
        norm = np.sqrt((sum_ ** 2).sum())
        if norm > 0:
            sents_emd.append(sum_ / norm)
        else:
            sents_emd.append(np.zeros_like(sum_))
    return sents_emd

In [222]:
def smooth_inverse_frequency(sent, a=None):
    """Embed sentences with smooth-inverse-frequency (SIF) weighting.

    Every sentence is lower-cased, split on whitespace and filtered against
    gensim's ``STOPWORDS``.  Each word vector (taken from the module-level
    ``model``; unknown words are skipped) is weighted by ``a / (a + p(w))``,
    where ``p(w)`` is the word's relative frequency over all the given
    sentences.  A sentence embedding is the mean of its weighted word
    vectors; finally the projection of every embedding onto the first right
    singular vector of the embedding matrix is subtracted.

    Args:
        sent: list of sentences (each one is passed through ``str``).
        a: smoothing constant of the weighting; ``None`` means 0.001.

    Returns:
        A list with one 1-D numpy array per input sentence (an empty list
        for empty input).  A sentence with no usable word vectors starts
        from an all-zero vector.
    """
    if a is None:
        a = 0.001

    word_counter = {}
    sentences = []
    total_count = 0
    for s in sent:
        words = [w for w in str(s).lower().split() if w not in STOPWORDS]
        for w in words:
            word_counter[w] = word_counter.get(w, 0) + 1
        sentences.append(words)
        total_count += len(words)

    if not sentences:
        return []

    sents_emd = []
    for words in sentences:
        weighted = []
        for word in words:
            try:
                vector = model[word]
            except KeyError:
                # Out-of-vocabulary word: it simply contributes nothing.
                continue
            # float() keeps this a true division under Python 2 as well;
            # integer division would make every probability 0.
            prob = word_counter[word] / float(total_count)
            weighted.append((a / (a + prob)) * vector)
        if weighted:
            # Average over the words that were actually embedded.
            sents_emd.append(np.sum(weighted, axis=0) / float(len(weighted)))
        else:
            # Keep the matrix rectangular when nothing was embedded.
            sents_emd.append(np.zeros(model.vector_size))

    matrix = np.asarray(sents_emd)
    if not matrix.any():
        # No direction to remove from an all-zero matrix.
        return list(matrix)

    if min(matrix.shape) > 1:
        # svds returns the tuple (u, s, vt); only vt is needed here.
        _, _, vt = svds(matrix, k=1)
    else:
        # svds needs k < min(shape), so fall back to the dense SVD.
        _, _, vt = np.linalg.svd(matrix, full_matrices=False)
        vt = vt[:1]

    # Remove the component along the first right singular vector from
    # every row: row - (row . vt^T) vt.
    adjusted = matrix - matrix.dot(vt.transpose()).dot(vt)
    return list(adjusted)

In [241]:
# Four short example sentences used to compare the embedding methods.
s1, s2, s3, s4 = (
    "this is a sample sentence with cat and dog",
    "there was a time when computers were very expensive",
    "a sample sentence with cute dog",
    "I'm eagerly waiting for Avengers Infinity War",
)

In [242]:
# Embed the same example sentences with each of the three methods
# (evaluated in the order SIF, TF-IDF, simple average).
sentences = [s1, s2, s3, s4]
sentences_emd1, sentences_emd2, sentences_emd3 = (
    smooth_inverse_frequency(sentences),
    tf_idf(sentences),
    simple_average(sentences),
)

### For visualisation

In [243]:
# for simple average
# Serialise one embedding per line; every value (including the last one)
# is followed by a tab, and each line ends with a newline.
rows = []
for vector in sentences_emd3:
    cells = [str(value) + '\t' for value in vector]
    rows.append("".join(cells) + '\n')
a = "".join(rows)
with open("record3.tsv", "w") as record_file:
    record_file.write(a)

Simple Average: 
<img src="AVG.png">

In [244]:
# for TF-IDF
# Serialise one embedding per line; every value (including the last one)
# is followed by a tab, and each line ends with a newline.
rows = []
for vector in sentences_emd2:
    cells = [str(value) + '\t' for value in vector]
    rows.append("".join(cells) + '\n')
a = "".join(rows)
with open("record2.tsv", "w") as record_file:
    record_file.write(a)

TF-IDF: 
<img src="TFIDF.png">

In [245]:
# for SIF
# Serialise one embedding per line; every value (including the last one)
# is followed by a tab, and each line ends with a newline.
rows = []
for vector in sentences_emd1:
    cells = [str(value) + '\t' for value in vector]
    rows.append("".join(cells) + '\n')
a = "".join(rows)
with open("record1.tsv", "w") as record_file:
    record_file.write(a)

SIF: 
<img src="SIF.png">