**Word Mover's Distance and application**

Part I: Load 20NG dataset and grab ground truth labels

In [1]:
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

download('punkt')
download('stopwords')

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\root\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\root\AppData\Roaming\nltk_data...


[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
    a doc.
    The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts, labels)

snippets = []
snippets_labels = []
snippets_file = "data-web-snippets\\train.txt"
with open(snippets_file, 'r', encoding = 'utf-8') as f:
    for line in f:
        # each line is a snippet: a bag of words separated by spaces and
        # the category
        line = line.split()
        category = line[-1]
        doc = line[:-1]
        snippets.append(doc)
        snippets_labels.append(category)

snippets, _, snippets_labels = filter_docs(snippets, None, snippets_labels,
                                           lambda doc: (len(doc) != 0))

0 docs removed


Part II: Compute the various similiarties between the snippets

First we use Latent Semantic Indexing (LSI)

In [3]:
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity


sims = {'snippets': {}}
dictionary_snippets = corpora.Dictionary(snippets)
corpus_gensim_snippets = [dictionary_snippets.doc2bow(doc) for doc in snippets]
tfidf_snippets = TfidfModel(corpus_gensim_snippets)
corpus_tfidf_snippets = tfidf_snippets[corpus_gensim_snippets]
lsi_snippets = LsiModel(corpus_tfidf_snippets,
                        id2word=dictionary_snippets, num_topics=200)
lsi_index_snippets = MatrixSimilarity(lsi_snippets[corpus_tfidf_snippets])
sims['snippets']['LSI'] = np.array([lsi_index_snippets[lsi_snippets[corpus_tfidf_snippets[i]]]
                                    for i in range(len(snippets))])



Using TensorFlow backend.


Now use Word2vec centroids

In [4]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

filename = 'C:\\GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(filename, binary=True)
word2vec_model.init_sims(replace=True)
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

snippets, _, snippets_labels = filter_docs(snippets, None, snippets_labels,
                                           lambda doc: has_vector_representation(word2vec_model, doc))

from sklearn.metrics.pairwise import cosine_similarity

sims['snippets']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc)
                                                           for doc in snippets]))

4 docs removed


Now we use pairwise WMD distances

In [26]:
from sklearn.metrics.pairwise import pairwise_distances

A = np.array([[i] for i in range(len(snippets))])

def f(x, y):
    return word2vec_model.wmdistance(snippets[int(x)], snippets[int(y)])

X_wmd_distance_snippets = pairwise_distances(A, metric=f, n_jobs=-1)

print(X_wmd_distance_snippets)

In [16]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents with document i
    given the similarity matrix X_sims"""

    r = np.argsort(X_sims[i])[::-1]
    if r is None:
        return r
    else:
        return r[:topn]

#LSI
print("LSI distances")
print(most_similar(0, sims['snippets']['LSI'], 20))

#Centroid
print("centroid distances")
print(most_similar(0, sims['snippets']['centroid'], 20))

from gensim.similarities import WmdSimilarity

wmd_similarity_snippets = WmdSimilarity(snippets, word2vec_model, num_best=20)
most_similars_snippets = wmd_similarity_snippets[snippets[0]]

print(wmd_similarity_snippets)
print(most_similars_snippets)
print("Original Snippet: ")
print(snippets[0])

LSI distances
[   0   13  973    2   14  901    5   15   17   16 6820 6827    7 7513    4
 6831 6832   19  623   12]
centroid distances
[   0   13   15  973  378   17    2 6658 6829 6833 6307   16  974    8 5535
   19   14    5  965   12]


WmdSimilarity<10056 docs, 300 features>
[(0, 1.0), (13, 0.60329492929644768), (973, 0.5286272334018054), (2, 0.52601643645020979), (378, 0.52323760224834481), (16, 0.5177545982856413), (7509, 0.51254049902118537), (12, 0.51098133962722492), (6828, 0.50705399681485708), (19, 0.50653771271015224), (17, 0.50652784909122306), (974, 0.50597894422453937), (7, 0.50416865321814652), (15, 0.5041521835206122), (712, 0.50320841421252649), (6663, 0.50302378894798072), (56, 0.50239767268269631), (6829, 0.5021651306706193), (5, 0.501937836425843), (2169, 0.50181756077408601)]
Original Snippet: 
['manufacture', 'manufacturer', 'directory', 'directory', 'china', 'taiwan', 'products', 'manufacturers', 'directory-', 'taiwan', 'china', 'products', 'manufacturer', 'direcory', 'exporter', 'directory', 'supplier', 'directory', 'suppliers']


In [14]:
for s in most_similars_snippets:
    print("Snippet match, WMD " + str(s[1]))
    print(snippets[s[0]])

Snippet match, WMD 1.0
['manufacture', 'manufacturer', 'directory', 'directory', 'china', 'taiwan', 'products', 'manufacturers', 'directory-', 'taiwan', 'china', 'products', 'manufacturer', 'direcory', 'exporter', 'directory', 'supplier', 'directory', 'suppliers']
Snippet match, WMD 0.603294929296
['allproducts', 'allproducts', 'com', 'manufacturers', 'directory', 'products', 'database', 'global', 'marketplace', 'manufacture', 'directory', 'exporters', 'importers', 'wholesalers', 'volume', 'buyers', 'suppliers', 'product', 'directory']
Snippet match, WMD 0.528627233402
['wand', 'wand', 'com', 'directory', 'suppliers', 'traders', 'products', 'wand', 'com', 'directory', 'suppliers', 'traders', 'providers', 'products', 'services', 'buyers', 'wand', 'com', 'source', 'commodities']
Snippet match, WMD 0.52601643645
['dfma', 'truecost', 'paper', 'true', 'cost', 'overseas', 'manufacture', 'product', 'design', 'costs', 'manufacturing', 'products', 'china', 'manufacturing', 'redesigned', 'produc