# Word mover's distance implementation

Original research paper link :[https://proceedings.mlr.press/v37/kusnerb15.pdf](https://proceedings.mlr.press/v37/kusnerb15.pdf)

In [92]:
!pip3 install --quiet -r requirements.txt

In [93]:
sentence_obama     = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'
sentence_random    = 'We are following a course on optimal transport. This course is given at Polytechnique.'

In [94]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')

def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_obama     = preprocess(sentence_obama)
sentence_president = preprocess(sentence_president)
sentence_random    = preprocess(sentence_random)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hrialan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Simple BOW distance

In [95]:
corpus = [' '.join(sentence_obama), 
          ' '.join(sentence_president),
          ' '.join(sentence_random)]

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [97]:
vectorizer.get_feature_names_out()

array(['chicago', 'course', 'following', 'given', 'greets', 'illinois',
       'media', 'obama', 'optimal', 'polytechnique', 'president', 'press',
       'speaks', 'transport'], dtype=object)

In [98]:
X.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 2, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1]])

In [99]:
X = (X / X.max()).toarray()

In [100]:
from scipy.spatial import distance
dist = distance.euclidean(X[0], X[1])
print('distance obama/president = %.4f' % dist)

distance obama/president = 1.4142


In [101]:
dist = distance.euclidean(X[1], X[2])
print('distance president/random = %.4f' % dist)

distance president/random = 1.8028


In [102]:
dist = distance.euclidean(X[0], X[2])
print('distance obama/random = %.4f' % dist)

distance obama/random = 1.8028


## Word mover's distance

In [103]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

In [108]:
sentence_obama

['obama', 'speaks', 'media', 'illinois']

In [109]:
dist = model.wmdistance(sentence_obama, ['obama', 'speaks', 'media', 'illinois', 'test'])
print('distance obama/president = %.4f' % dist)

distance obama/president = 0.2790


In [104]:
dist = model.wmdistance(sentence_obama, sentence_president)
print('distance obama/president = %.4f' % dist)

distance obama/president = 1.0175


In [105]:
dist = model.wmdistance(sentence_president, sentence_random)
print('distance president/random = %.4f' % dist)

distance president/random = 1.3797


In [106]:
dist = model.wmdistance(sentence_obama, sentence_random)
print('distance obama/random = %.4f' % dist)

distance obama/random = 1.3414
