# Word mover's distance implementation

Original research paper link :[https://proceedings.mlr.press/v37/kusnerb15.pdf](https://proceedings.mlr.press/v37/kusnerb15.pdf)

In [246]:
!pip3 install --quiet -r requirements.txt

In [247]:
sentence_obama     = 'Obama speaks to the media in Illinois'
sentence_obama_bis = 'Obama meets the media in Illinois'
sentence_president = 'The president greets the press in Chicago'
sentence_random    = 'We are following a course on optimal transport. This course is given at Chicago.'

In [248]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')

def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_obama     = preprocess(sentence_obama)
sentence_obama_bis = preprocess(sentence_obama_bis)
sentence_president = preprocess(sentence_president)
sentence_random    = preprocess(sentence_random)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hrialan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Simple BOW distance

In [249]:
corpus = [' '.join(sentence_obama),
          ' '.join(sentence_obama_bis),
          ' '.join(sentence_president),
          ' '.join(sentence_random)]

In [250]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [251]:
vectorizer.get_feature_names_out()

array(['chicago', 'course', 'following', 'given', 'greets', 'illinois',
       'media', 'meets', 'obama', 'optimal', 'president', 'press',
       'speaks', 'transport'], dtype=object)

In [252]:
X.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]])

In [253]:
X = (X / X.max()).toarray()

In [254]:
from scipy.spatial import distance
dist = distance.euclidean(X[0], X[2])
print('distance obama/president = %.4f' % dist)

distance obama/president = 1.4142


In [255]:
dist = distance.euclidean(X[2], X[3])
print('distance president/random = %.4f' % dist)

distance president/random = 1.6583


In [256]:
dist = distance.euclidean(X[0], X[3])
print('distance obama/random = %.4f' % dist)

distance obama/random = 1.8028


In [257]:
dist = distance.euclidean(X[0], X[1])
print('distance obama/obama_bis = %.4f' % dist)

distance obama/obama_bis = 0.7071


## Word mover's distance

In [258]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

In [260]:
dist = model.wmdistance(sentence_obama, sentence_president)
print('distance obama/president = %.4f' % dist)

distance obama/president = 1.0175


In [261]:
dist = model.wmdistance(sentence_president, sentence_random)
print('distance president/random = %.4f' % dist)

distance president/random = 1.3797


In [262]:
dist = model.wmdistance(sentence_obama, sentence_random)
print('distance obama/random = %.4f' % dist)

distance obama/random = 1.3414


In [263]:
dist = model.wmdistance(sentence_obama, sentence_obama_bis)
print('distance obama/obama_bis = %.4f' % dist)

distance obama/obama_bis = 0.2793
