# Exploring Sentence-, Document-, and Character Level Embeddings

Nama  : Ramanda Ajisaka Asyraf

NPM   : 20312067

Kelas : IF Gab 1

## Building a Doc2Vec model

### Importing the libraries and data

In [None]:
from gensim.models import FastText
from gensim.test.utils import common_texts

In [None]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Building a basic model

In [None]:
model = FastText(size=5, window=3, min_count=1)

In [None]:
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)

### Check the vocabulary

In [None]:
model.wv.vocab

{'computer': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed110>,
 'eps': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed190>,
 'graph': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed310>,
 'human': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed090>,
 'interface': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed0d0>,
 'minors': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed350>,
 'response': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed290>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed150>,
 'system': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed250>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed2d0>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed210>,
 'user': <gensim.models.keyedvectors.Vocab at 0x7f4cbfaed1d0>}

In [None]:
model.wv['human']

array([-0.01132617,  0.01409284,  0.04774407,  0.01412516,  0.00770461],
      dtype=float32)

### Checkout the most similar feature

In [None]:
model.wv.most_similar(positive=['computer', 'interface'], negative=['human'])

[('user', 0.8905642628669739),
 ('trees', 0.629560649394989),
 ('survey', 0.44253167510032654),
 ('eps', 0.41045570373535156),
 ('minors', 0.21103717386722565),
 ('time', 0.16884131729602814),
 ('response', 0.016975894570350647),
 ('graph', -0.05780427157878876),
 ('system', -0.203248992562294)]

### min_n and max_n parameters

In [None]:
model = FastText(size=5, window=3, min_count=1, min_n=1, max_n=5)

In [None]:
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)

KeyboardInterrupt: ignored

### Let's try and fetch a representation for an out of vocabulary word

In [None]:
model.wv['rubber']

### Checkout the most similar feature using an Out of Vocab term

In [None]:
model.wv.most_similar(positive=['computer', 'human'], negative=['rubber'])

### Extending the built model to incorporate words from new sentences

In [None]:
sentences_to_be_added = [["I", "am", "learning", "Natural", "Language", "Processing"],
                         ["Natural", "Language", "Processing", "is", "cool"]]

In [None]:
model.build_vocab(sentences_to_be_added, update=True)
model.train(sentences=common_texts, total_examples=len(sentences_to_be_added), epochs=10)

In [None]:
model.wv.vocab

## Building paragraph vectors using Doc2Vec

### Import common text corpus, Doc2Vec algorithm and Tagged Document functionality from Gensim

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Corpus on which training will happen

In [None]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Building Tagged Documents from the corpus as that's an expectation from the Doc2Vec model

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

In [None]:
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

### Building a basic Doc2Vec model

In [None]:
model = Doc2Vec(documents, vector_size=5, min_count=1, workers=4, epochs = 40)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### vector size

In [None]:
model.vector_size

5

### How many document vectors did we train?

In [None]:
len(model.docvecs)

9

### check out the vocabulary information for the model we built

In [None]:
len(model.wv.vocab)
model.wv.vocab

{'computer': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30690>,
 'eps': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30710>,
 'graph': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30890>,
 'human': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30610>,
 'interface': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30650>,
 'minors': <gensim.models.keyedvectors.Vocab at 0x7f4d11a308d0>,
 'response': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30810>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x7f4d11a306d0>,
 'system': <gensim.models.keyedvectors.Vocab at 0x7f4d11a307d0>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30850>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30790>,
 'user': <gensim.models.keyedvectors.Vocab at 0x7f4d11a30750>}

### infer a vector based on the trained Doc2Vec model

In [None]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.0480847   0.03764581  0.06295776 -0.03526415 -0.02095875]


### Building a new model changing vector size and minimum count eligibility

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=3, epochs=40)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
len(model.wv.vocab)

4

In [None]:
model.wv.vocab

{'graph': <gensim.models.keyedvectors.Vocab at 0x7f4d26839090>,
 'system': <gensim.models.keyedvectors.Vocab at 0x7f4d13a76dd0>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x7f4d13a76b50>,
 'user': <gensim.models.keyedvectors.Vocab at 0x7f4d13a768d0>}

In [None]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-4.7290004e-03  3.7263199e-03  6.2539806e-03 -3.6363567e-03
 -2.2412851e-03  9.8542096e-03  9.6026305e-03  4.5514164e-05
  6.9623888e-03  8.9819934e-03  5.0121266e-03  1.3976209e-04
 -5.7957890e-03  2.3206589e-03 -4.1768043e-03 -8.1720455e-03
  7.9157613e-03 -7.5946003e-03  7.9342043e-03 -4.9406611e-03
  2.6192437e-03  9.1976542e-03  2.6871762e-03 -1.9753834e-03
  5.7196310e-03  5.6139841e-03 -9.1296006e-03 -7.8521706e-03
 -7.1999095e-03 -6.1163637e-03  8.3865114e-03  6.4533702e-03
  6.8257796e-03 -1.4102659e-03  2.2800430e-03 -5.8483169e-03
  4.2195925e-03 -5.5492362e-03  4.2122211e-03  8.3165998e-03
 -1.3400830e-05  9.9227438e-03  8.0850804e-03  9.9254502e-03
 -1.1186136e-03  1.4557549e-03 -8.5783098e-03 -5.7295267e-03
  3.4774744e-03 -5.0495164e-03]


### Doc2Vec built next would be based on the distributed memory model (dm=1)

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-4.8079598e-03  3.7645418e-03  6.2958752e-03 -3.5273975e-03
 -2.0969301e-03  9.8107876e-03  9.7243953e-03 -2.0744797e-04
  6.8832738e-03  9.0580946e-03  5.0702756e-03  1.4342087e-04
 -5.8078067e-03  2.3025186e-03 -4.1236556e-03 -8.2803583e-03
  7.8331083e-03 -7.5195017e-03  7.9661859e-03 -4.7409036e-03
  2.5866569e-03  9.2780264e-03  2.7611407e-03 -2.0034474e-03
  5.7246597e-03  5.5841277e-03 -9.0622734e-03 -7.7881077e-03
 -7.2714286e-03 -6.2589771e-03  8.3711268e-03  6.4807176e-03
  6.8425299e-03 -1.3068308e-03  2.2913995e-03 -5.9663276e-03
  4.0704994e-03 -5.4061548e-03  4.2666802e-03  8.5631059e-03
 -9.3462440e-06  9.9395690e-03  8.0263847e-03  1.0001487e-02
 -1.0375476e-03  1.4974406e-03 -8.8684317e-03 -5.8260341e-03
  3.3567704e-03 -4.9917460e-03]


### Doc2Vec built next would be based on the distributed bag of words approach (dm=0)

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-5.1229820e-03  3.6302335e-03  6.1089024e-03 -2.8427541e-03
 -1.5035116e-03  9.9434098e-03  1.0309704e-02 -7.8990345e-04
  6.4771837e-03  9.3224756e-03  4.9347882e-03  7.5601415e-06
 -5.8228653e-03  2.3642485e-03 -3.9574089e-03 -8.3368681e-03
  7.7268155e-03 -7.4487715e-03  7.9619847e-03 -4.2225113e-03
  2.5055518e-03  9.2065968e-03  2.6920903e-03 -1.9909523e-03
  5.7116910e-03  5.4628477e-03 -8.9553166e-03 -7.5103533e-03
 -7.6145795e-03 -6.5491637e-03  8.3465008e-03  6.6836705e-03
  6.9413888e-03 -1.1866635e-03  2.1076018e-03 -6.2263655e-03
  3.7004370e-03 -4.9845059e-03  4.5927465e-03  9.1011720e-03
 -8.7194865e-05  9.8932078e-03  7.7571631e-03  1.0285491e-02
 -9.3059120e-04  1.5523549e-03 -9.8100053e-03 -6.0939784e-03
  3.0666799e-03 -4.8058992e-03]


### Adding the window size which controls the maximum distance between current and predicted word

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-5.1229820e-03  3.6302335e-03  6.1089024e-03 -2.8427541e-03
 -1.5035116e-03  9.9434098e-03  1.0309704e-02 -7.8990345e-04
  6.4771837e-03  9.3224756e-03  4.9347882e-03  7.5601415e-06
 -5.8228653e-03  2.3642485e-03 -3.9574089e-03 -8.3368681e-03
  7.7268155e-03 -7.4487715e-03  7.9619847e-03 -4.2225113e-03
  2.5055518e-03  9.2065968e-03  2.6920903e-03 -1.9909523e-03
  5.7116910e-03  5.4628477e-03 -8.9553166e-03 -7.5103533e-03
 -7.6145795e-03 -6.5491637e-03  8.3465008e-03  6.6836705e-03
  6.9413888e-03 -1.1866635e-03  2.1076018e-03 -6.2263655e-03
  3.7004370e-03 -4.9845059e-03  4.5927465e-03  9.1011720e-03
 -8.7194865e-05  9.8932078e-03  7.7571631e-03  1.0285491e-02
 -9.3059120e-04  1.5523549e-03 -9.8100053e-03 -6.0939784e-03
  3.0666799e-03 -4.8058992e-03]


### Adding initial learning rate and to what value should the learning rate drop to linearly over training (alpha and min_alpha)

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.0780084  -0.0029983  -0.18686335  0.43595198  0.3243992   0.06455706
  0.24351035 -0.29817623 -0.2622137   0.25125143 -0.02908649  0.0244542
 -0.06898806 -0.02873833  0.03162578 -0.05206823 -0.07243361  0.05397745
 -0.01398734  0.29375625  0.08328969  0.1267475   0.189965   -0.08932698
  0.00189726 -0.15256779 -0.04141614  0.3237885  -0.20299986 -0.2228819
 -0.03740845  0.2661052   0.18587041  0.11882388  0.1045161  -0.14279559
 -0.26667696  0.15469697  0.09428133  0.39365155  0.0668533   0.17232837
 -0.09025387  0.08931541  0.14521076  0.14106417 -0.3874625  -0.27575654
 -0.18056805  0.06181358]


### Adding the dm_concat parameter to use concatenation of the word vectors

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05, dm_concat=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.02083027 -0.18395045 -0.12334169  0.19590855  0.14830089 -0.04019875
  0.10982412 -0.04953833 -0.13981196  0.16854358 -0.14039472 -0.02300855
  0.07567628  0.1005694   0.07760538  0.01526707  0.00352453 -0.12962134
  0.03404624  0.06397147 -0.14018434 -0.13673595 -0.16837388  0.13282529
 -0.06280011 -0.01451559 -0.01675061  0.05887584 -0.06449297 -0.08809994
  0.00757934  0.11288229  0.05480232  0.00324578 -0.15366437  0.04689441
  0.02563367  0.06027535  0.09165619  0.03334437 -0.1384466  -0.04607228
 -0.17002702  0.07708313 -0.04829381 -0.02572853 -0.21518657 -0.03537124
 -0.02686962  0.01376351]


### Adding the dm_mean parameter to use sum of the context word vectors (dm_mean=1)

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.0780084  -0.0029983  -0.18686335  0.43595198  0.3243992   0.06455706
  0.24351035 -0.29817623 -0.2622137   0.25125143 -0.02908649  0.0244542
 -0.06898806 -0.02873833  0.03162578 -0.05206823 -0.07243361  0.05397745
 -0.01398734  0.29375625  0.08328969  0.1267475   0.189965   -0.08932698
  0.00189726 -0.15256779 -0.04141614  0.3237885  -0.20299986 -0.2228819
 -0.03740845  0.2661052   0.18587041  0.11882388  0.1045161  -0.14279559
 -0.26667696  0.15469697  0.09428133  0.39365155  0.0668533   0.17232837
 -0.09025387  0.08931541  0.14521076  0.14106417 -0.3874625  -0.27575654
 -0.18056805  0.06181358]


### Adding the dm_mean parameter to use mean of the context word vectors (dm_mean=0)

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=0, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.04002835 -0.01270157 -0.11261422  0.33206737  0.27726966 -0.00426551
  0.15789507 -0.27978104 -0.19799115  0.2615781  -0.01130674  0.06910732
 -0.0601967  -0.00235807  0.06369697 -0.09914559 -0.09102184 -0.0065806
  0.00300646  0.227156    0.04198624  0.15265921  0.15316308 -0.02868354
  0.00145327 -0.13448356 -0.05667862  0.3010909  -0.1353416  -0.21153954
 -0.04579882  0.23399208  0.18452133  0.1277111   0.11090421 -0.10758397
 -0.21520337  0.13229519  0.07114536  0.3314709   0.01502123  0.15590566
 -0.11556525  0.06824988  0.13447195  0.1600065  -0.3125271  -0.22876881
 -0.13060687  0.05845591]


## Building a Spell Corrector/Text Suggestor using fastText

### Reading the dataimport nltk


In [4]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

words = []
data = []
with io.open('/content/drive/MyDrive/Colab Notebooks/PBA/comments.txt', 'r') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Checking for common terms in the data

In [5]:
unique_words = []
unique_words = collections.Counter(words)
unique_words.most_common(10)

[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

In [6]:
data[:3]

['"Explanation',
 'Why the edits made under my username Hardcore Metallica Fan were reverted? They weren\'t vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don\'t remove the template from the talk page since I\'m retired now.89.205.38.27"',
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"]

### Data Preprocessing

In [7]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus.append(' '.join(qs))
    return cleaned_corpus
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus
data = preprocess(data)

### Data conversion into formation expected by fastText

In [8]:
preprocessed_data = []
for line in data:
    if line != "":
        preprocessed_data.append(line.split())

### Building the fastText model

In [9]:
model = FastText(size=300, window=3, min_count=1, min_n=1, max_n=5)
model.build_vocab(sentences=preprocessed_data)
len(model.wv.vocab)

182228

In [10]:
model.train(sentences=preprocessed_data, total_examples=len(preprocessed_data), epochs=10)

### Checking for top 5 similar terms

In [11]:
model.wv.most_similar('eplain', topn=5)

[('xplain', 0.885149359703064),
 ('explain', 0.8441685438156128),
 ('eexplain', 0.8420135974884033),
 ('plain', 0.8357589840888977),
 ('reexplain', 0.8252125382423401)]

In [12]:
model.wv.most_similar('reminder', topn=5)

[('remainder', 0.916622519493103),
 ('rejoinder', 0.9131145477294922),
 ('reminde', 0.905627429485321),
 ('reindeer', 0.9020789265632629),
 ('reminders', 0.8999879360198975)]

In [13]:
model.wv.most_similar('relevnt', topn=5)

[('relev', 0.810570240020752),
 ('relevant', 0.7911088466644287),
 ('releant', 0.7699285745620728),
 ('releve', 0.7698831558227539),
 ('rele', 0.7640540599822998)]

In [14]:
model.wv.most_similar('purse', topn=5)

[('purpse', 0.9312492609024048),
 ('cpurse', 0.9152131676673889),
 ('pure', 0.8999971151351929),
 ('pursue', 0.8953242301940918),
 ('pulse', 0.8838240504264832)]

### fastText and Word Mover's Distance

In [15]:
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"
word_mover_distance = model.wmdistance(sentence_1, sentence_2)
word_mover_distance

  after removing the cwd from sys.path.


13.666703045232623

In [16]:
word_mover_distance = model.wmdistance(sentence_2, sentence_3)
word_mover_distance

  """Entry point for launching an IPython kernel.


17.09835656790948