In [1]:
# Step 1: Load the data (we assume that we have one sentence per line and that the tokens are separated by whitespace)

with open("data/wue15.txt", "r") as f:
    data = f.readlines()

In [2]:
# As we can see, the data is a list of strings, where each string is a line from the file.
# Let's print the first 10 lines to see what we have.
data[:10]

['Welcome !\n',
 'Hello , , and welcome to Wikipedia !\n',
 'Thank you for your contributions .\n',
 'I hope you like the place and decide to stay .\n',
 'Here are a few good links for newcomers :\n',
 'The five pillars of Wikipedia\n',
 'How to edit a page\n',
 'Help pages\n',
 'Tutorial\n',
 'How to write a great article\n']

In [3]:
# Now, we need to split each line into tokens. We can do this using the `split()` method, which splits a string into a list of words based on whitespace.
# We will also remove any leading or trailing whitespace from each line.

tokenized_data = []
for line in data:
    # Strip leading/trailing whitespace and split by whitespace
    tokens = line.strip().split()
    tokenized_data.append(tokens)

# Alternatively, we could use a list comprehension to achieve the same result in a more compact way.
# tokenized_data = [line.strip().split() for line in data]

# Now, let's print the first 10 lines again to see the tokenized data.
tokenized_data[:10]

[['Welcome', '!'],
 ['Hello', ',', ',', 'and', 'welcome', 'to', 'Wikipedia', '!'],
 ['Thank', 'you', 'for', 'your', 'contributions', '.'],
 ['I',
  'hope',
  'you',
  'like',
  'the',
  'place',
  'and',
  'decide',
  'to',
  'stay',
  '.'],
 ['Here', 'are', 'a', 'few', 'good', 'links', 'for', 'newcomers', ':'],
 ['The', 'five', 'pillars', 'of', 'Wikipedia'],
 ['How', 'to', 'edit', 'a', 'page'],
 ['Help', 'pages'],
 ['Tutorial'],
 ['How', 'to', 'write', 'a', 'great', 'article']]

In [4]:
# Now, we need to import the Gensim library to create a Word2Vec model using the tokenized data.
from gensim.models import Word2Vec
# We will create a Word2Vec model using the CBOW approach. The parameters are:
# - `vector_size`: The size of the word vectors (we will use 200 dimensions).
# - `window`: The maximum distance between the current and predicted word within a sentence (we will use a window of 5).
# - `min_count`: Ignores all words with total frequency lower than this (we will use 5).
# - `workers`: The number of worker threads to train the model (we will use all available cores except 1).
# - `sg`: Skip-gram model (1) or CBOW (0). We will use CBOW (0).

# get available cores
import multiprocessing
cores = multiprocessing.cpu_count() - 1
if cores > 50:
    cores = 20

# Create the Word2Vec model
model = Word2Vec(sentences=tokenized_data, vector_size=200, window=5, min_count=5, workers=cores, sg=0)

In [5]:
# We can also save the model to a file for later use.
model.save("data/wue15_word2vec.model")

In [1]:
# To load the model later, we can use the following command:
from gensim.models import Word2Vec
model = Word2Vec.load("data/wue15_word2vec.model")

In [2]:
# The interesting part of the model is the `wv` attribute, which contains the word vectors.
model = model.wv

In [3]:
# Let's see if it worked by asking for nearest neighbors of a word.
model.most_similar("attack", topn=20)

[('attacks', 0.6789256930351257),
 ('insult', 0.6435273289680481),
 ('vendetta', 0.5984145402908325),
 ('affront', 0.5594542026519775),
 ('vendettas', 0.5416612029075623),
 ('attacks.--', 0.5324352383613586),
 ('crusade', 0.5312038660049438),
 ('insults', 0.5214263200759888),
 ('slur', 0.5176430940628052),
 ('accusation', 0.4990154802799225),
 ('insulting', 0.4841175675392151),
 ('threat', 0.4821915626525879),
 ('harassment', 0.4795363247394562),
 ('grudge', 0.4791853725910187),
 ('observation', 0.47662192583084106),
 ('provocation', 0.47645336389541626),
 ('ad', 0.4752565324306488),
 ('attacking', 0.4742068350315094),
 ('slander', 0.45394542813301086),
 ('attack.--', 0.4515107572078705)]

In [4]:
# Let's see how many words are in the vocabulary (= how many types from our training data occur at least 5 times in the corpus).
print(f"Vocabulary size: {len(model)}")

Vocabulary size: 115621


In [None]:
max_sents = 10
printed = 0
word = "vendetta"
for sent in tokenized_data:
    if word in sent:
        print(" ".join(sent))
        printed += 1
    if printed >= max_sents:
        break

In [5]:
# Let's now compare the model to a pre-trained one.

import gensim.downloader as api

contrast_model = api.load("glove-wiki-gigaword-50")
contrast_model.most_similar("attack")

[('attacks', 0.9274438619613647),
 ('bombing', 0.8695274591445923),
 ('suicide', 0.8600563406944275),
 ('raid', 0.8567200303077698),
 ('bomb', 0.8250047564506531),
 ('ambush', 0.8242325782775879),
 ('killing', 0.8198403120040894),
 ('deadly', 0.8161444067955017),
 ('strikes', 0.8123002648353577),
 ('militants', 0.8120480179786682)]

In [6]:
# Importantly, the vectors of the two models are not directly comparable, as they are trained independently.

vector1 = model["attack"]
vector2 = contrast_model["attack"]
print(f"Vector 1: {vector1[:10]}")
print(f"Vector 2: {vector2[:10]}")

Vector 1: [-1.1933035   1.6471744   0.92219937 -1.18339     1.0685806   0.96526486
  0.3699839  -0.4874719   0.22341654 -0.86063737]
Vector 2: [ 1.4703  -0.9337   0.51369 -0.19082  0.50227  0.13241  0.12726  0.63662
 -0.13905 -0.32585]


In [7]:
# This will not work, as the vectors are not of the same size.
# And even if they were, the nearest neighbours would not be meaningful as the models are independently trained.
model.most_similar(vector2)

ValueError: operands could not be broadcast together with shapes (200,) (50,) (200,) 

In [8]:
# What you can do, however, is to contrast the two models by contrasting the nearest neighbours of a word.

attack1 = [word_score[0] for word_score in model.most_similar("attack", topn=20)]
attack2 = [word_score[0] for word_score in contrast_model.most_similar("attack", topn=20)]

attack1

['attacks',
 'insult',
 'vendetta',
 'affront',
 'vendettas',
 'attacks.--',
 'crusade',
 'insults',
 'slur',
 'accusation',
 'insulting',
 'threat',
 'harassment',
 'grudge',
 'observation',
 'provocation',
 'ad',
 'attacking',
 'slander',
 'attack.--']

In [9]:
for word in attack1:
    if word not in attack2:
        print(f"Word {word} is in model 1 but not in model 2")

Word insult is in model 1 but not in model 2
Word vendetta is in model 1 but not in model 2
Word affront is in model 1 but not in model 2
Word vendettas is in model 1 but not in model 2
Word attacks.-- is in model 1 but not in model 2
Word crusade is in model 1 but not in model 2
Word insults is in model 1 but not in model 2
Word slur is in model 1 but not in model 2
Word accusation is in model 1 but not in model 2
Word insulting is in model 1 but not in model 2
Word threat is in model 1 but not in model 2
Word harassment is in model 1 but not in model 2
Word grudge is in model 1 but not in model 2
Word observation is in model 1 but not in model 2
Word provocation is in model 1 but not in model 2
Word ad is in model 1 but not in model 2
Word attacking is in model 1 but not in model 2
Word slander is in model 1 but not in model 2
Word attack.-- is in model 1 but not in model 2


In [10]:
# Complete the following function that should return a percentage of overlapping words between two lists of words.
def list_overlap(list1, list2):
    # ... your code here
    empty_list = []
    for word in list1:
        if word in list2:
            empty_list.append(word)
    len(empty_list)/len(list1)
    overlap_percentage = len(empty_list)/len(list1) * 100
    return overlap_percentage

In [11]:
# And now integrate the previous steps and your list_overlap function into the function below.
# It should take two words and two models as input and return the percentage of overlapping words among the n nearest neighbours of the words between the two models.

def overlap_percentage(word, model1, model2, n=20):
    # ... your code here
    # Get the top N nearest neighbors from each model
    neighbors1 = [word_score[0] for word_score in model1.most_similar(word, topn=n)]
    neighbors2 = [word_score[0] for word_score in model2.most_similar(word, topn=n)]
    
    # Find the intersection of both lists
    overlap_count = len(set(neighbors1) & set(neighbors2))
    
    # Compute the overlap percentage
    overlap_percentage = (overlap_count / n) * 100  # Use 'n' instead of len(neighbors1) for consistency

    return overlap_percentage

In [12]:
# Now we can go from a corpus-based to a corpus-driven perspective and look at all words in the vocabulary.

# We find the vocab of the model by using the `index_to_key` attribute.
print(model.index_to_key[:10])

# We build a set of words that are shared between the two models.

shared_vocab = set(model.index_to_key).intersection(set(contrast_model.index_to_key))
print(f"Shared vocabulary size: {len(shared_vocab)}")

# Now we can iterate over the vocabulary and save the overlap percentages in a dictionary.
overlap_dict = {}

for word in shared_vocab:
    overlap = overlap_percentage(word, model, contrast_model)
    overlap_dict[word] = overlap

[',', '.', 'the', 'to', 'you', '(', ')', 'and', 'of', 'a']
Shared vocabulary size: 41054


In [13]:
# Now we can sort the dictionary with this function

def sort_dict(dic, reverse=True):
    return dict(sorted(dic.items(), key=lambda item: item[1], reverse=reverse))

sorted_overlap = sort_dict(overlap_dict)

In [14]:
sorted_overlap

{'22': 90.0,
 '23': 90.0,
 '20': 85.0,
 '21': 85.0,
 '14': 80.0,
 '3': 80.0,
 '28': 80.0,
 '31': 80.0,
 '27': 80.0,
 '29': 80.0,
 '18': 80.0,
 '19': 80.0,
 '17': 80.0,
 '26': 80.0,
 '10': 75.0,
 '15': 75.0,
 '24': 75.0,
 '7': 75.0,
 '13': 75.0,
 '11': 75.0,
 '21st': 75.0,
 '2': 75.0,
 '16th': 75.0,
 '25': 75.0,
 '12': 75.0,
 '9': 70.0,
 '15th': 70.0,
 'ಟ': 70.0,
 '13th': 70.0,
 '5': 70.0,
 'ಗ': 70.0,
 '5th': 70.0,
 '6': 70.0,
 'ಜ': 70.0,
 'ಮ': 70.0,
 '7th': 70.0,
 '16': 70.0,
 '30': 70.0,
 'son': 70.0,
 '6th': 70.0,
 '4': 70.0,
 'ಸ': 70.0,
 'ಡ': 65.0,
 '53': 65.0,
 'ರ': 65.0,
 'ವ': 65.0,
 'ಳ': 65.0,
 '1984': 65.0,
 '1993': 65.0,
 '1989': 65.0,
 '54': 65.0,
 '1979': 65.0,
 '8': 65.0,
 '8th': 65.0,
 '1': 65.0,
 'ದ': 65.0,
 '51': 65.0,
 '4th': 60.0,
 'extraordinarily': 60.0,
 '43': 60.0,
 '1992': 60.0,
 '1,000': 60.0,
 '2014': 60.0,
 'ಲ': 60.0,
 'ನ': 60.0,
 'ಶ': 60.0,
 '1994': 60.0,
 '2000': 60.0,
 '57': 60.0,
 '88': 60.0,
 '1999': 60.0,
 '58': 60.0,
 '59': 60.0,
 '14th': 60.0,
 '12th': 6

In [15]:
sort_dict(overlap_dict, reverse=False)

{'peed': 0.0,
 'post-war': 0.0,
 'vindictiveness': 0.0,
 'neoclassical': 0.0,
 'flail': 0.0,
 'indictment': 0.0,
 'deprived': 0.0,
 '20k': 0.0,
 'popup': 0.0,
 'crusaders': 0.0,
 'stumped': 0.0,
 'prosecute': 0.0,
 'fourteenth': 0.0,
 'non-white': 0.0,
 'grunt': 0.0,
 'reactivity': 0.0,
 'sandbox': 0.0,
 'daring': 0.0,
 'clipboard': 0.0,
 'loony': 0.0,
 'kayak': 0.0,
 'govt': 0.0,
 'reprieve': 0.0,
 'negotiators': 0.0,
 'cohen': 0.0,
 'non-sentient': 0.0,
 'sovereigns': 0.0,
 'soluble': 0.0,
 'inset': 0.0,
 'std': 0.0,
 'fellowship': 0.0,
 'moore': 0.0,
 'fot': 0.0,
 'edu': 0.0,
 'battlegrounds': 0.0,
 'reappearance': 0.0,
 'newish': 0.0,
 'egg': 0.0,
 'superficially': 0.0,
 '138': 0.0,
 'confessions': 0.0,
 'timescales': 0.0,
 'minions': 0.0,
 'gardeners': 0.0,
 'crumbs': 0.0,
 'clergyman': 0.0,
 'subsidence': 0.0,
 'optician': 0.0,
 'ever-changing': 0.0,
 'mandated': 0.0,
 'misinform': 0.0,
 'cleanly': 0.0,
 'self-identifying': 0.0,
 'sanskrit': 0.0,
 'seminars': 0.0,
 'raining': 0.0

In [16]:
model.most_similar("spongebob")

[('386', 0.8149140477180481),
 ('SynaptiCAD', 0.8127642869949341),
 ('Epilepsy', 0.8126484155654907),
 ('Barros', 0.8122938275337219),
 ('waller', 0.810689389705658),
 ('Paco', 0.8102414608001709),
 ('Alemayehu', 0.8082616329193115),
 ('Haller', 0.8078862428665161),
 ('Chanderdat', 0.8072391748428345),
 ('Ele', 0.8070787787437439)]

In [17]:
contrast_model.most_similar("spongebob")

[('squarepants', 0.9683283567428589),
 ('buffy', 0.6830703616142273),
 ('nickelodeon', 0.6683040857315063),
 ('degeneres', 0.6681976914405823),
 ('cbbc', 0.666020929813385),
 ('veggietales', 0.6627994179725647),
 ('tigger', 0.6526259779930115),
 ('roseanne', 0.6281614303588867),
 ('shrek', 0.6193712949752808),
 ('whisperer', 0.6192430257797241)]

# Original model

In [18]:
# To load the model later, we can use the following command:
from gensim.models import Word2Vec
model = Word2Vec.load("data_ori/wue15_word2vec.model")

In [19]:
# The interesting part of the model is the `wv` attribute, which contains the word vectors.
model = model.wv

In [20]:
# Let's see if it worked by asking for nearest neighbors of a word.
model.most_similar("attack", topn=20)

[('attacks', 0.6653181910514832),
 ('insult', 0.6299493312835693),
 ('vendetta', 0.627418041229248),
 ('attack.--', 0.6023419499397278),
 ('slur', 0.5512348413467407),
 ('crusade', 0.5488301515579224),
 ('affront', 0.5374261736869812),
 ('vendettas', 0.5121461153030396),
 ('accusation', 0.5028104186058044),
 ('attacks.--', 0.5013230443000793),
 ('slander', 0.5011648535728455),
 ('grudge', 0.5004501342773438),
 ('threat', 0.494328111410141),
 ('insults', 0.49367964267730713),
 ('allegation', 0.49109581112861633),
 ('harassment', 0.49007099866867065),
 ('ad', 0.48044106364250183),
 ('atacks', 0.4773373603820801),
 ('observation', 0.4734446406364441),
 ('insulting', 0.4729105234146118)]

In [21]:
# Let's see how many words are in the vocabulary (= how many types from our training data occur at least 5 times in the corpus).
print(f"Vocabulary size: {len(model)}")

Vocabulary size: 115621


In [27]:
max_sents = 10
printed = 0
word = "vendetta"
for sent in tokenized_data:
    if word in sent:
        print(" ".join(sent))
        printed += 1
    if printed >= max_sents:
        break

Besides , I have a vendetta against forced antidepressants .
You are now conducting a personal vendetta against me. You've decided to remove every single edit I've made to Trek pages .
-- 15:12 , 18 December 2014 ( UTC ) Your AIV report I removed it. There is no " vendetta " , and if you're filing a complaint for edit warring you should have reported the other party as well .
Should you come to your senses and decide to make constructive edits again , I don't think we'll have a problem , but if you insist on continuing your vendetta against the Powerpuff Girls , It won't end well .
With no edits to his account there's no way to tell if it's tCv , this impersonator , someone with a vendetta against them , or just another vandal .
Admit that you have a personal vendetta against this person .
I just can't understand why SkyWriter has such a vendetta to push " clarifying " the article as " Christian " at the expense of marginalization of many Messianic Jews who reject the label not only no

In [22]:
# Let's now compare the model to a pre-trained one.

import gensim.downloader as api

contrast_model = api.load("glove-wiki-gigaword-50")
contrast_model.most_similar("attack")

[('attacks', 0.9274438619613647),
 ('bombing', 0.8695274591445923),
 ('suicide', 0.8600563406944275),
 ('raid', 0.8567200303077698),
 ('bomb', 0.8250047564506531),
 ('ambush', 0.8242325782775879),
 ('killing', 0.8198403120040894),
 ('deadly', 0.8161444067955017),
 ('strikes', 0.8123002648353577),
 ('militants', 0.8120480179786682)]

In [23]:
# Importantly, the vectors of the two models are not directly comparable, as they are trained independently.

vector1 = model["attack"]
vector2 = contrast_model["attack"]
print(f"Vector 1: {vector1[:10]}")
print(f"Vector 2: {vector2[:10]}")

Vector 1: [ 0.80315816  1.2910506  -0.90030617  0.81309754  0.18535283  1.4075223
 -0.75989944 -1.7453072   0.42122108 -1.1356882 ]
Vector 2: [ 1.4703  -0.9337   0.51369 -0.19082  0.50227  0.13241  0.12726  0.63662
 -0.13905 -0.32585]


In [24]:
# This will not work, as the vectors are not of the same size.
# And even if they were, the nearest neighbours would not be meaningful as the models are independently trained.
model.most_similar(vector2)

ValueError: operands could not be broadcast together with shapes (200,) (50,) (200,) 

In [25]:
# What you can do, however, is to contrast the two models by contrasting the nearest neighbours of a word.

attack1 = [word_score[0] for word_score in model.most_similar("attack", topn=20)]
attack2 = [word_score[0] for word_score in contrast_model.most_similar("attack", topn=20)]

attack1

['attacks',
 'insult',
 'vendetta',
 'attack.--',
 'slur',
 'crusade',
 'affront',
 'vendettas',
 'accusation',
 'attacks.--',
 'slander',
 'grudge',
 'threat',
 'insults',
 'allegation',
 'harassment',
 'ad',
 'atacks',
 'observation',
 'insulting']

In [26]:
for word in attack1:
    if word not in attack2:
        print(f"Word {word} is in model 1 but not in model 2")

Word insult is in model 1 but not in model 2
Word vendetta is in model 1 but not in model 2
Word attack.-- is in model 1 but not in model 2
Word slur is in model 1 but not in model 2
Word crusade is in model 1 but not in model 2
Word affront is in model 1 but not in model 2
Word vendettas is in model 1 but not in model 2
Word accusation is in model 1 but not in model 2
Word attacks.-- is in model 1 but not in model 2
Word slander is in model 1 but not in model 2
Word grudge is in model 1 but not in model 2
Word threat is in model 1 but not in model 2
Word insults is in model 1 but not in model 2
Word allegation is in model 1 but not in model 2
Word harassment is in model 1 but not in model 2
Word ad is in model 1 but not in model 2
Word atacks is in model 1 but not in model 2
Word observation is in model 1 but not in model 2
Word insulting is in model 1 but not in model 2


In [27]:
# Complete the following function that should return a percentage of overlapping words between two lists of words.
def list_overlap(list1, list2):
    # ... your code here
    empty_list = []
    for word in list1:
        if word in list2:
            empty_list.append(word)
    len(empty_list)/len(list1)
    overlap_percentage = len(empty_list)/len(list1) * 100
    return overlap_percentage

In [28]:
# And now integrate the previous steps and your list_overlap function into the function below.
# It should take two words and two models as input and return the percentage of overlapping words among the n nearest neighbours of the words between the two models.

def overlap_percentage(word, model1, model2, n=20):
    # ... your code here
    # Get the top N nearest neighbors from each model
    neighbors1 = [word_score[0] for word_score in model1.most_similar(word, topn=n)]
    neighbors2 = [word_score[0] for word_score in model2.most_similar(word, topn=n)]
    
    # Find the intersection of both lists
    overlap_count = len(set(neighbors1) & set(neighbors2))
    
    # Compute the overlap percentage
    overlap_percentage = (overlap_count / n) * 100  # Use 'n' instead of len(neighbors1) for consistency

    return overlap_percentage

In [29]:
# Now we can go from a corpus-based to a corpus-driven perspective and look at all words in the vocabulary.

# We find the vocab of the model by using the `index_to_key` attribute.
print(model.index_to_key[:10])

# We build a set of words that are shared between the two models.

shared_vocab = set(model.index_to_key).intersection(set(contrast_model.index_to_key))
print(f"Shared vocabulary size: {len(shared_vocab)}")

# Now we can iterate over the vocabulary and save the overlap percentages in a dictionary.
overlap_dict = {}

for word in shared_vocab:
    overlap = overlap_percentage(word, model, contrast_model)
    overlap_dict[word] = overlap

[',', '.', 'the', 'to', 'you', '(', ')', 'and', 'of', 'a']
Shared vocabulary size: 41054


In [30]:
# Now we can sort the dictionary with this function

def sort_dict(dic, reverse=True):
    return dict(sorted(dic.items(), key=lambda item: item[1], reverse=reverse))

sorted_overlap = sort_dict(overlap_dict)

In [31]:
sorted_overlap

{'22': 90.0,
 '23': 90.0,
 '21': 85.0,
 '20': 80.0,
 '28': 80.0,
 '7': 80.0,
 '27': 80.0,
 '29': 80.0,
 '18': 80.0,
 '16th': 80.0,
 '19': 80.0,
 '17': 80.0,
 '26': 80.0,
 '14': 75.0,
 '10': 75.0,
 '3': 75.0,
 '15': 75.0,
 '1993': 75.0,
 '24': 75.0,
 '31': 75.0,
 '16': 75.0,
 '13': 75.0,
 '21st': 75.0,
 '2': 75.0,
 '4': 75.0,
 '25': 75.0,
 '12': 75.0,
 '1': 75.0,
 '4th': 70.0,
 '9': 70.0,
 'ಲ': 70.0,
 'ಟ': 70.0,
 'ಶ': 70.0,
 '13th': 70.0,
 '5': 70.0,
 'ಗ': 70.0,
 'ಳ': 70.0,
 '5th': 70.0,
 '6': 70.0,
 'ಮ': 70.0,
 '1990': 70.0,
 '11': 70.0,
 '30': 70.0,
 'son': 70.0,
 'ಡ': 65.0,
 '53': 65.0,
 'ರ': 65.0,
 '1994': 65.0,
 'ವ': 65.0,
 'ಪ': 65.0,
 'ಕ': 65.0,
 'ಜ': 65.0,
 'ತ': 65.0,
 '9th': 65.0,
 '7th': 65.0,
 '54': 65.0,
 '8': 65.0,
 '17th': 65.0,
 '6th': 65.0,
 'daughter': 65.0,
 '8th': 65.0,
 'ಸ': 65.0,
 'ದ': 65.0,
 '51': 65.0,
 '1991': 60.0,
 '43': 60.0,
 '1992': 60.0,
 '1,000': 60.0,
 '2014': 60.0,
 '15th': 60.0,
 'remarkably': 60.0,
 'ನ': 60.0,
 '1995': 60.0,
 '2000': 60.0,
 '57': 60.0,


In [32]:
sort_dict(overlap_dict, reverse=False)

{'peed': 0.0,
 'post-war': 0.0,
 'vindictiveness': 0.0,
 'neoclassical': 0.0,
 'flail': 0.0,
 'indictment': 0.0,
 'popup': 0.0,
 'crusaders': 0.0,
 'stumped': 0.0,
 'fourteenth': 0.0,
 'non-white': 0.0,
 'grunt': 0.0,
 'reactivity': 0.0,
 'sandbox': 0.0,
 'daring': 0.0,
 'clipboard': 0.0,
 'loony': 0.0,
 'kayak': 0.0,
 'govt': 0.0,
 'reprieve': 0.0,
 'negotiators': 0.0,
 'cohen': 0.0,
 '3s': 0.0,
 'non-sentient': 0.0,
 'sovereigns': 0.0,
 'soluble': 0.0,
 'inset': 0.0,
 'std': 0.0,
 'fellowship': 0.0,
 'moore': 0.0,
 'fot': 0.0,
 'edu': 0.0,
 'battlegrounds': 0.0,
 'reappearance': 0.0,
 'newish': 0.0,
 'egg': 0.0,
 'superficially': 0.0,
 '138': 0.0,
 'confessions': 0.0,
 'timescales': 0.0,
 'kabhi': 0.0,
 'minions': 0.0,
 'gardeners': 0.0,
 'crumbs': 0.0,
 'clergyman': 0.0,
 'subsidence': 0.0,
 'optician': 0.0,
 'ever-changing': 0.0,
 'mandated': 0.0,
 'misinform': 0.0,
 'cleanly': 0.0,
 'self-identifying': 0.0,
 'sanskrit': 0.0,
 'seminars': 0.0,
 'raining': 0.0,
 '1809': 0.0,
 'unsym

In [33]:
model.most_similar("spongebob")

[('Din-e', 0.7905212640762329),
 ('Ujfaluši', 0.7738329768180847),
 ('Rikki', 0.771855890750885),
 ('chính', 0.770332396030426),
 ('kaul', 0.7690215110778809),
 ('Haller', 0.768900990486145),
 ('costumbre', 0.7681359648704529),
 ('Branders.com', 0.7681124806404114),
 ('Velázquez', 0.7669525146484375),
 ('Voskhod', 0.7669331431388855)]

In [34]:
contrast_model.most_similar("spongebob")

[('squarepants', 0.9683283567428589),
 ('buffy', 0.6830703616142273),
 ('nickelodeon', 0.6683040857315063),
 ('degeneres', 0.6681976914405823),
 ('cbbc', 0.666020929813385),
 ('veggietales', 0.6627994179725647),
 ('tigger', 0.6526259779930115),
 ('roseanne', 0.6281614303588867),
 ('shrek', 0.6193712949752808),
 ('whisperer', 0.6192430257797241)]