In [1]:
# Step 1: Load the data (we assume that we have one sentence per line and that the tokens are separated by whitespace)

with open("data/wue15.txt", "r") as f:
    data = f.readlines()

In [2]:
# As we can see, the data is a list of strings, where each string is a line from the file.
# Let's print the first 10 lines to see what we have.
data[:10]

['Welcome !\n',
 'Hello , , and welcome to Wikipedia !\n',
 'Thank you for your contributions .\n',
 'I hope you like the place and decide to stay .\n',
 'Here are a few good links for newcomers :\n',
 'The five pillars of Wikipedia\n',
 'How to edit a page\n',
 'Help pages\n',
 'Tutorial\n',
 'How to write a great article\n']

In [3]:
# Now, we need to split each line into tokens. We can do this using the `split()` method, which splits a string into a list of words based on whitespace.
# We will also remove any leading or trailing whitespace from each line.

tokenized_data = []
for line in data:
    # Strip leading/trailing whitespace and split by whitespace
    tokens = line.strip().split()
    tokenized_data.append(tokens)

# Alternatively, we could use a list comprehension to achieve the same result in a more compact way.
# tokenized_data = [line.strip().split() for line in data]

# Now, let's print the first 10 lines again to see the tokenized data.
tokenized_data[:10]

[['Welcome', '!'],
 ['Hello', ',', ',', 'and', 'welcome', 'to', 'Wikipedia', '!'],
 ['Thank', 'you', 'for', 'your', 'contributions', '.'],
 ['I',
  'hope',
  'you',
  'like',
  'the',
  'place',
  'and',
  'decide',
  'to',
  'stay',
  '.'],
 ['Here', 'are', 'a', 'few', 'good', 'links', 'for', 'newcomers', ':'],
 ['The', 'five', 'pillars', 'of', 'Wikipedia'],
 ['How', 'to', 'edit', 'a', 'page'],
 ['Help', 'pages'],
 ['Tutorial'],
 ['How', 'to', 'write', 'a', 'great', 'article']]

In [4]:
# Now, we need to import the Gensim library to create a Word2Vec model using the tokenized data.
from gensim.models import Word2Vec
# We will create a Word2Vec model using the CBOW approach. The parameters are:
# - `vector_size`: The size of the word vectors (we will use 200 dimensions).
# - `window`: The maximum distance between the current and predicted word within a sentence (we will use a window of 5).
# - `min_count`: Ignores all words with total frequency lower than this (we will use 5).
# - `workers`: The number of worker threads to train the model (we will use all available cores except 1).
# - `sg`: Skip-gram model (1) or CBOW (0). We will use CBOW (0).

# get available cores
import multiprocessing
cores = multiprocessing.cpu_count() - 1
if cores > 50:
    cores = 20

# Create the Word2Vec model
model = Word2Vec(sentences=tokenized_data, vector_size=200, window=5, min_count=5, workers=cores, sg=0)

In [5]:
# We can also save the model to a file for later use.
model.save("data/wue15_word2vec.model")

In [None]:
# To load the model later, we can use the following command:
from gensim.models import Word2Vec
model = Word2Vec.load("data/wue15_word2vec.model")

In [6]:
# The interesting part of the model is the `wv` attribute, which contains the word vectors.
model = model.wv

In [7]:
# Let's see if it worked by asking for nearest neighbors of a word.
model.most_similar("attack", topn=20)

[('attacks', 0.6587900519371033),
 ('insult', 0.6190794706344604),
 ('vendetta', 0.6006001234054565),
 ('attack.--', 0.5832156538963318),
 ('affront', 0.5648747086524963),
 ('slur', 0.5172262787818909),
 ('accusation', 0.5081024765968323),
 ('attacks.--', 0.5062654614448547),
 ('harassment', 0.49636349081993103),
 ('threat', 0.4924222230911255),
 ('insulting', 0.4907834231853485),
 ('ad', 0.48570260405540466),
 ('insults', 0.48160967230796814),
 ('attacking', 0.4793931841850281),
 ('allegation', 0.47617363929748535),
 ('observation', 0.47574079036712646),
 ('asshat', 0.4682883322238922),
 ('slander', 0.46813884377479553),
 ('animus', 0.46535375714302063),
 ('edit-war', 0.4615827798843384)]

In [8]:
# Let's see how many words are in the vocabulary (= how many types from our training data occur at least 5 times in the corpus).
print(f"Vocabulary size: {len(model)}")

Vocabulary size: 115621


In [9]:
max_sents = 10
printed = 0
word = "vendetta"
for sent in tokenized_data:
    if word in sent:
        print(" ".join(sent))
        printed += 1
    if printed >= max_sents:
        break

Besides , I have a vendetta against forced antidepressants .
You are now conducting a personal vendetta against me. You've decided to remove every single edit I've made to Trek pages .
-- 15:12 , 18 December 2014 ( UTC ) Your AIV report I removed it. There is no " vendetta " , and if you're filing a complaint for edit warring you should have reported the other party as well .
Should you come to your senses and decide to make constructive edits again , I don't think we'll have a problem , but if you insist on continuing your vendetta against the Powerpuff Girls , It won't end well .
With no edits to his account there's no way to tell if it's tCv , this impersonator , someone with a vendetta against them , or just another vandal .
Admit that you have a personal vendetta against this person .
I just can't understand why SkyWriter has such a vendetta to push " clarifying " the article as " Christian " at the expense of marginalization of many Messianic Jews who reject the label not only no

In [10]:
# Let's now compare the model to a pre-trained one.

import gensim.downloader as api

contrast_model = api.load("glove-wiki-gigaword-50")
contrast_model.most_similar("attack")

[('attacks', 0.9274438619613647),
 ('bombing', 0.8695274591445923),
 ('suicide', 0.8600563406944275),
 ('raid', 0.8567200303077698),
 ('bomb', 0.8250047564506531),
 ('ambush', 0.8242325782775879),
 ('killing', 0.8198403120040894),
 ('deadly', 0.8161444067955017),
 ('strikes', 0.8123002648353577),
 ('militants', 0.8120480179786682)]

In [11]:
# Importantly, the vectors of the two models are not directly comparable, as they are trained independently.

vector1 = model["attack"]
vector2 = contrast_model["attack"]
print(f"Vector 1: {vector1[:10]}")
print(f"Vector 2: {vector2[:10]}")

Vector 1: [-1.2542692   1.1226459   0.41601932 -0.47282043 -0.08565995  1.7557871
 -0.55072826 -0.03814235 -0.67968714 -0.7880553 ]
Vector 2: [ 1.4703  -0.9337   0.51369 -0.19082  0.50227  0.13241  0.12726  0.63662
 -0.13905 -0.32585]


In [12]:
# This will not work, as the vectors are not of the same size.
# And even if they were, the nearest neighbours would not be meaningful as the models are independently trained.
model.most_similar(vector2)

ValueError: operands could not be broadcast together with shapes (200,) (50,) (200,) 

In [13]:
# What you can do, however, is to contrast the two models by contrasting the nearest neighbours of a word.

attack1 = [word_score[0] for word_score in model.most_similar("attack", topn=20)]
attack2 = [word_score[0] for word_score in contrast_model.most_similar("attack", topn=20)]

attack1

['attacks',
 'insult',
 'vendetta',
 'attack.--',
 'affront',
 'slur',
 'accusation',
 'attacks.--',
 'harassment',
 'threat',
 'insulting',
 'ad',
 'insults',
 'attacking',
 'allegation',
 'observation',
 'asshat',
 'slander',
 'animus',
 'edit-war']

In [14]:
for word in attack1:
    if word not in attack2:
        print(f"Word {word} is in model 1 but not in model 2")

Word insult is in model 1 but not in model 2
Word vendetta is in model 1 but not in model 2
Word attack.-- is in model 1 but not in model 2
Word affront is in model 1 but not in model 2
Word slur is in model 1 but not in model 2
Word accusation is in model 1 but not in model 2
Word attacks.-- is in model 1 but not in model 2
Word harassment is in model 1 but not in model 2
Word threat is in model 1 but not in model 2
Word insulting is in model 1 but not in model 2
Word ad is in model 1 but not in model 2
Word insults is in model 1 but not in model 2
Word attacking is in model 1 but not in model 2
Word allegation is in model 1 but not in model 2
Word observation is in model 1 but not in model 2
Word asshat is in model 1 but not in model 2
Word slander is in model 1 but not in model 2
Word animus is in model 1 but not in model 2
Word edit-war is in model 1 but not in model 2


In [15]:
# Complete the following function that should return a percentage of overlapping words between two lists of words.
def list_overlap(list1, list2):
    # ... your code here
    return overlap_percentage

In [16]:
# And now integrate the previous steps and your list_overlap function into the function below.
# It should take two words and two models as input and return the percentage of overlapping words among the n nearest neighbours of the words between the two models.

def overlap_percentage(word, model1, model2, n=20):
    # ... your code here    
    return overlap_percentage

In [17]:
# Now we can go from a corpus-based to a corpus-driven perspective and look at all words in the vocabulary.

# We find the vocab of the model by using the `index_to_key` attribute.
print(model.index_to_key[:10])

# We build a set of words that are shared between the two models.

shared_vocab = set(model.index_to_key).intersection(set(contrast_model.index_to_key))
print(f"Shared vocabulary size: {len(shared_vocab)}")

# Now we can iterate over the vocabulary and save the overlap percentages in a dictionary.
overlap_dict = {}

for word in shared_vocab:
    overlap = overlap_percentage(word, model, contrast_model)
    overlap_dict[word] = overlap

[',', '.', 'the', 'to', 'you', '(', ')', 'and', 'of', 'a']
Shared vocabulary size: 41054


In [18]:
# Now we can sort the dictionary with this function

def sort_dict(dic, reverse=True):
    return dict(sorted(dic.items(), key=lambda item: item[1], reverse=reverse))

sorted_overlap = sort_dict(overlap_dict)

TypeError: '<' not supported between instances of 'function' and 'function'

In [19]:
sorted_overlap

NameError: name 'sorted_overlap' is not defined

In [20]:
sort_dict(overlap_dict, reverse=False)

TypeError: '<' not supported between instances of 'function' and 'function'

In [21]:
model.most_similar("spongebob")

[('Touchpoint.in', 0.7790597081184387),
 ('Raghunath', 0.7682300806045532),
 ('Waheed', 0.7670059204101562),
 ('Indi', 0.7634084820747375),
 ('Argentvive', 0.7631905674934387),
 ('Dionne', 0.7628239393234253),
 ('Velázquez', 0.7623804211616516),
 ('Mahaffey-Muhammad', 0.7612507939338684),
 ('Frugoo', 0.7594695687294006),
 ('Toshiyuki', 0.7582099437713623)]

In [22]:
contrast_model.most_similar("spongebob")

[('squarepants', 0.9683283567428589),
 ('buffy', 0.6830703616142273),
 ('nickelodeon', 0.6683040857315063),
 ('degeneres', 0.6681976914405823),
 ('cbbc', 0.666020929813385),
 ('veggietales', 0.6627994179725647),
 ('tigger', 0.6526259779930115),
 ('roseanne', 0.6281614303588867),
 ('shrek', 0.6193712949752808),
 ('whisperer', 0.6192430257797241)]