In [None]:
# Execute if you are working in colab
!pip install -qq numpy==1.26.4 gensim
get_ipython().kernel.do_shutdown (restart=True)

In [None]:
# Step 1: Load the data (we assume that we have one sentence per line and that the tokens are separated by whitespace)

# The encoding is passed explicitly so that the file is decoded the same way on every platform;
# without it, Python falls back to the locale's default encoding.
# NOTE(review): assumes wue15.txt is UTF-8 encoded - confirm.
with open("wue15.txt", "r", encoding="utf-8") as f:
    # `readlines()` returns one string per line of the file.
    data = f.readlines()

In [None]:
# The data is a list of strings, where each string is one line from the file
# (as returned by `readlines()` in the previous cell).
# Let's display the first 10 lines to see what we have.
data[:10]

In [None]:
# Next, every line is turned into a list of tokens: `strip()` removes leading/trailing whitespace,
# and `split()` without arguments cuts the string at whitespace.

# A list comprehension does this for all lines in a single expression ...
tokenized_data = [line.strip().split() for line in data]

# ... and is equivalent to the more explicit loop:
# tokenized_data = []
# for line in data:
#     tokenized_data.append(line.strip().split())

# Display the first 10 lines again, this time as token lists.
tokenized_data[:10]

In [None]:
# Now, we need to import the Gensim library to create a Word2Vec model using the tokenized data.
from gensim.models import Word2Vec
# We will create a Word2Vec model using the CBOW approach. The parameters are:
# - `vector_size`: The size of the word vectors (we will use 200 dimensions).
# - `window`: The maximum distance between the current and predicted word within a sentence (we will use a window of 5).
# - `min_count`: Ignores all words with total frequency lower than this (we will use 5).
# - `workers`: The number of worker threads to train the model (we will use all available cores except 1).
# - `sg`: Skip-gram model (1) or CBOW (0). We will use CBOW (0).

# get available cores
import multiprocessing
# Leave one core free, but never go below one worker: on a single-core machine
# `cpu_count() - 1` would be 0, i.e. no worker thread at all.
cores = max(1, multiprocessing.cpu_count() - 1)
# On very large machines (more than 50 usable cores) fall back to 20 workers.
if cores > 50:
    cores = 20

# Create the Word2Vec model
model = Word2Vec(sentences=tokenized_data, vector_size=200, window=5, min_count=5, workers=cores, sg=0)

In [None]:
# We can also save the model to a file for later use.
# The file "wue15_word2vec.model" is written relative to the current working directory.
model.save("wue15_word2vec.model")

In [None]:
# To load the model later, we can use the following commands.
# The import is repeated here (it also appears in the training cell), presumably so that
# this cell can be run on its own without training the model again.
from gensim.models import Word2Vec
model = Word2Vec.load("wue15_word2vec.model")

In [None]:
# The interesting part of the model is the `wv` attribute, which contains the word vectors.
# From here on, `model` refers to these word vectors and no longer to the full Word2Vec model.
# `getattr` with a fallback keeps this cell re-runnable: if `model` has already been replaced
# by its word vectors (an object without a `wv` attribute), it is left unchanged instead of
# raising an AttributeError.
model = getattr(model, "wv", model)

In [None]:
# Let's see if it worked by asking for the 20 nearest neighbors of a word.
# Note that `model` now holds the word vectors (the former `model.wv`).
model.most_similar("attack", topn=20)

In [None]:
# Let's see how many words are in the vocabulary (= how many types from our training data occur at least 5 times in the corpus).
# The threshold of 5 is the `min_count` value that was passed when the model was trained.
print(f"Vocabulary size: {len(model)}")

In [None]:
# Print up to `max_sents` sentences from the corpus that contain `word`.
max_sents = 10
printed = 0
word = "vendetta"
for sent in tokenized_data:
    # Check the limit before printing, so that a limit of 0 prints nothing at all.
    if printed >= max_sents:
        break
    if word in sent:
        print(" ".join(sent))
        printed += 1

In [None]:
# Let's now compare the model to a pre-trained one.

import gensim.downloader as api

# Load the pre-trained "glove-wiki-gigaword-50" vectors through gensim's downloader API and
# look up the nearest neighbours of "attack" (no `topn` is given, so the library default applies).
contrast_model = api.load("glove-wiki-gigaword-50")
contrast_model.most_similar("attack")

In [None]:
# Importantly, the vectors of the two models are not directly comparable, as they are trained independently.

# Look up the vector of the same word in both models and show the first 10 components of each.
vector1, vector2 = model["attack"], contrast_model["attack"]
print(f"Vector 1: {vector1[:10]}", f"Vector 2: {vector2[:10]}", sep="\n")

In [None]:
# This will not work, as the vectors are not of the same size.
# And even if they were, the nearest neighbours would not be meaningful as the models are independently trained.
# The expected error is caught and printed, so that the demonstration does not abort "Run All".
try:
    model.most_similar(vector2)
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

In [None]:
# What you can do, however, is to contrast the two models by contrasting the nearest neighbours of a word.

# Each entry returned by `most_similar` is unpacked into the word and its score; only the word is kept.
attack1 = [neighbour for neighbour, _score in model.most_similar("attack", topn=20)]
attack2 = [neighbour for neighbour, _score in contrast_model.most_similar("attack", topn=20)]

attack1

In [None]:
# Report every neighbour from the first list that is missing from the second list.
for word in attack1:
    if word in attack2:
        continue  # shared neighbour, nothing to report
    print(f"Word {word} is in model 1 but not in model 2")

In [None]:
# Returns the percentage of overlapping words between two lists of words.
def list_overlap(list1, list2):
    """Return the percentage of words in ``list1`` that also occur in ``list2``.

    Args:
        list1: The reference list of words; the percentage is relative to its length.
        list2: The list of words that ``list1`` is compared against.

    Returns:
        A float between 0.0 and 100.0. An empty ``list1`` yields 0.0.
    """
    # An empty first list has nothing that could overlap; returning early also avoids a division by zero.
    if not list1:
        return 0.0
    # create an empty list to store the overlapping words
    overlapping_words = []
    # iterate over the first list and check if each word is in the second list - if so, keep it
    for word in list1:
        if word in list2:
            overlapping_words.append(word)
    # divide the number of overlapping words by the length of the first list
    # and multiply by 100 to get the percentage
    overlap_percentage = len(overlapping_words) / len(list1) * 100
    return overlap_percentage

In [None]:
# And now integrate the previous steps and your list_overlap function into the function below.
# It should take two words and two models as input and return the percentage of overlapping words among the n nearest neighbours of the words between the two models.

def overlap_percentage(word, model1, model2, n=20):
    """Compare the ``n`` nearest neighbours of ``word`` in two models.

    Args:
        word: The word whose neighbours are looked up in both models.
        model1: First model; must provide ``most_similar(word, topn=n)``.
        model2: Second model; must provide ``most_similar(word, topn=n)``.
        n: Number of nearest neighbours requested from each model.

    Returns:
        The result of ``list_overlap`` for the two neighbour lists (model1's
        neighbours first), intended to be the overlap as a percentage.
    """
    def nearest_words(vectors):
        # Keep only the word (first element) of every entry returned by `most_similar`.
        return [entry[0] for entry in vectors.most_similar(word, topn=n)]

    neighbours_in_model1 = nearest_words(model1)
    neighbours_in_model2 = nearest_words(model2)
    return list_overlap(neighbours_in_model1, neighbours_in_model2)

In [None]:
# Now we can use the new function - what overlap percentages do you get?
# `n` is not passed, so the function's default of 20 neighbours per model is used.

overlap_percentage("attack", model, contrast_model)

In [None]:
# The same comparison for the token "2000".
overlap_percentage("2000", model, contrast_model)

In [None]:
# The same comparison for the word "teenager".
overlap_percentage("teenager", model, contrast_model)

In [None]:
# Nearest neighbours of "teenager" in our own model (no `topn` is given, so the library default applies).
model.most_similar("teenager")

In [None]:
# Nearest neighbours of "teenager" in the pre-trained model, for comparison with the previous cell.
contrast_model.most_similar("teenager")

In [None]:
# Now we can go from a corpus-based to a corpus-driven perspective and look at all words in the vocabulary.

# We find the vocab of the model by using the `index_to_key` attribute.
print(model.index_to_key[:10])

# We build a set of words that are shared between the two models.

shared_vocab = set(model.index_to_key).intersection(contrast_model.index_to_key)
print(f"Shared vocabulary size: {len(shared_vocab)}")

# Now we can compute the overlap percentage for (a sample of) the shared words and save the results in a dictionary.
import random

max_words = 1000 # set to len(shared_vocab) to use the whole vocab

# A set of strings has no stable iteration order across kernel restarts, so simply taking the
# "first" `max_words` elements of the set would pick different words on every run.
# Sorting the vocabulary and sampling with a fixed seed selects the same words each time.
sample_size = min(max_words, len(shared_vocab))
sampled_words = random.Random(42).sample(sorted(shared_vocab), sample_size)

overlap_dict = {}
for word in sampled_words:
    overlap_dict[word] = overlap_percentage(word, model, contrast_model)

In [None]:
# Now we can sort the dictionary with this function

def sort_dict(dic, reverse=True):
    """Return a new dict holding the items of ``dic`` ordered by their values.

    Args:
        dic: Dictionary whose values can be compared with each other.
        reverse: If True (the default), the largest value comes first;
            if False, the smallest value comes first.

    Returns:
        A new dict; ``dic`` itself is not modified.
    """
    def value_of(item):
        # Every item is a (key, value) pair; the value is the sort criterion.
        return item[1]

    ordered_items = list(dic.items())
    ordered_items.sort(key=value_of, reverse=reverse)
    return dict(ordered_items)

# Order the overlap percentages from highest to lowest (`reverse=True` is the default of `sort_dict`).
sorted_overlap = sort_dict(overlap_dict)

In [None]:
# Display the words together with their overlap percentage, highest overlap first.
sorted_overlap

In [None]:
# The same dictionary in ascending order, i.e. the words with the lowest overlap first.
sort_dict(overlap_dict, reverse=False)

In [None]:
# Nearest neighbours of "spongebob" in our own model.
model.most_similar("spongebob")

In [None]:
# Nearest neighbours of "spongebob" in the pre-trained model, for comparison with the previous cell.
contrast_model.most_similar("spongebob")