In [6]:
# Mount Google Drive inside the Colab VM so files stored in Drive can be read
# and written from this notebook. force_remount=True requests a fresh mount
# even when one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Folder (relative to the Drive root) that holds the project files.
FOLDERNAME = "cs231n/project"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Append the project folder to sys.path so that the Python interpreter of the
# Colab VM can import python files stored in it.
# NOTE(review): later cells spell the Drive root as '/content/drive/MyDrive'
# while this line uses 'My Drive' - confirm both resolve to the same folder.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

Mounted at /content/drive


In [1]:
# Fetch the English ConceptNet Numberbatch 19.08 embeddings (~310 MB archive)
# and unpack them into the working directory. Both steps are skipped when the
# unpacked file is already present, so re-running the cell (or Run All) does
# not download the archive again or collide with the existing .txt file.
# `wget -c` resumes a partially downloaded archive instead of starting over.
![ -f numberbatch-en-19.08.txt ] || wget -c https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
![ -f numberbatch-en-19.08.txt ] || gunzip numberbatch-en-19.08.txt.gz

--2025-06-02 04:52:11--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 16.182.96.241, 3.5.8.170, 3.5.27.206, ...
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|16.182.96.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 325403502 (310M) [application/x-gzip]
Saving to: ‘numberbatch-en-19.08.txt.gz’


2025-06-02 04:52:17 (51.3 MB/s) - ‘numberbatch-en-19.08.txt.gz’ saved [325403502/325403502]



In [2]:
import numpy as np

def load_embeddings(file_path):
    """Read a space-separated word-vector text file into a dictionary.

    The first line of the file is treated as a header and discarded. Every
    following line is expected to hold a word followed by its vector
    components, all separated by single spaces.

    Args:
        file_path: path of the UTF-8 encoded text file to read.

    Returns:
        dict mapping each word to a 1-D NumPy array built from the parsed
        float components of its line.
    """
    table = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        # The header line carries no vector, so consume it before parsing.
        next(handle)
        for row in handle:
            token, *components = row.rstrip().split(' ')
            table[token] = np.array([float(value) for value in components])
    return table

# Load the Numberbatch embeddings from the unpacked text file (looked up
# relative to the current working directory) and report how many entries
# were read.
embedding_path = "numberbatch-en-19.08.txt"
embeddings = load_embeddings(embedding_path)
print(f"Loaded {len(embeddings)} words.")

Loaded 516782 words.


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_related_words(prompt, embeddings, top_n):
    """Return the vocabulary words most similar to ``prompt``.

    Similarity is the cosine similarity between the prompt's embedding and
    every embedding in the vocabulary.

    Args:
        prompt: word to look up; it must be a key of ``embeddings``.
        embeddings: dict mapping word -> 1-D numeric vector.
        top_n: maximum number of related words to return; values <= 0
            yield an empty list.

    Returns:
        List of ``(word, similarity)`` tuples in descending similarity, never
        containing ``prompt`` itself. An empty list (after printing a
        message) when ``prompt`` is not in ``embeddings``.
    """
    if prompt not in embeddings:
        print(f"'{prompt}' not found in embeddings!")
        return []

    limit = max(top_n, 0)
    prompt_vector = embeddings[prompt].reshape(1, -1)
    words = list(embeddings.keys())
    vectors = np.array([embeddings[word] for word in words])

    similarities = cosine_similarity(prompt_vector, vectors)[0]
    # A stable sort keeps tied scores in vocabulary order, so the result is
    # deterministic when several words share the same vector.
    sorted_indices = np.argsort(-similarities, kind="stable")

    # Drop the prompt by name instead of assuming it is ranked first: a word
    # with an identical vector can tie with it and take rank 0. Looking at
    # limit + 1 candidates leaves `limit` words once the prompt is removed.
    candidates = sorted_indices[:limit + 1]
    top_words = [
        (words[i], similarities[i]) for i in candidates if words[i] != prompt
    ][:limit]

    return top_words

# Example usage: look up the 25 nearest neighbours of a single word and print
# them one per line as "word: score", with the score shown to 4 decimals.
prompt_word = "soapbox"
num_related_words = 25
top_related = get_top_related_words(prompt_word, embeddings, num_related_words)
for word, score in top_related:
    print(f"{word}: {score:.4f}")


soap_box: 0.8627
soapbox_car: 0.6999
stump_orator: 0.5671
podium: 0.5362
pulpit: 0.5332
pulpitical: 0.5212
making_speech: 0.5079
dais: 0.5067
pulpitish: 0.4950
rostrum: 0.4804
reviewing_stand: 0.4664
abat_voix: 0.4631
daïs: 0.4579
indoor_stage: 0.4577
pulpitry: 0.4571
abatvoix: 0.4569
rostra: 0.4516
daises: 0.4457
lectern: 0.4343
pulpiteer: 0.4230
pulpits: 0.4226
podiumed: 0.4213
dancing_platform: 0.4200
rants: 0.4146
preaching_cross: 0.4132


In [7]:
import pickle
# Load the pickled DAVIS labels object from the project folder on Drive and
# print it for inspection.
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only
# open pickle files that you created yourself or otherwise trust.
with open('/content/drive/MyDrive/cs231n/project/DAVIS_labels.pkl', 'rb') as f:
    loaded_list = pickle.load(f)
print(loaded_list)

['bear', 'blackswan', 'bmx-bumps', 'bmx-trees', 'boat', 'breakdance', 'breakdance-flare', 'bus', 'camel', 'car-roundabout', 'car-shadow', 'car-turn', 'cows', 'dance-jump', 'dance-twirl', 'dog', 'dog-agility', 'drift-chicane', 'drift-straight', 'drift-turn', 'elephant', 'flamingo', 'goat', 'hike', 'hockey', 'horsejump-high', 'horsejump-low', 'kite-surf', 'kite-walk', 'libby', 'lucia', 'mallard-fly', 'mallard-water', 'motocross-bumps', 'motocross-jump', 'motorbike', 'paragliding', 'paragliding-launch', 'parkour', 'rhino', 'rollerblade', 'scooter-black', 'scooter-gray', 'soapbox', 'soccerball', 'stroller', 'surf', 'swing', 'tennis', 'train']


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_combined_embedding(word, embeddings):
  """Average the embeddings of the pieces of a hyphenated/multi-part word.

  Hyphens in ``word`` are turned into spaces and the result is split on
  whitespace; pieces that are not keys of ``embeddings`` are ignored.

  Args:
    word: label such as "kite-surf".
    embeddings: dict mapping word -> vector.

  Returns:
    The element-wise mean of the vectors of all pieces found, or None when
    none of the pieces is in ``embeddings``.
  """
  found_vectors = []
  for piece in word.replace("-", " ").split():
    if piece in embeddings:
      found_vectors.append(embeddings[piece])

  if len(found_vectors) == 0:
    return None
  return np.mean(found_vectors, axis=0)

def get_top_related_words_list(prompts, embeddings, top_n):
  """Find the most similar vocabulary words for each prompt.

  The query vector of a prompt is its own embedding when the prompt is a key
  of ``embeddings``; otherwise it is whatever ``get_combined_embedding``
  returns for it. Vocabulary words are ranked by cosine similarity to that
  query vector.

  Args:
    prompts: iterable of prompt strings.
    embeddings: dict mapping word -> 1-D numeric vector (all the same length).
    top_n: maximum number of related words per prompt; values <= 0 yield
      empty lists.

  Returns:
    dict mapping every prompt to a list of ``(word, similarity)`` tuples in
    descending similarity. The prompt itself is never listed. Prompts for
    which no query vector can be built map to an empty list (a message is
    printed for each).
  """
  result = {}
  limit = max(top_n, 0)
  words = list(embeddings.keys())

  # Normalise the vocabulary matrix once, outside the prompt loop, so that
  # cosine similarity becomes a single matrix-vector product per prompt.
  # With an empty vocabulary no prompt can be resolved, so the matrix is
  # never used and is left unset.
  unit_vectors = None
  if words:
    unit_vectors = np.array([embeddings[word] for word in words], dtype=float)
    norms = np.linalg.norm(unit_vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows keep similarity 0 instead of NaN
    unit_vectors /= norms

  for prompt in prompts:
    if prompt in embeddings:
      query = embeddings[prompt]
    else:
      # Try to get combined embedding for hyphenated words
      query = get_combined_embedding(prompt, embeddings)
      if query is None:
        print(f"'{prompt}' not found and no valid parts!")
        result[prompt] = []
        continue

    query = np.asarray(query, dtype=float).reshape(-1)
    query_norm = np.linalg.norm(query)
    if query_norm > 0:
      query = query / query_norm

    similarities = unit_vectors @ query
    # Stable sort: tied scores stay in vocabulary order (deterministic output).
    order = np.argsort(-similarities, kind="stable")

    # Remove the prompt by name rather than blindly skipping rank 0: a
    # combined (out-of-vocabulary) prompt is not in the vocabulary at all, so
    # skipping rank 0 would discard its best match. limit + 1 candidates are
    # enough to still return `limit` words after the prompt is filtered out.
    candidates = order[:limit + 1]
    result[prompt] = [
      (words[i], similarities[i]) for i in candidates if words[i] != prompt
    ][:limit]

  return result
# Look up the 25 nearest neighbours for every label in loaded_list.
# NOTE(review): the print below dumps the whole mapping (one list of
# (word, score) tuples per label), which produces a very large cell output;
# consider previewing only a few entries.
num_related_words = 25
top_related_words = get_top_related_words_list(loaded_list, embeddings, num_related_words)
print(top_related_words)

'blackswan' not found and no valid parts!
{'bear': [('bearest', np.float64(0.9557343053417982)), ('beareth', np.float64(0.9521229330689869)), ('rebear', np.float64(0.9291696740875154)), ('cinnamon_bear', np.float64(0.921328338754523)), ('kamchatka_brown_bear', np.float64(0.9169017811439297)), ("bergman's_bear", np.float64(0.9169017811439297)), ('himalayan_brown_bear', np.float64(0.9169017811439297)), ('syrian_brown_bear', np.float64(0.914868452532226)), ('forthbear', np.float64(0.9129296053576413)), ('atlas_bear', np.float64(0.9108688398506375)), ('bearproof', np.float64(0.9003535842976019)), ('formosan_black_bear', np.float64(0.8874738720697124)), ('sun_bear', np.float64(0.8766908420114662)), ('brown_fur', np.float64(0.8723509708366437)), ('big_animal', np.float64(0.8682817869832731)), ('bearly', np.float64(0.8657648973321971)), ('cave_bear', np.float64(0.8645256922779995)), ('inbear', np.float64(0.8635077228899443)), ('asiatic_black_bear', np.float64(0.8524683184848215)), ('ursoid', 

In [10]:
import json
# Save the related-words mapping to 'conceptnet_similar_words.json' in the
# project folder on Drive.
with open('/content/drive/MyDrive/cs231n/project/conceptnet_similar_words.json', 'w') as f:
    json.dump(top_related_words, f, indent=4)  # indent=4 pretty-prints the JSON