##### Installs: 

In [None]:
!pip install kaggle --upgrade # for kaggle download
!pip install -U sentence-transformers # for SBERT pre-trained download

##### Imports:

In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive
import random
import gensim.downloader
from sentence_transformers import util

##### Load in nouns dataset:

In [3]:
os.environ['KAGGLE_USERNAME'] = 'liamhp03'
os.environ['KAGGLE_KEY'] = '6797bd10159b399c7191a8303d78af30'
!kaggle datasets download -d leite0407/list-of-nouns # https://www.kaggle.com/datasets/leite0407/list-of-nouns
!unzip list-of-nouns.zip

Downloading list-of-nouns.zip to /content
  0% 0.00/21.9k [00:00<?, ?B/s]
100% 21.9k/21.9k [00:00<00:00, 24.8MB/s]
Archive:  list-of-nouns.zip
  inflating: nounlist.csv            


Cleaning:

In [4]:
nouns = pd.read_csv('nounlist.csv').values.tolist()
for i in range(len(nouns)):
  nouns[i]=nouns[i][0]
words_dict = {}
for i in range(len(nouns)):
  words_dict.update({nouns[i].lower(): i})

##### Load in a trained graph from drive: 

Graph training colab: https://colab.research.google.com/drive/1st1rI1A99D8lQb7eTjm3Ikj4Mm6TgLGL?authuser=3#scrollTo=DMOtDw4N3Rl1

In [5]:
drive.mount('/content/drive')
saved = os.listdir("/content/drive/My Drive/J-Term 2023/output-matricies")
t=[print(str(i)+": "+saved[i]) for i in range(len(saved))]

Mounted at /content/drive
0: unweighted-threshold@.45-onehotencoded-output.pt
1: unweighted-threshold@.45-onehotencoded-updated.pt
2: weighted-threshold@.45-onehotencoded-updated.pt
3: unweighted-threshold.45-onehot-graphout.pt
4: unw-thresh45-onehot-graphout-eval.pt
5: unw-w2vsim-50x8000x-comat2-4:37.pt
6: w2vweighted-50x8000x-comat1-9:52.pt
7: unweighted-50x8000x-comat2-10:15.pt
8: unweighted-50x8000x-comat3-11:20.pt


In [7]:
choice = 8
load_graphout = torch.load('/content/drive/My Drive/J-Term 2023/output-matricies/'+saved[choice], map_location=torch.device('cpu'))

##### Create a words dict to map words to indicies

In [8]:
words_dict = {}
for i in range(len(nouns)):
  words_dict.update({nouns[i].lower(): i})

##### Helper functions to retrieve embeddings and find the most similar word to a  new embedding:

In [15]:
word2vec = gensim.downloader.load("word2vec-google-news-300") # load a pretrained word2vec model



In [28]:
def get_emb(word, w2v):
  if (w2v):
    return word2vec[word]
  return load_graphout[words_dict[word]].detach().numpy()

def most_sim(emb, w2v):
  close_idx = 0
  close_sim = 0
  for i in range(len(nouns)):
    try:
      sim = util.cos_sim(get_emb(nouns[i], w2v), emb)
      if(sim > close_sim):
        close_sim = sim
        close_idx = i
    except:
      pass
  return nouns[close_idx]

##### Looping game code:

In [None]:
brk = False
while(not brk):
  goal = nouns[random.randint(0, len(nouns)-1)]
  resp = input("Your goal word is "+goal+", reroll? y/n   ")
  brk = (resp == "n")

brk = False
while(not brk):
  start = nouns[random.randint(0, len(nouns)-1)]
  resp = input("Your starting word is "+start+", reroll? y/n   ")
  brk = (resp == "n")

In [17]:
using_w2v = True

In [29]:
goal = "queen"
start = "king"

curr = start
counter = 0
brk=False
while curr != goal and not brk:
  
  curr_emb = get_emb(curr, using_w2v)
  print("\nCurrent word: "+curr)
  print("Guesses: "+str(counter))
  guess = input('Next: ')

  guesses = guess.split(' ')
  for i in guesses:
    try:
      if(i[0] == '+'):
        curr_emb += get_emb(i[1:], using_w2v)
      elif(i[0] == '-'):
        curr_emb -= get_emb(i[1:], using_w2v)
    except:
      print(i+" is not a valid guess... skipping.")
  
  curr = most_sim(curr_emb, using_w2v)
  counter+=1

print("\n\nWell done! You reached "+goal+" from "+start+" in "+str(counter)+ " guesses!")


Current word: king
Guesses: 0
Next: -man +woman
-man is not a valid guess... skipping.
+woman is not a valid guess... skipping.

Current word: king
Guesses: 1


KeyboardInterrupt: ignored