##### Installs (only run once):



In [None]:
!pip install kaggle --upgrade # for kaggle download
!pip install -U sentence-transformers # for SBERT pre-trained download
!pip install torch-geometric
!pip install pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.13.1+cu116.html

##### Imports:

In [None]:
import os
import pandas as pd
import numpy as np
import random
from random import sample
import pickle
from IPython.display import HTML, display
from google.colab import drive
import torch
from sentence_transformers import util
import gensim.downloader # pretrained word2vec and glove: https://radimrehurek.com/gensim/models/word2vec.html, https://github.com/RaRe-Technologies/gensim-data

##### Load and process a common nouns dataset from Kaggle:

In [None]:
# downloading kaggle noun dataset - only need to run once
os.environ['KAGGLE_USERNAME'] = 'liamhp03'
os.environ['KAGGLE_KEY'] = '6797bd10159b399c7191a8303d78af30'
!kaggle datasets download -d leite0407/list-of-nouns # https://www.kaggle.com/datasets/leite0407/list-of-nouns
!unzip list-of-nouns.zip

Downloading list-of-nouns.zip to /content
  0% 0.00/21.9k [00:00<?, ?B/s]
100% 21.9k/21.9k [00:00<00:00, 15.8MB/s]
Archive:  list-of-nouns.zip
  inflating: nounlist.csv            


In [None]:
# cleaning the data: flatten the CSV rows into a plain list holding the first column of each row
# NOTE(review): read_csv treats the first line as a header by default - confirm nounlist.csv
# actually has a header row, otherwise its first noun is not part of `nouns`
nouns = [row[0] for row in pd.read_csv('nounlist.csv').values.tolist()]

##### Setting up our Node class:

In [None]:
class Node:
  """One word of the graph together with its outgoing edges.

  Attributes:
    emb: embedding of the word (onehot or pre-embed encoding)
    en: the english word itself
    edges: list of connected edges
    edgew: edge weights, aligned position-by-position with `edges`
    idx: node index
  """

  def __init__(self, embedding, engl, edges, edgeweights, idx):
    # identity of the node
    self.idx = idx
    self.en = engl
    # vector representation of the word
    self.emb = embedding
    # adjacency information: edges[k] carries the weight edgew[k]
    self.edges = edges
    self.edgew = edgeweights

##### Building a graph:

Loading a pretrained model to select edges:

In [None]:
# Fetch the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
# Later cells index this object by word (word2vec[word]) and call word2vec.similarity(a, b) on it.
word2vec = gensim.downloader.load("word2vec-google-news-300") # load a pretrained word2vec model



##### Test the pretrained model

In [None]:
# choose a random word from our words dataset, restricted to words the model knows:
# word2vec[word] raises a KeyError for a word that is not in the model's vocabulary
known_nouns = [n for n in nouns if n in word2vec]
word = random.choice(known_nouns)
print("Model: word2vec")
print("Word: " + word)
print("Embedding shape: " + str(word2vec[word].shape))

Model: word2vec
Word: tramp
Embedding shape: (300,)


##### Define a onehot encoding function:

In [None]:
def onehot(word):
  """Return a 1-D tensor of length len(nouns) that is 1 at the position of `word` in `nouns` and 0 elsewhere.

  Raises ValueError (from list.index) when `word` is not in `nouns`.
  """
  position = nouns.index(word)
  encoding = torch.zeros(len(nouns))
  encoding[position] = 1
  return encoding

##### Create a graph of noun nodes:

In [None]:
# The graph itself: a dictionary that maps the english words (str) to their Node objects.
# It starts empty and is filled by the node-initialisation loop further down.
strToNode = {}

##### Quick function to visualize model progress:

In [None]:
# src: https://stackoverflow.com/questions/46939393/how-do-i-use-updatable-displays-on-colab
def progressbar(value, max=100):
    """Build an HTML <progress> element showing `value` out of `max`.

    Args:
        value: current progress value.
        max: value that corresponds to a full bar (the name shadows the builtin
            inside this function only; it is kept so existing callers still work).

    Returns:
        An IPython.display.HTML object that can be passed to display() or to the
        update() method of a display handle.
    """
    # The attributes are separated by whitespace only: a comma after max='...'
    # is not valid HTML attribute syntax.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 80%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

##### Initialize each node and add it to the dictionary

In [None]:
SIM_THRESHOLD = .45 # two nouns are connected when their word2vec similarity is above this value

# Only nouns that exist in the word2vec vocabulary can be embedded or compared, so the
# vocabulary is checked explicitly instead of catching every exception and counting failures.
vocab_nouns = [n for n in nouns if n in word2vec]

progress = display(progressbar(0, len(nouns)), display_id=True)
for w, cword in enumerate(nouns): # w is the position in `nouns`, kept as the node index
  if cword in word2vec: # ignore words not in the word2vec vocab
    sim_words = [] # edges
    sim_vals = [] # edge weights

    # iterate through every other in-vocab word; if its similarity is above the
    # threshold then add it as an edge with the similarity as the weight
    for pword in vocab_nouns:
      if pword != cword: # make sure they aren't the same word
        sim = word2vec.similarity(cword, pword)
        if sim > SIM_THRESHOLD:
          sim_words.append(pword)
          sim_vals.append(sim)

    # create the node and add it to our graph
    strToNode[cword] = Node(word2vec[cword], cword, sim_words, sim_vals, w)

  # update our progress bar
  if w % 50 == 0:
    progress.update(progressbar(w, len(nouns)))
progress.update(progressbar(len(nouns), len(nouns)))

Testing the graph:

In [None]:
# Inspect one node of the graph. The edge and weight lists are converted with str()
# because concatenating a list directly onto a str raises a TypeError.
word = 'apple'
print("Word: "+word)
print("Embedding Shape: " + str(strToNode[word].emb.shape))
print("Edges: " + str(strToNode[word].edges))
print("Edgeweights: " + str(strToNode[word].edgew))

##### Save our graph to google drive:

In [None]:
# Mount Google Drive at /content/drive; the save/load cells below read and write under "/content/drive/My Drive/...".
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def Save(graph, writename="init-graph-w2vec-.35thresh", directory="/content/drive/My Drive/J-Term 2023/input-graphs/"):
    """Pickle `graph` to "<directory>/<writename>.txt".

    Args:
        graph: any picklable object (here: the word -> Node dictionary).
        writename: file name without extension. The file is a binary pickle even
            though it gets a ".txt" suffix; the suffix is kept so files written
            earlier keep the same naming.
        directory: folder to write into; it is created if it does not exist yet,
            since open() cannot create missing parent folders.

    NOTE(review): the default name says ".35thresh" while the graph-building cell
    uses a .45 similarity threshold - confirm which one is intended.
    """
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, writename + ".txt"), "wb") as pkl_handle:
        pickle.dump(graph, pkl_handle)
# write the graph that was just built to Drive
Save(strToNode)
# drop the in-memory copy; after this line the name strToNode is unbound, so any
# later cell that needs the graph has to read it back with Load
del(strToNode)

##### Print all saved graphs

In [None]:
# os.listdir returns entries in arbitrary order; sorting keeps the printed numbering
# (and therefore which file saved[0] refers to) stable between runs.
# Raises FileNotFoundError if the folder does not exist, e.g. when Drive is not mounted.
saved = sorted(os.listdir("/content/drive/My Drive/J-Term 2023/input-graphs"))
for i, name in enumerate(saved):
  print(str(i)+": "+name)

FileNotFoundError: ignored

In [None]:
def Load(graphname, directory="/content/drive/My Drive/J-Term 2023/input-graphs/"):
    """Unpickle and return the object stored in "<directory>/<graphname>".

    Args:
        graphname: file name including its extension, as listed by os.listdir.
        directory: folder that contains the file.

    Returns:
        Whatever object was pickled into the file.

    SECURITY: pickle.load can execute arbitrary code contained in the file, so
    only load files that you wrote yourself or otherwise trust.
    Unpickling the graph also requires the Node class to be defined first.
    """
    with open(os.path.join(directory, graphname), "rb") as pkl_handle:
        output = pickle.load(pkl_handle)
    print("loaded: "+graphname)
    return output

# read back the first entry of the `saved` directory listing; raises IndexError when the listing is empty
loaded_graph = Load(saved[0])

In [None]:
import matplotlib.pyplot as plt

# strToNode is deleted right after saving, so the picture is drawn from the reloaded graph.
# The graph is keyed by word, therefore an index -> node lookup is built first.
idx_to_node = {node.idx: node for node in loaded_graph.values()}

fig, ax = plt.subplots(figsize=(20,10))
plotted = {} # idx: (xcor, ycor)
for i in range(0, 10):
  node = idx_to_node.get(i)
  if node is None: # no node carries this index (words left out of the graph leave gaps)
    continue
  x = random.randint(0,20)
  y = random.randint(0,20)
  ax.plot(x, y, "bo") # "bo" is a format string (blue circle); it is not a valid value for marker=
  ax.annotate(node.en, (x, y))
  for edge_word in node.edges: # edges hold the connected english words, not Node objects
    neighbor = loaded_graph.get(edge_word)
    if neighbor is not None and neighbor.idx in plotted: # only connect to nodes already drawn
      px, py = plotted[neighbor.idx]
      ax.plot([x, px], [y, py])
  plotted[i] = [x, y] # recorded last so a node never connects to itself

ax.set_title("First nodes of the noun graph (random layout)")
plt.show()