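Goal: build an RNN with an input layer of dimension 301 and an output layer of dimension 300. (Note: the code below trains against a single similarity score per example rather than a 300-dimensional word vector.)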

- Input layer: concatenate the word vector (300 elements) with its associated similarity score (1 element).
- Output layer: guess the vector of the closest word (possibly by finding the nearest neighbor, or an approximate nearest neighbor).

Feed in the last N guesses (or all of the guesses if fewer than N have been made, i.e. n_guesses < N).

Steps:

1. Generate training and validation data:
   - Choose a word at random to be the solution.
   - Choose 5 words at random and compute each one's similarity score with the solution word.
   - Feed each word + score pair into the RNN one by one.
   - Write the generated examples to a JSON file.
2. Set up RNN with correct amount of dimensions
3. Train RNN

In [1]:
!pip3 install -U gensim
!pip3 install -U sklearn
!pip3 install -U ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import gensim
import torch
import torch.nn as nn
import json
from answers import secretWords as answers
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.notebook import tqdm, trange
from constants import PATH_TO_DATASET
import torch.optim as optim
import ipywidgets
import heuristicrnn

In [2]:
# Location of the pretrained GoogleNews word2vec binary.
# NOTE(review): this assignment shadows the PATH_TO_DATASET imported from
# `constants` in the imports cell -- confirm which of the two is meant to win.
PATH_TO_DATASET = "~/Desktop/CS 4701/archive/GoogleNews-vectors-negative300.bin"

# Load the vectors from the binary word2vec file into a gensim KeyedVectors object.
gnews_model = gensim.models.KeyedVectors.load_word2vec_format(
    fname=PATH_TO_DATASET,
    binary=True,
)

In [3]:
# Look up the vectors for every word in `answers` (the `secretWords` list
# imported from the `answers` module). Presumably this yields one 300-element
# row per answer word -- TODO confirm against the loaded model.
embeddings = gnews_model[answers]

In [14]:
def generate_data(num_entries, filename, num_guesses=5, seed=None):
  """
  Generate random examples and write them to ``{filename}.json``.

  For every example, ``num_guesses + 2`` distinct words are drawn from
  ``answers``: the first ``num_guesses`` act as past guesses, the next one is
  the hidden solution and the last one is the candidate next guess. All
  similarities are computed with ``gnews_model.similarity`` against the
  solution word and scaled by 100. The solution word itself is not stored.

  Output layout::

      {"entries": [{"past_guesses": [{"word": str, "similarity_score": float}, ...],
                    "next_guess": str,
                    "actual_score": float}, ...]}

  Args:
    num_entries: number of examples to generate.
    filename: output path without the ``.json`` extension.
    num_guesses: number of past guesses per example (defaults to 5, the
      value that used to be hard-coded).
    seed: optional seed. When given, a private ``random.Random(seed)`` is
      used so the generated file is reproducible; when ``None`` the
      module-level ``random`` generator is used, as before.

  Raises:
    ValueError: if ``num_guesses`` is negative, or (from ``random.sample``)
      if fewer than ``num_guesses + 2`` words are available.
  """
  if num_guesses < 0:
    raise ValueError("num_guesses must be non-negative")

  # The `random` module exposes the same `sample` API as a Random instance.
  rng = random if seed is None else random.Random(seed)
  words_per_entry = num_guesses + 2

  res = []
  for _ in range(num_entries):
    # Draw distinct indices; only the word strings are needed here, the
    # vectors are looked up again when the JSON file is turned into tensors.
    picks = rng.sample(range(len(embeddings)), k=words_per_entry)
    words = [answers[idx] for idx in picks]
    guessed_words = words[:num_guesses]
    correct_word = words[num_guesses]
    next_word = words[num_guesses + 1]

    # Past guesses are stored as (word, similarity to the solution * 100).
    past_guesses = [
      {
        "word": guessed_word,
        "similarity_score": float(gnews_model.similarity(guessed_word, correct_word) * 100),
      }
      for guessed_word in guessed_words
    ]

    # The regression target is the score the candidate next guess would get.
    entry = {
      "past_guesses": past_guesses,
      "next_guess": next_word,
      "actual_score": float(gnews_model.similarity(next_word, correct_word) * 100),
    }
    res.append(entry)

  output = {"entries": res}
  # Write output JSON to file
  with open(f"{filename}.json", "w") as file:
    json.dump(output, file)

In [15]:
NUM_ENTRIES = 4000

# Write train.json and val.json, each holding NUM_ENTRIES independently
# sampled examples.
for split_name in ("train", "val"):
    generate_data(NUM_ENTRIES, split_name)

In [16]:
def get_data_from_json(res):
    """
    Convert JSON entries into model inputs and their target scores.

    Every past guess becomes its word vector (looked up in ``gnews_model``)
    with the recorded similarity score appended as one extra element. The
    candidate next guess is appended last, with a placeholder score of 0.

    Args:
        res: list of entries, each with the keys "past_guesses",
            "next_guess" and "actual_score".

    Returns:
        A pair ``(data, expected_scores)``: ``data`` holds one list of
        augmented vectors per entry, ``expected_scores`` holds the matching
        "actual_score" values.
    """
    def with_score(word, score):
        # Word vector as a plain list, extended by a single score element.
        return gnews_model[word].tolist() + [score]

    data = []
    expected_scores = []
    for entry in res:
        sequence = [
            with_score(guess["word"], guess["similarity_score"])
            for guess in entry["past_guesses"]
        ]
        # The score of the next guess is what the model has to predict.
        sequence.append(with_score(entry["next_guess"], 0))
        data.append(sequence)
        expected_scores.append(entry["actual_score"])

    return data, expected_scores


In [17]:
# Read both generated splits back from disk and expand every entry into
# score-augmented word vectors plus the expected score of the next guess.
with open("train.json", "r") as train_file:
    train_entries = json.load(train_file)["entries"]
train_data, train_expected_scores = get_data_from_json(train_entries)

with open("val.json", "r") as val_file:
    val_entries = json.load(val_file)["entries"]
val_data, val_expected_scores = get_data_from_json(val_entries)

In [18]:
# Sanity check: number of training examples, then the first target score.
print(len(train_data), train_expected_scores[0], sep="\n")

4000
11.763811111450195


In [19]:
class WordDataset(Dataset):
    """Dataset of guess sequences paired with the score to be predicted."""

    def __init__(self, guesses, scores):
        """
        Loads in the word dataset as float32 tensors.

        Args:
          guesses: one sequence per example; each sequence holds the past
            guesses concatenated with their scores, followed by the next
            guess concatenated with 0.
          scores: one expected score per example, aligned with ``guesses``.

        Raises:
          ValueError: if ``guesses`` and ``scores`` do not contain the same
            number of examples.
        """
        # Force float32 so the tensors do not silently become int64 (all-int
        # input) or float64 (NumPy input).
        self.guesses = torch.tensor(guesses, dtype=torch.float32)
        self.scores = torch.tensor(scores, dtype=torch.float32)

        # Compare only the leading dimension (number of examples).
        if self.guesses.shape[:1] != self.scores.shape[:1]:
            raise ValueError(
                "guesses and scores must contain the same number of examples"
            )

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.guesses)

    def __getitem__(self, index):
        """
        Get the model input and the expected score for the given index.
        """
        return self.guesses[index], self.scores[index]

In [20]:
def get_data_loaders(data, scores, batch_size=1, shuffle=False):
    """
    Wrap the examples in a WordDataset and return a single DataLoader over it.

    Args:
        data: guess sequences, passed to WordDataset as ``guesses``.
        scores: expected scores, passed to WordDataset as ``scores``.
        batch_size: number of examples per batch.
        shuffle: forwarded to the DataLoader.
    """
    return DataLoader(
        WordDataset(data, scores),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [21]:
# Training data is shuffled and served in batches of 3; validation keeps the
# get_data_loaders defaults (batch size 1, no shuffling).
train_loader = get_data_loaders(
    train_data,
    train_expected_scores,
    batch_size=3,
    shuffle=True,
)
val_loader = get_data_loaders(val_data, val_expected_scores)

In [22]:
def train_epoch(model, train_loader, optimizer):
    """
    Run one optimisation pass over ``train_loader``.

    Args:
        model: callable model exposing ``train()`` and
            ``compute_loss(output, expected)``.
        train_loader: iterable of ``(input_batch, expected_out)`` pairs.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        The mean of the per-batch losses, which is also printed.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0
    num_batches = 0

    for input_batch, expected_out in tqdm(train_loader, leave=False, desc="Training Batches"):
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        output = model(input_batch)

        loss = model.compute_loss(output, expected_out)

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("train_loader produced no batches; cannot compute an average loss")

    average_loss = total_loss / num_batches
    print(f"Average loss: {average_loss}")
    return average_loss

In [28]:
def evaluate_epoch(model, val_loader):
    """
    Compute the average loss of ``model`` over ``val_loader`` without training.

    Args:
        model: callable model exposing ``eval()`` and
            ``compute_loss(output, expected)``.
        val_loader: iterable of ``(input_batch, expected_out)`` pairs.

    Returns:
        The mean of the per-batch losses, which is also printed.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    num_batches = 0

    # No parameter is updated here, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for input_batch, expected_out in tqdm(val_loader, leave=False, desc="Validation"):
            output = model(input_batch)
            loss = model.compute_loss(output, expected_out)
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("val_loader produced no batches; cannot compute an average loss")

    average_loss = total_loss / num_batches
    print(f"Average loss: {average_loss}")
    return average_loss

In [24]:
def train(num_epochs, model, train_loader, learning_rate=0.001, weight_decay=0.01):
    """
    Train ``model`` for ``num_epochs`` epochs with a single AdamW optimizer.

    Args:
        num_epochs: number of passes over ``train_loader``.
        model: model whose ``parameters()`` are optimised.
        train_loader: batches handed to ``train_epoch`` every epoch.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay (defaults to 0.01, the value that
            used to be hard-coded).
    """
    # One optimizer for the whole run so its state carries across epochs.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for _ in trange(num_epochs, desc="Epochs"):
        train_epoch(model, train_loader, optimizer)

In [25]:
def evaluate(num_epochs, model, val_loader):
    """
    Call ``evaluate_epoch(model, val_loader)`` ``num_epochs`` times, showing
    an "Epochs" progress bar.
    """
    epoch_bar = trange(num_epochs, desc="Epochs")
    for _ in epoch_bar:
        evaluate_epoch(model, val_loader)

In [26]:
def weight_init(m):
    """
    Initialise an ``nn.Linear`` layer: Xavier-uniform weights and zero bias.

    Meant to be passed to ``Module.apply``; modules that are not
    ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Linear layers built with bias=False have `bias` set to None.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.)

In [30]:
# Each example is a sequence of 6 vectors of length 301 (5 scored guesses plus
# the candidate next guess). With NUM_ENTRIES = 4000 the full training tensor
# is presumably torch.Size([4000, 6, 301]); the 2000 noted here earlier looks stale.
# NOTE(review): the meaning of the RNN(301, 200, 1, 2) arguments is defined in
# the heuristicrnn module -- presumably (input size, hidden size, output size,
# number of layers); confirm there.

semantle_rnn = heuristicrnn.RNN(301, 200, 1, 2)
# weight_init only re-initialises nn.Linear modules; anything else keeps its
# default initialisation (assumes RNN is an nn.Module so .apply visits submodules).
semantle_rnn.apply(weight_init)
# 4 epochs with train()'s default learning rate.
train(4, semantle_rnn, train_loader)

Epochs:   0%|          | 0/4 [00:00<?, ?it/s]

Training Batches:   0%|          | 0/1334 [00:00<?, ?it/s]

Average loss: 5.862300193023825


Training Batches:   0%|          | 0/1334 [00:00<?, ?it/s]

Average loss: 5.569367382815038


Training Batches:   0%|          | 0/1334 [00:00<?, ?it/s]

Average loss: 5.317144947799518


Training Batches:   0%|          | 0/1334 [00:00<?, ?it/s]

Average loss: 4.884453966364689


In [32]:
# Persist the trained model under the name "heuristicrnn" (save_model is
# implemented in the heuristicrnn module; the on-disk format is not visible here).
semantle_rnn.save_model("heuristicrnn")

In [29]:
# Rebuild the network with the same constructor arguments and load a saved model.
# NOTE(review): this replaces the freshly trained `semantle_rnn` and loads
# "heuristic-model", while the previous cell saves to "heuristicrnn" and
# "heuristic-model" is only written by the last cell of the notebook. The
# execution counts show this cell was run before training; on a fresh
# top-to-bottom run it may load a stale or missing checkpoint -- confirm which
# name is intended.
semantle_rnn = heuristicrnn.RNN(301, 200, 1, 2)
semantle_rnn.load_model("heuristic-model")

In [33]:
# One pass over the validation loader; evaluate_epoch prints the average loss.
evaluate(1, semantle_rnn, val_loader)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Validation:   0%|          | 0/4000 [00:00<?, ?it/s]

Average loss: 6.032291997164488


In [None]:
# Persist the current model under the name "heuristic-model" -- the name the
# load_model cell above reads. This cell has no execution count, so it was
# not run in the recorded session.
semantle_rnn.save_model("heuristic-model")