<a href="https://colab.research.google.com/github/charlesincharge/Caltech-CS155-2022/blob/main/sets/set5/set5_prob3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set 5
## 3. Word2Vec \*\*Principles**

#### Preparation

In [1]:
# download the helper function
!wget -O P3CHelpers.py https://raw.githubusercontent.com/charlesincharge/Caltech-CS155-2022/main/sets/set5/P3CHelpers.py

zsh:1: command not found: wget


In [2]:
# download the dataset
!wget -O dr_seuss.txt https://raw.githubusercontent.com/charlesincharge/Caltech-CS155-2022/main/sets/set5/data/dr_seuss.txt

zsh:1: command not found: wget


In [1]:
import numpy as np
from P3CHelpers import *
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

#### Problem D: 
Fill in the generate_traindata and find_most_similar_pairs functions.

In [2]:
def get_word_repr(word_to_index, word):
    """
    Returns one-hot-encoded feature representation of the specified word given
    a dictionary mapping words to their one-hot-encoded index.

    Arguments:
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        word:          Word whose feature representation we wish to compute.

    Returns:
        feature_representation:     Feature representation of the passed-in word.
    """
    unique_words = word_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[word_to_index[word]] = 1
    return feature_representation    

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 2 
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training 
                              points (one-hot-encoded vectors) and their corresponding output_word
                              (also one-hot-encoded vectors)

    """
    trainX = []
    trainY = []

    ##############################################################
    # TODO: Implement this function, populating trainX and trainY
    ##############################################################
    for i in range(len(word_list)):
        x = get_word_repr(word_to_index, word_list[i])
        for j in range(i - window_size, i + window_size + 1):
            if j >= 0 and j < len(word_list):
                y = get_word_repr(word_to_index, word_list[j])
                trainX.append(x)
                trainY.append(y)
    
    return np.array(trainX), np.array(trainY)

In [18]:
class W2VDataset(Dataset):

    def __init__(self, X, y):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(np.argmax(y, axis = 1)).long()

    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):  
          return self.X[index,:], self.y[index]

In [54]:
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text
    
    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
    """
    # Load in a list of words from the specified file; remove non-alphanumeric characters
    # and make all chars lowercase.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)
    train_dataset = W2VDataset(trainX, trainY)
    ##############################################################
    # TODO: 1) Create and train model in Pytorch.      
    ##############################################################

    # vocab_size = number of unique words in our text file. Will be useful 
    # when adding layers to your neural network
    vocab_size = len(word_to_index)
    model = nn.Sequential( 
        nn.Linear(vocab_size, 10),
        nn.Linear(10, vocab_size),
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    torch.manual_seed(0)
    #need to fix this
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)

    for epoch in range(20):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Erase accumulated gradients
            optimizer.zero_grad()

            # Forward pass
            output = model(data)

            # Calculate loss
            loss = loss_fn(output, target)

            # Backward pass
            loss.backward()
            
            # Weight update
            optimizer.step()

        # Track loss each epoch
        print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

    ##############################################################
    # TODO: 2) Extract weights for hidden layer
    ##############################################################
    # set weights variable below
    weights = list(model.parameters())[2].detach().numpy()
    print(weights.shape)
    
    # Find and print most similar pairs
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)
    

### Problem E-H:
Run your model on drseuss.txt and answer questions from E through H.

In [55]:
find_most_similar_pairs('dr_seuss.txt', 10)

Textfile contains 308 unique words
Train Epoch: 1  Loss: 5.5281
Train Epoch: 2  Loss: 4.8917
Train Epoch: 3  Loss: 4.4720
Train Epoch: 4  Loss: 4.5169
Train Epoch: 5  Loss: 4.6518
Train Epoch: 6  Loss: 4.6868
Train Epoch: 7  Loss: 4.7877
Train Epoch: 8  Loss: 4.8217
Train Epoch: 9  Loss: 4.7809
Train Epoch: 10  Loss: 4.8425
Train Epoch: 11  Loss: 4.5532
Train Epoch: 12  Loss: 4.2282
Train Epoch: 13  Loss: 4.1857
Train Epoch: 14  Loss: 4.4402
Train Epoch: 15  Loss: 4.4127
Train Epoch: 16  Loss: 4.4619
Train Epoch: 17  Loss: 4.4291
Train Epoch: 18  Loss: 4.0862
Train Epoch: 19  Loss: 4.7889
Train Epoch: 20  Loss: 4.1189
(308, 10)
Pair(likes, drink), Similarity: 0.98902935
Pair(drink, likes), Similarity: 0.98902935
Pair(upon, grows), Similarity: 0.9775233
Pair(grows, upon), Similarity: 0.9775233
Pair(gone, tomorrow), Similarity: 0.97598
Pair(tomorrow, gone), Similarity: 0.97598
Pair(off, cold), Similarity: 0.9744982
Pair(cold, off), Similarity: 0.9744982
Pair(wink, pink), Similarity: 0.97