# Skip Gram Word2Vec

## Setup the environment

Local-MacbookPro M1Pro

```bash
source ~/mypyenv/bin/activate
unset http_proxy https_proxy all_proxy
pip install pysocks

# close and open a new terminal

source ./mypyenv/bin/activate
pip install numpy
pip3 install torch torchvision torchaudio
pip3 install ipykernel
```

In VSCode

Select venv python for python interpreter

Create a new file: skip-gram.ipynb

## Code

In [1]:
sentences = ["Kage is Teacher", "Mazong is Boss", "Niuzhong is Boss", "Xiaobing is Student", "Xiaoxue is Student"]
words = ' '.join(sentences).split()
word_list = list(set(words))
word_to_idx = { word: idx for idx, word in enumerate(word_list) }
idx_to_word = { idx: word for idx, word in enumerate(word_list) }
voc_size = len(word_list)
print("Vocabulary: ", word_list)
print("From word to index: ", word_to_idx)
print("From index to word: ", idx_to_word)
print("Size of vocabulary: ", voc_size)

Vocabulary:  ['Xiaobing', 'Student', 'Teacher', 'Boss', 'Mazong', 'Xiaoxue', 'Kage', 'Niuzhong', 'is']
From word to index:  {'Xiaobing': 0, 'Student': 1, 'Teacher': 2, 'Boss': 3, 'Mazong': 4, 'Xiaoxue': 5, 'Kage': 6, 'Niuzhong': 7, 'is': 8}
From index to word:  {0: 'Xiaobing', 1: 'Student', 2: 'Teacher', 3: 'Boss', 4: 'Mazong', 5: 'Xiaoxue', 6: 'Kage', 7: 'Niuzhong', 8: 'is'}
Size of vocabulary:  9


In [2]:
def create_skipgram_dataset(sentences, window_size=2):
    data = []
    for sentence in sentences:
        sentence = sentence.split()
        for idx, word in enumerate(word_list):
            for neighbor in sentence[max(idx - window_size, 0):min(idx + window_size + 1, len(sentence))]:
                if neighbor != word:
                    data.append((neighbor, word))
    return data

skipgram_data = create_skipgram_dataset(sentences)
print("Skip-Gram data example (Not encoded): ", skipgram_data[:3])

Skip-Gram data example (Not encoded):  [('Kage', 'Xiaobing'), ('is', 'Xiaobing'), ('Teacher', 'Xiaobing')]


In [3]:
import torch
def one_hot_encoding(word, word_to_idx):
    tensor = torch.zeros(len(word_to_idx))
    tensor[word_to_idx[word]] = 1
    return tensor

word_example = "Teacher"
print("Word before One-Hot encoding: ", word_example)
print("One-Hot encoded vector: ", one_hot_encoding(word_example, word_to_idx))

print("Skip-Gram data example (Encoded): ", [(one_hot_encoding(context, word_to_idx), word_to_idx[target]) for context, target in skipgram_data[:3]])

Word before One-Hot encoding:  Teacher
One-Hot encoded vector:  tensor([0., 0., 1., 0., 0., 0., 0., 0., 0.])
Skip-Gram data example (Encoded):  [(tensor([0., 0., 0., 0., 0., 0., 1., 0., 0.]), 0), (tensor([0., 0., 0., 0., 0., 0., 0., 0., 1.]), 0), (tensor([0., 0., 1., 0., 0., 0., 0., 0., 0.]), 0)]


In [4]:
import torch.nn as nn
class SkipGram(nn.Module):
    def __init__(self, voc_size, embedding_size):
        super(SkipGram, self).__init__()
        self.input_to_hidden = nn.Linear(voc_size, embedding_size, bias=False)
        self.hidden_to_output = nn.Linear(embedding_size, voc_size, bias=False)
    def forward(self, X):
        hidden = self.input_to_hidden(X)
        output = self.hidden_to_output(hidden)
        return output

embedding_size = 2
skipgram_model = SkipGram(voc_size, embedding_size)
print("Skip-Gram class: ", skipgram_model)

Skip-Gram class:  SkipGram(
  (input_to_hidden): Linear(in_features=9, out_features=2, bias=False)
  (hidden_to_output): Linear(in_features=2, out_features=9, bias=False)
)
