# Skip-Gram (Not Optimized)

#### Outline
1. Data Preprocessing
2. Generate Training and Testing Data
3. Define the Skip-Gram Model
4. SGD
5. Evaluation of Model


In [1]:
import numpy as np
import pandas as pd
import tqdm
import jax
import jax.numpy as jnp
import string
import tensorflow as tf
import time
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

## Data Preprocessing

In [2]:
# Read Data
data = pd.read_csv('./data/raw data/raw_data.csv', header=0, names=['text'], usecols=[1])
print(f'Data Shape: {data.shape}')
data.head()

Data Shape: (13368, 1)


Unnamed: 0,text
0,"Sally Forrest, an actress-dancer who graced th..."
1,A middle-school teacher in China has inked hun...
2,A man convicted of killing the father and sist...
3,Avid rugby fan Prince Harry could barely watch...
4,A Triple M Radio producer has been inundated w...


In [3]:
# remove punctuation
punctuations = string.punctuation
def remove_punctuation(txt):
    for char in punctuations:
        if char in txt:
            txt = txt.replace(char, "")
    return txt

# change to lower caps
data['text'] = data['text'].str.lower()

# remove punctuations
data['text'] = data['text'].apply(remove_punctuation)

In [4]:
# remove stopwords
# read stopwords from data/raw data/stopwords.txt
stop_words = []
with open('./data/raw data/stopwords.txt', 'r') as f:
    for line in f:
        stop_words.append(line.strip())

def remove_stopwords(txt):
    txt = [word for word in txt.split() if word not in stop_words]
    return ' '.join(txt)

data['text'] = data['text'].apply(remove_stopwords)

In [5]:
# split each row into list of words
data_lst = data['text'].apply(lambda txt: txt.split(" "))

# select number of rows to be used as training data
nrows = 200
random_indices = np.random.randint(low=0, high=len(data_lst), size=nrows)
data_lst = data_lst[random_indices].reset_index(drop=True)

print(f'Number of rows of data: {len(data_lst)}')
data_lst[:5]

Number of rows of data: 200


0    [gary, neville, spent, friday, night, manchest...
1    [passionate, conservation, chinas, yunnan, pro...
2    [sight, world, wince, sympathy, chinese, build...
3    [buzz, aldrin, walk, moon, worlds, monuments, ...
4    [australians, splash, study, swimming, common,...
Name: text, dtype: object

In [6]:
# vocab dict
vocab, index = {}, 1
vocab['<pad>'] = 0
for line in data_lst:
    for word in line:
        if word not in vocab:
            vocab[word] = index
            index += 1

# inverse_vocab dict
inverse_vocab = {}
for word, index in vocab.items():
    inverse_vocab[index] = word

print(f'Vocab size: {len(vocab)}')

Vocab size: 14233


In [63]:
# sequences
sequences = []
for line in data_lst:
    vectorized_line = [vocab[word] for word in line]
    sequences.append(vectorized_line)

## Generate Training and Testing Data

In [64]:
# split into train and test sets
# choose 20 random sequences
ntest = 20
test_indices = np.random.randint(low=0, high=len(sequences), size=ntest)
test_sequences = [sequences[i] for i in test_indices]
train_sequences = [sequences[i] for i in range(len(sequences)) if i not in test_indices]

In [96]:
[inverse_vocab[i] for i in train_sequences[0]]

['gary',
 'neville',
 'spent',
 'friday',
 'night',
 'manchesters',
 'finest',
 'england',
 'assistant',
 'watched',
 'charlatans',
 'albert',
 'hall',
 'former',
 'manchester',
 'united',
 'defender',
 'smiles',
 'time',
 'charlatans',
 'lead',
 'singer',
 'tim',
 'burgess',
 'stone',
 'roses',
 'legend',
 'mani',
 'courteeners',
 'frontman',
 'liam',
 'fray',
 'ahead',
 'gig',
 'tweeting',
 'image',
 'mancunians',
 'twitter',
 'burgess',
 'joked',
 '5',
 'aside',
 'team',
 'gary',
 'neville',
 'left',
 'poses',
 'tim',
 'burgess',
 'lead',
 'singer',
 'charlatans',
 'left',
 'mani',
 'liam',
 'fray',
 'former',
 'manchester',
 'united',
 'england',
 'defender',
 'smiles',
 'ahead',
 'charlatans',
 'gig',
 'neville',
 'shared',
 'snaps',
 'seat',
 'gig',
 'seemingly',
 'fan',
 'madchester',
 'scene',
 'seemingly',
 'fan',
 'madchester',
 'music',
 'scene',
 'height',
 'late',
 '80s',
 '90s',
 '40yearold',
 'posted',
 'snaps',
 'gig',
 'seat',
 'neville',
 'posed',
 'burgess',
 'signed

In [97]:
[inverse_vocab[i] for i in test_sequences[0]]

['british',
 'game',
 'thrones',
 'fans',
 'paying',
 '£60',
 'shoppers',
 'hands',
 'products',
 'research',
 'comparing',
 'price',
 'merchandise',
 'hbos',
 'online',
 'store',
 'found',
 'huge',
 'markups',
 'stretch',
 'jewellery',
 'clothing',
 'toys',
 'sterling',
 'silver',
 'targaryen',
 'pendant',
 'costs',
 '£6299',
 'uk',
 '£3945',
 'pond',
 'british',
 'game',
 'thrones',
 'fans',
 'paying',
 'shoppers',
 'game',
 'thrones',
 'merchandise',
 'left',
 'targaryen',
 'ring',
 'costs',
 '£2799',
 'uk',
 'equivalent',
 '£2340',
 'america',
 'sterling',
 'silver',
 'targaryen',
 'pendant',
 'costs',
 '£6299',
 'uk',
 '£3945',
 'pond',
 'similarly',
 'sterling',
 'silver',
 'dragon',
 'egg',
 'pendant',
 'sets',
 'british',
 'fans',
 '£11799',
 'americans',
 'paying',
 '£5835',
 'bookend',
 'shape',
 'iron',
 'throne',
 'nearly',
 '£20',
 'expensive',
 'uk',
 'letter',
 'opener',
 '£895',
 'shoppers',
 'buying',
 'figurines',
 'fork',
 '£462',
 'uk',
 'increase',
 '25',
 'cent',


In [98]:
# function to generate training and testing data
def generate_data(sequences, window_size):
    targets, contexts = [], []
    for sequence in tqdm(sequences):
        # for each sentence
        for center_word_pos in range(len(sequence)):
            # add to targets
            targets.append(sequence[center_word_pos])

            context = []
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make sure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sequence) or center_word_pos == context_word_pos:
                    continue
                context_word_idx = sequence[context_word_pos]
                context.append(context_word_idx)

            # if length of context < 2*window_size
            # pad context with None until length is 2 * window_size
            if len(context) < 2 * window_size:
                pad_length = (2 * window_size) - len(context)
                for i in range(pad_length):
                    context.append(None)

            # add to contexts
            context = np.array(context)#; print(context)
            contexts.append(context)

    targets = np.array(targets)
    contexts = np.array(contexts)

    return targets, contexts

In [99]:
# generate training data
targets, contexts = generate_data(train_sequences, 5)

print(f'targets shape: {targets.shape}')
print(f'contexts shape: {contexts.shape}')


# generate testing data
test_targets, test_contexts = generate_data(test_sequences, 5)

print(f'test_targets shape: {test_targets.shape}')
print(f'test_contexts shape: {test_contexts.shape}')

  0%|          | 0/183 [00:00<?, ?it/s]

100%|██████████| 183/183 [00:00<00:00, 483.07it/s]


targets shape: (58804,)
contexts shape: (58804, 10)


100%|██████████| 20/20 [00:00<00:00, 518.54it/s]

test_targets shape: (6341,)
test_contexts shape: (6341, 10)





**Print a few examples of training and testing data**

In [100]:
# training data
index = 0
print(f"target_index    : {targets[index]}")
print(f"target_word     : {inverse_vocab[targets[index]]}")
print(f"context_indices : {contexts[index]}")
for c in contexts[index]:
    if c == None:
        continue
    print(f"context_words   : {[inverse_vocab[c]]}")

print("target  :", targets[index])
print("context :", contexts[index])

target_index    : 1
target_word     : gary
context_indices : [2 3 4 5 6 None None None None None]
context_words   : ['neville']
context_words   : ['spent']
context_words   : ['friday']
context_words   : ['night']
context_words   : ['manchesters']
target  : 1
context : [2 3 4 5 6 None None None None None]


In [94]:
# testing data
index = 0
print(f"target_index    : {test_targets[index]}")
print(f"target_word     : {inverse_vocab[test_targets[index]]}")
print(f"context_indices : {test_contexts[index]}")
for c in test_contexts[index]:
    if c == None:
        continue
    print(f"context_words   : {[inverse_vocab[c]]}")

print("target  :", test_targets[index])
print("context :", test_contexts[index])

target_index    : 1528
target_word     : british
context_indices : [2447 11176 826 6635 11177 None None None None None]
context_words   : ['game']
context_words   : ['thrones']
context_words   : ['fans']
context_words   : ['paying']
context_words   : ['£60']
target  : 1528
context : [2447 11176 826 6635 11177 None None None None None]


## Define the Skip-Gram Model

In [107]:
# define a function that takes in as input target and vocab length
# outputs a one-hot vector, x_hot of dimension = (vocab_length,)
def get_x_hot(target_idx, vocab_length):
    x_hot = np.zeros(vocab_length)
    x_hot[target_idx] = 1.
    return x_hot
# get_x_hot(3, 10)

def get_y_true(context_idxs, vocab_length):
    y_true = np.zeros(vocab_length)
    for i in context_idxs:
        y_true[i] = 1.
    return y_true
# get_y_true(np.array([1, 2, 3]), 10)

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)
# softmax(np.array[1,2,3])

# define a forward pass in the skip gram model 
def net(target_hot, V, U):
    """
    Input
    x_hot is one-hot vector, dimensions = (|v| x 1)
    V: input embedding matrix, dimension = (n x |v|)
    U: output embedding matrix, dimension = (|v| x n)
        |v| = vocab size
        n = no. of embedding dimensions
    Output
    z: score vector of probability distribution, dimension = (|v| x 1)
    """
    return softmax( U @ V @ target_hot )

def local_loss(y_hat, y_true):
    """
    Input
    y_hat: predicted vector from 1 pass through the network, dimension = (|v| x 1)
    y_true: actual vector from get_y_true, dimension = (|v| x 1)

    Output
    loss_value: real number
    """
    

array([0., 1., 1., 1., 0., 0., 0., 0., 0., 0.])

## Stochastic Gradient Descent