In [1]:
%load_ext autoreload
%autoreload 2

## Tokenization

### - Build tokenized docs

In [3]:
import re
from nltk.corpus import stopwords

from tqdm import tqdm_notebook

pattern = re.compile(r'[A-Za-z]+[\w]*|[\w]*[A-Za-z]+[\w]*')   # obtains tokens with a least one alphebet
stopwords = stopwords.words('English')
docs = []
token_set = set()

with open('algorithm_corpus.txt', 'r') as file:
    for line in tqdm_notebook(file.readlines()):
        tokens = pattern.findall(line.lower())
        tokens = [token for token in tokens]
        
        if 5 <= len(tokens):
            docs.append(tokens)
        
        token_set.update(tokens)




### - Map id to word

In [4]:
word_to_id = dict()
id_to_word = dict()

for i, token in tqdm_notebook(enumerate(token_set)):
    word_to_id[token] = i
    id_to_word[i] = token




### - Transform docs into id representation

In [5]:
docs_in_id = [[word_to_id[token] for token in doc] for doc in docs]

## Stats

In [6]:
print('number of documents: {}'.format(len(docs)))
print('dictionary size: {}'.format(len(word_to_id)))
print('word size: {}'.format(sum(len(doc) for doc in docs)))

number of documents: 136162
dictionary size: 31295
word size: 3722137


## Prepare Training data

### CBOW

In [7]:
import numpy as np

window_size = 5
example_len = 5 * 2 + 1
X_cbow = []
Y_cbow = []

for doc in tqdm_notebook(docs_in_id):
    for i in range(len(doc) - example_len + 1):
        X_cbow.extend(doc[i:i+5]+doc[i+6:i+11])
        Y_cbow.append(doc[i+5])
        
X_cbow = np.array(X_cbow)
X_cbow = X_cbow.reshape(-1, window_size * 2).T
Y_cbow = np.array(Y_cbow)
Y_cbow = Y_cbow.reshape(1, -1)




### Skip Gram

In [18]:
window_size = 3
example_len = 3 * 2 + 1
X_skip = []
Y_skip = []
docs_in_id = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

for doc in tqdm_notebook(docs_in_id):
    for i in range(len(doc) - example_len + 1):
        for j in list(range(i, i+window_size)) + list(range(i+window_size+1, i+window_size*2+1)):
            X_skip.append(doc[j])
            Y_skip.append(doc[i+window_size])
        
X_skip = np.array(X_skip)
X_skip = np.expand_dims(X_skip, axis=0)
Y_skip = np.array(Y_skip)
Y_skip = np.expand_dims(Y_skip, axis=0)




In [19]:
X_skip

array([['the', 'quick', 'brown', 'jumped', 'over', 'the', 'quick',
        'brown', 'fox', 'over', 'the', 'lazy', 'brown', 'fox', 'jumped',
        'the', 'lazy', 'dog']], dtype='<U6')

In [11]:
docs = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]


(10, 2470420)

In [10]:
import pickle

with open('pickles/X.pkl', 'wb') as file:
    pickle.dump(X, file)
    
with open('pickles/Y.pkl', 'wb') as file:
    pickle.dump(Y, file)
    
with open('pickles/word_to_id.pkl', 'wb') as file:
    pickle.dump(word_to_id, file)
    
with open('pickles/id_to_word.pkl', 'wb') as file:
    pickle.dump(id_to_word, file)