In [2]:
""" Basic word2vec example.
From Tensorflow's Official Github

Commented and modified by Kin as a practice exercise.
"""


import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

In [3]:
# Step 1: Get the data

# Base URL that maybe_download prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    '''Fetch `filename` from the module-level `url` unless it already exists.

    The file on disk must be exactly `expected_bytes` bytes long. When it is,
    a confirmation is printed and the path is returned; otherwise the actual
    size is printed and an Exception is raised.
    '''
    already_present = os.path.exists(filename)
    if not already_present:
        # urlretrieve returns (local_path, headers); only the path is kept
        filename, _headers = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify {}. Can you get to it by yourself?'.format(filename))

    print('Found and verified (correct size)', filename)
    return filename

# Fetch text8.zip unless it is already on disk; 31344016 is the byte size the
# file must have for maybe_download to accept it.
filename = maybe_download('text8.zip', 31344016)

Found and verified (correct size) text8.zip


In [4]:
# Read the data into a list of strings
def read_data(filename):
    '''Return the whitespace-separated tokens of the first member of a zip archive.'''
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # convert the raw bytes to str before tokenising
    text = tf.compat.as_str(raw_bytes)
    return text.split()

# Load the corpus as a flat list of tokens and report how many there are.
vocabulary = read_data(filename)
print('Data size (num of items in the list)', len(vocabulary))

Data size (num of items in the list) 17005207


In [6]:
# Sanity check: display the first 100 tokens of the corpus.
vocabulary[0:100]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing']

In [18]:
# Step 2: Build the dictionary and replace rare words with UNK token.
# Number of dictionary entries to build, counting the 'UNK' placeholder
# (build_dataset keeps the n_words - 1 most common words plus 'UNK').
vocabulary_size = 50000


def build_dataset(words, n_words):
    '''Encode a token sequence as integer ids over a capped vocabulary.

    Args:
        words: sequence of tokens; it is traversed twice.
        n_words: vocabulary size including the 'UNK' placeholder, so only the
            n_words - 1 most frequent tokens get their own id.

    Returns:
        A tuple (data, count, dictionary, reversed_dictionary):
        data is the list of ids, one per input token (0 for tokens outside
        the vocabulary); count is the frequency table, starting with
        ['UNK', <number of out-of-vocabulary tokens>] followed by
        (token, frequency) pairs; dictionary maps token -> id;
        reversed_dictionary maps id -> token.
    '''
    # Slot 0 is reserved for the placeholder; its tally is filled in at the end.
    frequency_table = [['UNK', -1]]
    frequency_table += collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in table order: the next id is the current dict size.
    token_to_id = {}
    for entry in frequency_table:
        token_to_id[entry[0]] = len(token_to_id)

    encoded = []
    unknown_total = 0
    for token in words:
        token_id = token_to_id.get(token)
        if token_id is None:
            # out-of-vocabulary token: counts as 'UNK' (id 0)
            token_id = 0
            unknown_total += 1
        encoded.append(token_id)

    frequency_table[0][1] = unknown_total
    id_to_token = {token_id: token for token, token_id in token_to_id.items()}
    return encoded, frequency_table, token_to_id, id_to_token

# Encode the corpus as integer ids; also keep the frequency table and both
# word <-> id lookup tables.
data, count, dictionary, reversed_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)

del vocabulary  # frees the raw token list now that it has been encoded

In [23]:
# Show the head of the frequency table, then the first ten encoded ids next to
# the words they decode to.
print('Top 5 Most commom words (+UNK)', count[:5])
print('\nSample Data', data[:10], [reversed_dictionary[i] for i in data[:10]])

# Module-level read cursor into `data`; generate_batch advances it via `global`.
data_index = 0

Top 5 Most commom words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]

Sample Data [5244, 3081, 12, 6, 195, 2, 3136, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [None]:
# Step 3: Func to generate a training batch for the skip-gram model
def generate_batch(batch_size, num_skips, skip_window):
    '''Build one skip-gram training batch from the module-level `data` list.

    A window of 2 * skip_window + 1 consecutive ids slides over `data`,
    starting at the module-level cursor `data_index` and wrapping around at
    the end. For each window position, the centre id is paired with
    `num_skips` distinct ids drawn at random from the rest of the window.

    Args:
        batch_size: number of (centre, context) pairs to produce; must be a
            multiple of num_skips.
        num_skips: number of context ids sampled per centre id; at most
            2 * skip_window.
        skip_window: number of ids considered on each side of the centre.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size,) and
        (batch_size, 1) holding the centre ids and their paired context ids.

    Side effects:
        Updates the global `data_index` so the next call continues from where
        this one stopped, minus one window so that no ids are skipped.
    '''
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    # Every window position except the centre is a candidate context.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        # Draw the distinct context positions up front. The earlier rejection
        # loop appended the drawn position to its own exclusion list inside
        # the loop body, so its condition could never become false.
        chosen_positions = random.sample(context_positions, num_skips)
        for j, context_pos in enumerate(chosen_positions):
            batch[i * num_skips + j] = window[skip_window]
            labels[i * num_skips + j, 0] = window[context_pos]
        # Slide the window by one id; maxlen drops the oldest automatically.
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
# Draw a small demonstration batch: 8 pairs, 2 context words per centre word,
# taken from a window of 1 word on each side.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
