## Preprocessing
### 爬取数据

In [2]:
import time
import numpy as np
import tensorflow as tf
from urllib.request import urlretrieve
from os.path import isfile, isdir
import zipfile
from tqdm import tqdm
import os
# Work from the dataset directory so the relative paths below resolve there.
# NOTE(review): hard-coded, machine-specific path - adjust for your environment.
os.chdir('C:/test/tensorflow/Dataset')
dataset_folder_path = 'data'  # folder the archive is extracted into
dataset_filename = 'text8.zip'  # local file name of the downloaded archive
dataset_name = 'Text8 Dataset'  # description shown on the download progress bar
class DLProgress(tqdm):
    """Progress bar whose ``hook`` method is passed to ``urlretrieve`` as its report hook."""

    # Block number seen on the previous call to ``hook``.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        Advance the bar by the amount transferred since the previous call
        :param block_num: Number of blocks transferred so far
        :param block_size: Size of one block
        :param total_size: Total size of the transfer; stored as the bar's total
        """
        new_blocks = block_num - self.last_block
        self.total = total_size
        self.update(new_blocks * block_size)
        self.last_block = block_num
# Download the archive unless a local copy already exists.
if not isfile(dataset_filename):
    with DLProgress(desc=dataset_name, unit='B',
                    unit_scale=True, miniters=1) as pbar:
        urlretrieve(url='http://mattmahoney.net/dc/text8.zip',
                    filename=dataset_filename,
                    reporthook=pbar.hook)

# Unpack the archive unless the target folder is already present.
if not isdir(dataset_folder_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(path=dataset_folder_path)

Text8 Dataset: 31.4MB [02:08, 244kB/s]                                                                                 


In [1]:
import os
# Work from the dataset directory so the relative path below resolves there.
# NOTE(review): hard-coded, machine-specific path - adjust for your environment.
os.chdir('C:/test/tensorflow/Dataset')
# Read the whole extracted corpus into memory as a single string.
with open('data/text8') as f:
    text = f.read()

In [6]:
from collections import Counter
def preprocess(text):
    """
    Lowercase the text, replace punctuation with tokens and drop rare words
    :param text: Raw text to process
    :return: List of words in their original order, keeping only the words
             that occur more than 5 times in the text
    """
    # Punctuation marks and the tokens that stand in for them in the model.
    # Newlines get no token of their own; split() treats them as whitespace.
    punctuation_tokens = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )

    # Lowercase first so the upper-case tokens inserted below stay distinct.
    text = text.lower()
    for mark, token in punctuation_tokens:
        text = text.replace(mark, token)
    words = text.split()

    # Remove all words with 5 or fewer occurrences
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > 5]

def create_lookup_tables(words):
    """
    Build the word <-> integer id mappings for a vocabulary
    :param words: Input list of words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab).  Ids follow
             descending word frequency, so the most frequent word gets id 0
    """
    frequency = Counter(words)
    # Most frequent words first.
    ranked = sorted(frequency, key=lambda word: frequency[word], reverse=True)

    vocab_to_int, int_to_vocab = {}, {}
    for word_id, word in enumerate(ranked):
        int_to_vocab[word_id] = word
        vocab_to_int[word] = word_id

    return vocab_to_int, int_to_vocab

# Tokenise the corpus, build the vocabulary and encode every word as its id.
words = preprocess(text)
# The first name must be spelled vocab_to_int: the next line reads it.
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

False

### Subsampling
- Each word $w_i$ in the training set is discarded with probability given by
$$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$$ where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the total dataset.

In [None]:
import random
# Subsample frequent words: each occurrence of a word is dropped with
# probability 1 - sqrt(threshold / frequency of that word).
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)
# Relative frequency of every word id in the corpus.
freqs = {word: count / total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold / freqs[word]) for word in word_counts}
# Keep an occurrence with probability 1 - p_drop[word].
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]

### Making batches

In [None]:
def get_target(words, idx, window_size=5):
    """
    Collect the words around position idx inside a randomly sized window
    :param words: List of words
    :param idx: Position of the centre word in ``words``
    :param window_size: Largest number of neighbours taken on each side
    :return: List of the distinct words found in the window; the entry at
             position idx itself is skipped
    """
    # Draw the window radius for this call from 1..window_size inclusive.
    radius = np.random.randint(1, window_size + 1)
    first = max(idx - radius, 0)  # never reach before the start of the list
    before = words[first:idx]
    after = words[idx + 1:idx + radius + 1]
    return list(set(before + after))
def get_batches(words, batch_size, window_size=5):
    """
    Generate training batches of (input, target) word pairs
    :param words: List of words
    :param batch_size: Number of consecutive words taken from ``words`` per batch
    :param window_size: Maximum context window, passed on to get_target
    :return: Generator yielding one (x, y) pair of equal-length lists per batch;
             y[k] is one of the target words drawn for the input word x[k]
    """
    # Only full batches are produced; trailing words that do not fill one are dropped.
    n_batches = len(words) // batch_size
    words = words[:n_batches * batch_size]
    for start in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[start:start + batch_size]
        for i, batch_x in enumerate(batch):
            # Targets are drawn from the batch only, so a window never
            # reaches past the batch boundaries.
            batch_y = get_target(batch, i, window_size)
            y.extend(batch_y)
            # Repeat the input word once per target so x and y stay aligned.
            x.extend([batch_x] * len(batch_y))
        # Yield once per completed batch, not after every word.
        yield x, y

## Building the graph

In [None]:
graph = tf.Graph()

# Model sizes: number of vocabulary entries, width of each embedding row and
# the num_sampled value handed to the sampled softmax loss.
n_vocab = len(int_to_vocab)
n_embedding = 200
n_sampled = 100

with graph.as_default():
    # Inputs: a 1-D int32 placeholder for the inputs and a 2-D int32
    # placeholder for the labels.
    inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

    # Encode: one n_embedding-wide row per vocabulary entry, initialised
    # uniformly between -1 and 1; embed holds the rows selected by inputs.
    embedding = tf.Variable(
        tf.random_uniform((n_vocab, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # Decode: output weights and biases feeding a sampled softmax loss.
    softmax_w = tf.Variable(
        tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                      biases=softmax_b,
                                      labels=labels,
                                      inputs=embed,
                                      num_sampled=n_sampled,
                                      num_classes=n_vocab)

    # Mean loss over the batch, minimised with Adam at its default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    

### Validation

In [None]:
# Validation word ids: half are sampled from ids [0, valid_window) and half
# from ids [1000, 1000 + valid_window).
valid_size = 16
valid_window = 100
valid_examples = np.append(
    np.array(random.sample(range(valid_window), valid_size // 2)),
    random.sample(range(1000, 1000 + valid_window), valid_size // 2))

with graph.as_default():
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Length of every embedding row, kept as a column so the division
    # below scales each row to unit length.
    norm = tf.sqrt(
        tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm

    # Dot products between the unit-length validation rows and every
    # unit-length embedding row, i.e. their cosine similarity.
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding,
                                             valid_dataset)
    similarity = tf.matmul(valid_embedding,
                           tf.transpose(normalized_embedding))

### Training

In [None]:
# Training configuration.
epochs = 2
batch_size = 10
window_size = 5

# Saver.save does not create missing directories, so make sure the checkpoint
# folder exists up front instead of failing after training has finished.
os.makedirs('checkpoints', exist_ok=True)

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    iteration = 1  # counts the training steps, starting at 1
    # Running total of the per-batch cost.
    # NOTE: this rebinds the module-level name `loss` to a plain number.
    loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs + 1):
        batches = get_batches(train_words, batch_size, window_size)
        for x, y in batches:
            # The labels placeholder is 2-D, so the targets are fed as a
            # column vector.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer],
                                     feed_dict=feed)
            loss += train_loss
            iteration += 1

    # Persist the trained variables and fetch the normalised embedding matrix.
    save_path = saver.save(sess, 'checkpoints/text8.ckpt')
    embed_mat = sess.run(normalized_embedding)


### Restore
- Restore the trained network 

In [None]:
with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    # latest_checkpoint takes the checkpoint *directory* and returns the path
    # of the most recent checkpoint recorded there.
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Fetch the restored embedding matrix as a NumPy array.
    embed_mat = sess.run(embedding)