### Generate Wiki Text 

##### Download data from https://metamind.io/research/the-wikitext-long-term-dependency-language-modeling-dataset/

In [1]:
#mini-demo
from urllib.request import urlretrieve
import os 
from os.path import isfile, isdir
import zipfile 
from tqdm import tqdm
import numpy as np #vectorization
import random #generate probability distribution 
import tensorflow as tf #ml
import datetime #clock training time

### First download data 

In [2]:
#### process bar
class DLProgress(tqdm):
    last_block = 0
    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

## download file 
data_path = './wikitext'
if isdir(data_path):
    print('Data already exist')
else:
    if not isdir(data_path):
        os.mkdir(data_path)
    zip_file = os.path.join(data_path,'wikitext-103-v1.zip')
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='wikidata') as pbar:
        #urlretrieve('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip',
        #            zip_file,
        #            pbar.hook)
        urlretrieve('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
                    zip_file,
                    pbar.hook)
    with zipfile.ZipFile(os.path.join(data_path,'wikitext-103-v1.zip')) as myzip:
        myzip.extractall(data_path)
    ## remove zip file 
    os.remove(data_path+'/wikitext-103-v1.zip')

data_file_path = "./wikitext/wikitext-2"
train_file = os.path.join(data_file_path,'wiki.train.tokens')
validate_file = os.path.join(data_file_path,'wiki.valid.tokens')

Data already exist


### Read data

In [3]:
#lets open the text
#native python file read function
text = open(train_file,encoding='utf8').read()
text = text[:500000]
print('text length in number of characters:', len(text))
print('head of text:')
print(text[:1000]) #all tokenized words, stored in a list called text

text length in number of characters: 500000
head of text:
 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such

#### Create a id to character and character to id map dictionary

In [4]:
## get the set of characters and sort them 
chars = sorted(list(set(text)))               ## all unique characters
char_size = len(chars)
print('number of characters:', char_size)
print(chars[:20])

number of characters: 127
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3']


In [5]:
## chrate char to id and id to char map 
char2id = {c:i for i,c in enumerate(chars)}
id2char = {i:c for i,c in enumerate(chars)}

In [6]:
text_num = [char2id[s] for s in text]

In [7]:
#Given a probability of each character, return a likely character, one-hot encoded
#our prediction will give us an array of probabilities of each character
#we'll pick the most likely and one-hot encode it
def sample(prediction):
    '''
    prediction: is a list of characters probilities
    '''
    r = random.uniform(0,1)  ## it is just a random number from 0-1
    s = 0 
    char_id = len(prediction)-1  ## this is because it starts with 0
    #for each char prediction probability 
    for i in range(len(prediction)):
        s+= prediction[i]
        if s >= r:
            char_id = i 
            break 
    
    char_one_hot = np.zeros(shape[char_size])  ## one hot encode characters 
    char_one_hot[char_id] = 1.0
    return char_one_hot

#### Create X and y sets and one hot encode them  

In [8]:
from sklearn.preprocessing import LabelBinarizer
def one_hot_encode(x,label_binarizer):
    return label_binarizer.transform(x).tolist()

In [9]:
## 
#vectorize our data to feed it into model
len_per_section = 50
skip = 2
sections = []
next_chars = []
#fill sections list with chunks of text, every 2 characters create a new 50 
#character long section
#because we are generating it at a character level
for i in range(0, len(text_num) - len_per_section, skip):
    sections.append(text_num[i: i + len_per_section])
    next_chars.append(text_num[i + len_per_section])

In [10]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(range(char_size))

X = np.array([one_hot_encode(section,label_binarizer) for section in sections])
y = np.array(one_hot_encode(next_chars,label_binarizer))
print(X.shape,y.shape)


(249975, 50, 127) (249975, 127)


#### This is not very efficient, need to figure out soemthing else 

### LSTM part

In [11]:
batch_size = 1024
max_steps = 70000
log_every = 1000
save_every = 10000      
hidden_nodes = 1024    ## number of hidden nodes 
test_start = 'i am thinkg that'
checkpoint_directory = 'ckpt'

## create a checkpoint directory 
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MakeDirs(checkpoint_directory)

print('training data size:', len(X))
print('approximate steps per epoch:', int(len(X)/batch_size))

training data size: 249975
approximate steps per epoch: 244


In [12]:
graph = tf.Graph()
with graph.as_default():
    #global_step just keeps track 
    #of the number of batches seen so far starts off as 0
    global_step = tf.Variable(0)
    #input data 
    data = tf.placeholder(tf.float32, [batch_size, len_per_section, char_size])
    #labels
    labels = tf.placeholder(tf.float32, [batch_size, char_size])
    
    ############################
    #Now let's build LSTM Cell##
    ############################
    
    #input gate 
    w_ii = tf.Variable(tf.truncated_normal([char_size,hidden_nodes],-0.1,0.1))
    w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    #Forget gate: weights for input, weights for previous output, and bias
    w_fi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    #Output gate: weights for input, weights for previous output, and bias
    w_oi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    #Memory cell: weights for input, weights for previous output, and bias
    w_ci = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_c = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    #LSTM Cell
    # given input, output, external state, it will return output and state
    #output starts off empty, LSTM cell calculates it
    
    #Since, we have two kinds of states - the internal state ct 
    #and the (exposed) external state st, and since we need both of 
    #them for the subsequent sequential operations, we combine them 
    #into a tensor at each step, and pass them as input to the next 
    #step. This tensor is unpacked into st_1 and ct_1 at the beginning of each step.
    
    
    def lstm(i, o, state):
        
        #these are all calculated seperately, no overlap until....
        #(input * input weights) + (output * weights for previous output) + bias
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)
        #(input * forget weights) + (output * weights for previous output) + bias
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)
        #(input * output weights) + (output * weights for previous output) + bias
        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)
        #(input * internal state weights) + (output * weights for previous output) + bias
        memory_cell = tf.sigmoid(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)
        
        #...now! multiply forget gate * given state    +  input gate * hidden state
        state = forget_gate * state + input_gate * memory_cell
        #squash that state with tanh nonlin (Computes hyperbolic tangent of x element-wise)
        #multiply by output
        output = output_gate * tf.tanh(state)
        #return 
        return output, state
    
    
    ###########
    #Operation
    ###########
    #LSTM
    #both start off as empty, LSTM will calculate this
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])

    ####################################
    ## Ok, this is the important part, LSTM is actually running a loop 
    ## it will slice each plane of the 3d matrix of input then do the 
    ## matrix multiplication 
    ####################################
    for i in range(len_per_section):
        #calculate state and output from LSTM
        output, state = lstm(data[:, i, :], output, state)
        #to start, 
        if i == 0:
            #store initial output and labels
            outputs_all_i = output
            labels_all_i = data[:, i+1, :]
        #for each new set, concat outputs and labels
        elif i != len_per_section - 1:
            #concatenates (combines) vectors along a dimension axis, not multiply
            outputs_all_i = tf.concat([outputs_all_i, output],0)        ## tf 1.0 changes
            labels_all_i = tf.concat([labels_all_i, data[:, i+1, :]],0) ## tf 1.0 changes
        else:
            #final store
            outputs_all_i = tf.concat([outputs_all_i, output],0)        ## tf 1.0 changes
            labels_all_i = tf.concat([labels_all_i, labels],0)          ## tf 1.0 changes

    #Classifier
    #The Classifier will only run after saved_output and saved_state were assigned.
    
    #calculate weight and bias values for the network
    #generated randomly given a size and distribution
    w = tf.Variable(tf.truncated_normal([hidden_nodes, char_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([char_size]))
    #Logits simply means that the function operates on the unscaled output 
    #of earlier layers and that the relative scale to understand the units 
    #is linear. It means, in particular, the sum of the inputs may not equal 1, 
    #that the values are not probabilities (you might have an input of 5).
    logits = tf.matmul(outputs_all_i, w) + b
    
    #logits is our prediction outputs, lets compare it with our labels
    #cross entropy since multiclass classification
    #computes the cost for a softmax layer
    #then Computes the mean of elements across dimensions of a tensor.
    #average loss across all values
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels_all_i))

    #Optimizer
    #minimize loss with graident descent, learning rate 10,  keep track of batches
    optimizer = tf.train.GradientDescentOptimizer(10.).minimize(loss, global_step=global_step)
    
    ##########################################################
    #Test ####################################################
    ##########################################################
    
    ## it seems that for testing, we don't limit ourself to sequence length 
    ## it seems that outcome and states just keeps accumulateing 
    
    test_data = tf.placeholder(tf.float32, shape=[1, char_size])
    test_output = tf.Variable(tf.zeros([1, hidden_nodes]))
    test_state = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    Reset at the beginning of each test
    reset_test_state = tf.group(test_output.assign(tf.zeros([1, hidden_nodes])), 
                                test_state.assign(tf.zeros([1, hidden_nodes])))

    #LSTM
    test_output, test_state = lstm(test_data, test_output, test_state)
    test_prediction = tf.nn.softmax(tf.matmul(test_output, w) + b)

#### Now we are going to train the model 

In [13]:
## initialize tf session 
with tf.Session(graph = graph) as sess:
    ## intialize all variables 
    tf.global_variables_initializer().run()
    offset = 0 
    saver = tf.train.Saver()
    
    ##for each training step
    for step in range(max_steps):
        ## starts off as 0 
        offset = offset % len(X)  ## the reminder 
        ## calculate batch data and lables to feed model iteratively 
        if offset <= (len(X)-batch_size):
            # first part 
            batch_data = X[offset:offset+batch_size]
            batch_labels = y[offset:offset+batch_size]
            offset += batch_size
        # until when offset = batch size, then we 
        else:
            #last part 
            to_add = batch_size - (len(X) - offset)
            batch_data = np.concatenate((X[offset:len(X)],X[0:to_add]))
            batch_labels = np.concatenate((y[offset:len(X)],y[0:to_add]))
            offset = to_add
        
        ## optimize!
        _,training_loss = sess.run([optimizer,loss],feed_dict={data:batch_data,labels:batch_labels})
        if step % log_every == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))
            if step % save_every == 0:
                saver.save(sess, checkpoint_directory + '/model', global_step=step)

training loss at step 0: 4.89 (2017-05-31 21:36:36.931236)
training loss at step 1000: 2.88 (2017-05-31 21:42:13.964942)
training loss at step 2000: 2.56 (2017-05-31 21:47:51.331521)
training loss at step 3000: 2.37 (2017-05-31 21:53:31.012398)
training loss at step 4000: 1.95 (2017-05-31 21:59:10.962961)
training loss at step 5000: 1.95 (2017-05-31 22:04:50.373679)
training loss at step 6000: 1.83 (2017-05-31 22:10:30.394437)
training loss at step 7000: 1.77 (2017-05-31 22:16:10.889125)
training loss at step 8000: 1.82 (2017-05-31 22:21:51.179532)
training loss at step 9000: 1.67 (2017-05-31 22:27:31.031314)
training loss at step 10000: 1.62 (2017-05-31 22:32:50.144931)
training loss at step 11000: 1.62 (2017-05-31 22:38:01.082995)
training loss at step 12000: 1.60 (2017-05-31 22:43:10.854862)
training loss at step 13000: 1.62 (2017-05-31 22:48:20.472440)
training loss at step 14000: 1.43 (2017-05-31 22:53:30.762236)
training loss at step 15000: 1.49 (2017-05-31 22:58:40.952771)
train

#### Now we can use the model to predict 

In [None]:
test_start = 'I plan to make the world a better place '

with tf.Session(graph=graph) as sess:
    #init graph, load model
    tf.global_variables_initializer().run()
    model = tf.train.latest_checkpoint(checkpoint_directory)
    saver = tf.train.Saver()
    saver.restore(sess, model)

    #set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start

    #for every char in the input sentennce
    for i in range(len(test_start) - 1):
        #initialize an empty char store
        test_X = np.zeros((1, char_size))
        #store it in id from, onehot encode it 
        test_X[0, char2id[test_start[i]]] = 1.
        #feed it to model, test_prediction is the output value
        _ = sess.run(test_prediction, feed_dict={test_data: test_X})

    
    #where we store encoded char predictions
    test_X = np.zeros((1, char_size))
    test_X[0, char2id[test_start[-1]]] = 1.

    #lets generate 500 characters
    for i in range(500):
        #get each prediction probability
        prediction = test_prediction.eval({test_data: test_X})[0]
        #one hot encode it
        next_char_one_hot = sample(prediction)
        #get the indices of the max values (highest probability)  and convert to char
        next_char = id2char[np.argmax(next_char_one_hot)]
        #add each char to the output text iteratively
        test_generated += next_char
        #update the 
        test_X = next_char_one_hot.reshape((1, char_size))

    print(test_generated)