In [25]:
import tensorflow as tf 
import numpy as np
import helpers
import operator

# GLOBAL CONSTANTS
# reserved symbol codes: padding and end-of-sequence
PAD, EOS = 0, 1
# how many more steps than the encoder input length the decoder is run for
character_changing_num = 10
# not referenced by the visible cells
max_batches = 3001
# progress is reported every `batches_in_epoch` batches during training
batches_in_epoch = 100
# number of sequences sent into the encoder at one time
batch_size = 10

# x: encoder inputs
#    [source morphological tags + target morphological tags + source word]
source_data = list()
# y: expected decoder outputs
#    [source morphological tags + target morphological tags + target word]
target_data = list()

# character -> integer code
alphabet = {}

# source/target morphological tag -> integer code
morphological_tags = {}

# create (source morphological tags + target morphological tags + source/target word) sequence
def create_sequence(data_line_, word_index):
    """Build one flat sequence from an already split data line.

    The result is the items of field 0 (source morphological tags), followed by
    the items of field 2 (target morphological tags), followed by the items of
    the field selected by ``word_index`` (1 = source word, 3 = target word).

    Args:
        data_line_: indexable collection of iterables, one per field.
        word_index: index of the word field to append last.

    Returns:
        A new list; ``data_line_`` itself is not modified.
    """
    source_tags = list(data_line_[0])
    target_tags = list(data_line_[2])
    word = list(data_line_[word_index])
    return source_tags + target_tags + word
    

# read, split and encode input data
# The data contains accented characters, so the decoding is pinned explicitly
# instead of depending on the platform's default encoding.
# NOTE(review): assumes data2.tsv is stored as UTF-8 - confirm.
with open('data2.tsv', 'r', encoding='utf-8') as input_file:
    # expected line layout:
    #   source tags <TAB> source word <TAB> target tags <TAB> target word
    for line in input_file:
        # a blank line (e.g. a trailing newline at the end of the file) holds no
        # sample and would otherwise fail with an IndexError further down
        if not line.strip():
            continue

        data_line_ = line.strip('\n').split('\t')

        # replace each of the four fields by its list of integer codes
        for item in range(0, 4):
            # encoded form of the current field
            coded_word = []

            if item == 1 or item == 3:
                # source / target word: one code per character
                for character in data_line_[item]:
                    # unseen characters get the next free code; codes start at 2
                    # because 0 and 1 are reserved for PAD and EOS
                    index = alphabet.setdefault(character, len(alphabet) + 2)
                    coded_word.append(index)
            else:
                # morphological tags: comma separated "key=value" pairs
                tags = data_line_[item].split(',')

                # one code per tag pair, numbered from 2 in a separate table
                # (tag codes and character codes therefore overlap)
                for tag in tags:
                    index = morphological_tags.setdefault(tag, len(morphological_tags) + 2)
                    coded_word.append(index)

            # store encoded form
            data_line_[item] = coded_word

        # encoder input: source tags + target tags + source word
        source_data.append(create_sequence(data_line_, 1))

        # expected decoder output: source tags + target tags + target word
        target_data.append(create_sequence(data_line_, 3))

A programhoz felhasznált forrásfájl a következőképpen néz ki:

    pos=V,mood=IND,def=INDF,tense=PRS,per=1,num=PL	agyondicsérünk	pos=V,mood=IND,def=DEF,tense=PST,per=1,num=PL	agyondicsértük
    pos=V,tense=PRS	ürítő	pos=V,mood=POT	üríthet
    pos=V,finite=NFIN	vitatni	pos=V,polite=INFM,per=2,num=SG,finite=NFIN	vitatnod

Minden sora egy-egy bemeneti adatot reprezentál (forrás morfológiai tagek + forrás szóalak + cél morfológiai tagek + cél szóalak) formában.

Beolvassuk a fájlból soronként, elvégezzük a szükséges feldarabolási lépéseket, majd kódoljuk mind a morfológiai tageket, mind a szóalakokat is számok formájában. Ehhez a már megszokott +1-gyel növelt ABC kódolást használja. A morfológiai tageket páronként kódolja szintén mindig +1-gyel növelt értéktől kezdve: (FONTOS! mivel EOS=1 és PAD=0 ezért a kódolást a 2-es értéktől kezdi)
    
    pl.: pos=V -> 2-es érték
         mood=IND -> 3-as érték
         
A morfológiai tagek kódolt formájából szekvenciát épít, és mind a forrás, mind a cél tagek szekvenciáját a forrás szóalak szekvenciája elé fűzi; így állítja elő a source_data változóban az encoder bemenetét. 
             (forrás morfológiai tagek szekvenciája + cél morfológiai tagek szekvenciája + forrás szóalak szekvenciája)
             
Az előbbihez hasonlóan készíti el a decoder elvárt kimenetét is, amit a target_data változóban tárol:
             (forrás morfológiai tagek szekvenciája + cél morfológiai tagek szekvenciája + cél szóalak szekvenciája)


In [26]:
# Start from an empty default graph so that re-running this cell does not
# pile up operations from earlier runs.
tf.reset_default_graph()
# interactive TensorFlow session used by all later cells
sess = tf.InteractiveSession()

# largest integer code handed out to a character / to a morphological tag
max_alphabet = max(alphabet.values())
max_morphological_tags = max(morphological_tags.values())

# Characters and tags are looked up in the same embedding table, so the table
# needs one row per code from 0 up to the larger of the two maxima.
vocab_size = max(max_alphabet, max_morphological_tags) + 1
# dimensionality of each embedding vector
input_embedding_size = 30

A vocab_size-ot manuálisan kell kiszámolni, hogy pontosan megállapíthassuk, hogy hány különböző kódolt karakterünk van. (Ennek pontos értékére az embedding miatt van szükség)

Mivel az ABC betűinek és a morfológiai tag-párok kódolt alakját külön-külön tároltam, meg kell vizsgálnom, melyik a legnagyobb kódolt érték az ABC betűinél, és melyik a morfológiai tagek kódolt alakjánál. Ezután a két maximum közül a nagyobbat kell vennem, és hozzá kell adnom 1-et: a kódok 0-tól (PAD) indulnak, ezért az embedding mátrixnak (legnagyobb kód + 1) sorra van szüksége. 

In [27]:
# number of units of the encoder LSTM cell (per direction)
encoder_hidden_units = 20
# The decoder is initialised with the concatenated forward + backward encoder
# state (see encoder_final_state below), so its state must be twice as wide as
# the encoder's.
decoder_hidden_units = encoder_hidden_units * 2

# input placeholders
# encoder_inputs is time-major: [max_time, batch_size] integer codes
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
# length of each sequence in the batch (the batch itself is padded)
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
# expected decoder outputs as integer codes
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

# randomly initialised embedding matrix with one row per code; shared by the
# encoder inputs and by the tokens fed back into the decoder
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)

# integer codes -> embedding vectors
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

# define encoder
encoder_cell = tf.contrib.rnn.LSTMCell(encoder_hidden_units)

# bidirectional encoder: the input is read both forwards and backwards
# NOTE(review): the same cell object is passed as cell_fw and cell_bw; whether
# the two directions then share weights depends on the TensorFlow version -
# confirm this is intended.
((encoder_fw_outputs,
  encoder_bw_outputs),
 (encoder_fw_final_state,
  encoder_bw_final_state)) = (
    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                    cell_bw=encoder_cell,
                                    inputs=encoder_inputs_embedded,
                                    sequence_length=encoder_inputs_length,
                                    dtype=tf.float32, time_major=True)
    )

# concatenate forward and backward outputs along the feature axis
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

# The letters c and h are commonly used to denote the LSTM "cell state" and
# "output value".
# http://colah.github.io/posts/2015-08-Understanding-LSTMs/
# Together they form the internal state of the cell and must be passed along
# as a pair.

encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

# tuple type used by LSTM cells for state_size, zero_state and output state
encoder_final_state = tf.contrib.rnn.LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

decoder_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)

# Dynamic shape of the current batch. The batch dimension gets its own name so
# that the Python-level `batch_size` constant is not overwritten by a tensor.
encoder_max_time, encoder_batch_size = tf.unstack(tf.shape(encoder_inputs))

# every sequence is decoded for character_changing_num more steps than its
# input length
decoder_lengths = encoder_inputs_length + character_changing_num
# +(character_changing_num-1) additional steps, +1 leading <EOS> token for decoder inputs

# output projection (decoder output -> vocabulary logits), defined by hand
# because the decoder loop below needs it to pick the next input token
# weights
W = tf.Variable(tf.random_uniform([decoder_hidden_units, vocab_size], -1, 1), dtype=tf.float32)
# bias
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)

# The EOS / PAD time slices below are built with tf.ones / tf.zeros, which is
# only correct while EOS == 1 and PAD == 0.
assert EOS == 1 and PAD == 0

eos_time_slice = tf.ones([encoder_batch_size], dtype=tf.int32, name='EOS')
pad_time_slice = tf.zeros([encoder_batch_size], dtype=tf.int32, name='PAD')

# embedded EOS / PAD token for every sequence of the batch
eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)

# The decoder is driven by a hand-written loop function (tf.nn.raw_rnn) instead
# of dynamic_rnn, so that the initial cell state and the input of every step
# can be chosen explicitly.

#we define and return these values, no operations occur here
def loop_fn_initial():
    """Return the raw_rnn loop values for the first decoder step.

    Returns:
        A 5-tuple ``(elements_finished, input, cell_state, cell_output,
        loop_state)``: the finished mask, the embedded EOS token as first
        input, the final encoder state as initial cell state, and ``None`` for
        both the cell output and the loop state.
    """
    return (
        decoder_lengths <= 0,   # which sequences are already finished
        eos_step_embedded,      # first decoder input
        encoder_final_state,    # decoder starts from the encoder's final state
        None,                   # no cell output yet
        None,                   # no additional loop state is carried
    )


# Greedy feedback (this is not an attention mechanism): the most likely token
# of the previous step is embedded and passed as input to the next step.
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    """Return the raw_rnn loop values for every decoder step after the first.

    Args:
        time: current time step.
        previous_output: decoder cell output of the previous step.
        previous_state: decoder cell state of the previous step.
        previous_loop_state: unused; no loop state is carried.

    Returns:
        A 5-tuple ``(elements_finished, input, cell_state, cell_output,
        loop_state)``; state and output are passed through unchanged and the
        loop state is ``None``.
    """
    # boolean tensor with one entry per sequence: has it reached its length?
    elements_finished = (time >= decoder_lengths)
    # boolean scalar: true once every sequence of the batch is finished
    all_finished = tf.reduce_all(elements_finished)

    def embed_greedy_prediction():
        # project the previous output onto the vocabulary (unnormalised logits)
        logits = tf.add(tf.matmul(previous_output, W), b)
        # most likely token for every sequence of the batch
        token_ids = tf.argmax(logits, axis=1)
        # embed it so it can be fed back into the decoder
        return tf.nn.embedding_lookup(embeddings, token_ids)

    # feed PAD once the whole batch is done, otherwise the greedy prediction
    next_input = tf.cond(all_finished, lambda: pad_step_embedded, embed_greedy_prediction)

    return (elements_finished,
            next_input,
            previous_state,
            previous_output,
            None)

def loop_fn(time, previous_output, previous_state, previous_loop_state):
    """Loop function passed to tf.nn.raw_rnn.

    Dispatches to ``loop_fn_initial`` when there is no previous state yet and
    to ``loop_fn_transition`` for every other call.
    """
    if previous_state is not None:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)

    # time == 0: nothing has been produced yet
    assert previous_output is None and previous_state is None
    return loop_fn_initial()

# Build the decoder with tf.nn.raw_rnn, driven by the cell `decoder_cell` and
# the loop function `loop_fn`.
# raw_rnn is a more primitive version of dynamic_rnn that gives direct access
# to the inputs of each iteration, and control over when to start and finish
# reading the sequence and over what to emit as output.
# ta = TensorArray
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
# stack the per-step outputs into a single tensor
decoder_outputs = decoder_outputs_ta.stack()

# bare expression: has no effect in the middle of a cell
decoder_outputs

# To turn the outputs into readable predictions they are projected onto the
# vocabulary.

# dynamic dimensions of the decoder output (unpacked from its shape)
decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
# flatten to 2-D so that a single matmul covers all steps and sequences
decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
# apply the output projection (W, b) also used inside the decoder loop
decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)
# restore the (steps, batch, vocab_size) layout
decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, vocab_size))

# final prediction: most likely code at every step
decoder_prediction = tf.argmax(decoder_logits, 2)

# cross entropy loss
# the integer targets are one-hot encoded to match the shape of the logits
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
    logits=decoder_logits,
)

# loss: mean over all steps and sequences (no masking is applied here, so
# padded positions are included)
loss = tf.reduce_mean(stepwise_cross_entropy)
# one Adam optimisation step on that loss
train_op = tf.train.AdamOptimizer().minimize(loss)

sess.run(tf.global_variables_initializer())

# bind batch_size to the Python batch size (10 sequences are sent into the
# encoder at one time); the batching code in the following cells needs an int
batch_size = 10

In [28]:
# create batches with size of batch_size
def create_batches(data, batch_size):
    """Split ``data`` into consecutive batches of at most ``batch_size`` items.

    Every element of ``data`` ends up in exactly one batch, in its original
    order; only the last batch may be shorter than ``batch_size``.

    Args:
        data: sliceable sequence (e.g. a list of encoded sequences).
        batch_size: maximum number of items per batch; must be positive.

    Returns:
        A list of slices of ``data``; an empty list when ``data`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Stepping over the start indices keeps the final, possibly shorter batch
    # complete (its slice end is clamped to len(data) by slicing itself).
    return [data[start:start + batch_size]
            for start in range(0, len(data), batch_size)]

# Both data sets are divided with the same batch size, so batch i of the
# encoder inputs belongs to batch i of the decoder targets.

# encoder inputs divided into batches
source_batches = create_batches(data=source_data, batch_size=batch_size)

# decoder targets divided into batches
target_batches = create_batches(data=target_data, batch_size=batch_size)

A bemeneti adatokból és az elvárt kimenetekből legyártja a batch_size-nak megfelelő méretű batcheket. 

Maga a batch szekvenciák kötegét jelenti, hogy egyszerre hány input sort adunk be a rendszerünknek. Ezért fontos, hogy mind az encoder bemenetén, mind a decoder kimenetén azonos méretű batchek legyenek. Emiatt hívjuk meg a source_data és target_data-ra is egyaránt. 

In [29]:
def next_feed(batch_num, source_batches, target_batches):
    """Build the feed dict for batch number ``batch_num``.

    Args:
        batch_num: index of the batch to feed.
        source_batches: batches of encoder input sequences.
        target_batches: batches of expected decoder output sequences.

    Returns:
        Dict mapping the ``encoder_inputs``, ``encoder_inputs_length`` and
        ``decoder_targets`` placeholders to their values.
    """
    # NOTE(review): helpers.batch presumably pads the sequences into a
    # time-major matrix and returns it with the sequence lengths - confirm.
    encoder_inputs_, encoder_input_lengths_ = helpers.batch(source_batches[batch_num])

    # Each target is followed by EOS and then padded with PAD up to
    # (longest input + character_changing_num - 1) before the EOS is counted,
    # i.e. the target word may be up to character_changing_num - 1 symbols
    # longer than the longest input of the batch.
    padded_length = max(encoder_input_lengths_) + character_changing_num - 1

    padded_targets = []
    for sequence in target_batches[batch_num]:
        padding = [PAD] * (padded_length - len(sequence))
        padded_targets.append(sequence + [EOS] + padding)

    decoder_targets_, _ = helpers.batch(padded_targets)

    return {
        encoder_inputs: encoder_inputs_,
        encoder_inputs_length: encoder_input_lengths_,
        decoder_targets: decoder_targets_,
    }

Mivel a forrás szóalak hossza nem feltétlenül egyezik meg a cél szóalak hosszával, ezért fontos, hogy lehetővé tegyük a rendszer számára, hogy további karaktereket fűzhessen az eredetihez. Azt, hogy hány karakterrel lehet hosszabb a képzett szó (cél szó), a character_changing_num változó definiálja.

    Ha a character_changing_num = 10, ez azt jelenti, hogy 9 karakterben térhet el az eredeti szóalaktól, mivel a szavak végére +1 karakterként odatesszük az EOS karaktert, hogy jelezzük a decodernek, hogy befejezheti a feldolgozást.
    
Az ehhez szükséges padding karakterek számának kiszámolásához megkeressük a legnagyobb bemeneti szekvencia hosszát, amit a max_input_length változóban tárolunk el. Ezután a legnagyobb bemeneti szekvencia hosszához hozzáadjuk a (character_changing_num-1) értéket (-1, mert az EOS-t külön adjuk hozzá), és kivonjuk belőle az aktuális szekvencia hosszát. Az így kapott különbségnek megfelelő számú helyet 0-val (PAD) töltjük fel.

In [30]:
# loss of every training step, in order
loss_track = []

try:
    # a single pass over the data: one optimisation step per batch
    for batch_num in range(len(source_batches)):
        fd = next_feed(batch_num, source_batches, target_batches)

        _, l = sess.run([train_op, loss], fd)
        loss_track.append(l)

        # report only for the first batch and every batches_in_epoch-th one
        if batch_num != 0 and batch_num % batches_in_epoch != 0:
            continue

        print('batch {}'.format(batch_num))
        # loss is evaluated again here, i.e. after the update of this step
        print('  minibatch loss: {}'.format(sess.run(loss, fd)))
        predict_ = sess.run(decoder_prediction, fd)

        # transpose so that each row is one sequence; show the first three
        samples = zip(fd[encoder_inputs].T, predict_.T)
        for sample_no, (inp, pred) in enumerate(samples, start=1):
            print('  sample {}:'.format(sample_no))
            print('    input     > {}'.format(inp))
            print('    predicted > {}'.format(pred))
            if sample_no >= 3:
                break
        print()

except KeyboardInterrupt:
    # allow stopping the training by hand without a traceback
    print('training interrupted')

batch 0
  minibatch loss: 3.658271312713623
  sample 1:
    input     > [ 2  3  4  5  6  7  2  3  8  9  6  7  2  3  4  5  6  7  8  9 10 11 12 13  6
 14]
    predicted > [40  1 28 28  7 15  3 23  4 23 41 11 15  4 23 11 41 11 15  4 23 11 41 11 15
  4 23 11 29 23 11 41 11 15  4 23]
  sample 2:
    input     > [ 2  5  2 10 13 12 16 15 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0]
    predicted > [23  1 41 37  1  1 28 28 39 28 39 28 23  7  4 23 11 41 11  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0]
  sample 3:
    input     > [ 2 11  2 12 13 14 11 20  8 15  2 15  6  8  0  0  0  0  0  0  0  0  0  0  0
  0]
    predicted > [39 28 39 28 23  7  4  4 23 11 41 11 15  4 23 11 41 11 15  4 23 11  4 23  0
  0  0  0  0  0  0  0  0  0  0  0]

batch 100
  minibatch loss: 2.5866785049438477
  sample 1:
    input     > [ 2 15  4 12  5 13  7  2  3  4  9  6  7 31 19  3  2  7  6 23 15  5 14  0  0
  0]
    predicted > [ 2  2  2  2  3  2  5  2  5 13  5  6  6 14 19 19 19 19 19  0  0  0  0  0  0