In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time
import shutil

class SpellChecker():
    def __init__(self, config):
        self.lr = config['lr']
        self.n_hidden= config['n_hidden']
        self.total_epoch = config['total_epoch'] 
        self.batch_size = config['batch_size'] 
        self.char_arr = list("SEPabcdefghijklmnopqrstuvwxyz ")
        self.n_class = self.n_input = self.dic_len = len(self.char_arr)
        self.num_dic = {n: i for i, n in enumerate(self.char_arr)}
        self.n_eval = config['n_eval']
        
        # Checkpoint files will be saved in this directory during training
        timestamp = str(int(time.time()))
        self.checkpoint_dir = './checkpoints_' + timestamp + '/'
        if os.path.exists(self.checkpoint_dir):
            shutil.rmtree(self.checkpoint_dir)
        os.makedirs(self.checkpoint_dir)
        self.checkpoint_prefix = os.path.join(self.checkpoint_dir, 'model')        
        

        self.encoder_inputs= tf.placeholder(dtype = tf.float32, shape = [None, None, self.n_input], name = "encoder_inputs")
        self.decoder_inputs = tf.placeholder(dtype = tf.float32, shape = [None, None, self.n_input], name = "decoder_inputs")
        self.decoder_outputs = tf.placeholder(tf.int64, [None, None], name = "decoder_outputs")
        self.target_weights = tf.placeholder(tf.float32, [None, None], name = "target_weights")

        self.encoder_length = tf.placeholder(tf.int32, [None], name = "encoder_length")
        self.decoder_length = tf.placeholder(tf.int32, [None], name = "decoder_length")
        
        # Embedding
        # Look up embedding:
        #   encoder_inputs: [max_time, batch_size]
        #   encoder_emb_inp: [max_time, batch_size, embedding_size]
        # self.embedding_size = 4

        # self.embedding_encoder = tf.get_variable("embedding_encoder", [self.dic_len, self.embedding_size])
        # self.encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, self.encoder_inputs)
        # 
        # self.embedding_decoder = tf.get_variable("embedding_decoder", [self.dic_len, self.embedding_size])
        # self.decoder_emb_inp = tf.nn.embedding_lookup(self.embedding_decoder, self.decoder_inputs)
        # self.decoder_emb_outp = tf.nn.embedding_lookup(self.embedding_decoder, self.decoder_outputs)
         
        
        #[batch_size, time_steps, input_size]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        
        with tf.variable_scope('encode'):
            self.enc_cell = tf.nn.rnn_cell.BasicLSTMCell(self.n_hidden)
            # self.enc_cell = tf.nn.rnn_cell.DropoutWrapper(self.enc_cell, output_keep_prob = 0.5)
            self.outputs, self.enc_states = tf.nn.dynamic_rnn(cell = self.enc_cell, inputs = self.encoder_inputs , dtype = tf.float32, sequence_length = self.encoder_length)
    
        with tf.variable_scope('decode'):
            self.dec_cell = tf.nn.rnn_cell.BasicLSTMCell(self.n_hidden)
            # self.dec_cell = tf.nn.rnn_cell.DropoutWrapper(self.dec_cell, output_keep_prob = 0.5)
            self.projection_layer = tf.layers.Dense(self.dic_len, use_bias=True)
            
            self.helper = tf.contrib.seq2seq.TrainingHelper( self.decoder_inputs, self.decoder_length)
            self.decoder = tf.contrib.seq2seq.BasicDecoder(self.dec_cell, self.helper, self.enc_states, output_layer = self.projection_layer)
            
            self.outputs, self.dec_states, self.final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(self.decoder)

        with tf.variable_scope('output'):
            self.logits = self.outputs.rnn_output            
            self.prediction = tf.argmax(self.logits, axis = 2)

        with tf.variable_scope('cost'):
            self.crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits, labels = self.decoder_outputs)
            self.cost = (tf.reduce_mean(self.crossent * self.target_weights))
            tf.summary.scalar('cost', self.cost )

        with tf.variable_scope('accuracy_'):
            correct_predictions = tf.equal(self.prediction, self.decoder_outputs)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')
            tf.summary.scalar('accuracy', self.accuracy )

        # 
        # with tf.name_scope('num_correct'):
        #     correct = tf.equal(self.prediction, tf.argmax(self.decoder_outputs, axis = 1))
        #     self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))
         
        with tf.variable_scope('optimiser'):
            self.params = tf.trainable_variables()
            self.gradients = tf.gradients(self.cost, self.params)
            self.clipped_gradients, _ = tf.clip_by_global_norm(self.gradients, 1)
            self.opimiser= tf.train.AdamOptimizer(self.lr)         
            self.train_op = self.opimiser.apply_gradients(zip(self.clipped_gradients, self.params), global_step = self.global_step)         
            # self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.cost, global_step = self.global_step)
            
        self.df_train = pd.read_csv('./df_train.csv', index_col = False)
        self.df_test = pd.read_csv('./df_test.csv', index_col = False)
        self.df_train = self.df_train[['x', 'y']]
        self.df_test = self.df_test[['x', 'y']]
        
        self.graph = tf.Graph()    
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.train_writer = tf.summary.FileWriter('./train', self.sess.graph)
         
        # tf.global_variables_initializer().run()
        
        self.sess.run(tf.global_variables_initializer())
    
    
    def batch_iter(self, data, batch_size, num_epochs):
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int(data_size / batch_size) + 1
    
        for epoch in range(num_epochs):
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield data[start_index:end_index]
                    
                    
    def make_batch(self, df): 
        enc_input_batch = []
        dec_input_batch = []
        dec_output_batch = []
        target_weights_batch = []
        
        enc_len_batch = [] 
        dec_len_batch = []
        
        enc_max_len = 0
        dec_max_len = 0
        
        #preprecessing
        for i in range(0, len(df)):
            if enc_max_len < len(df.loc[i, 'x']): enc_max_len = len(df.loc[i, 'x'])
            if dec_max_len < len(df.loc[i, 'y']) + 1: dec_max_len = len(df.loc[i, 'y']) + 1
            
            enc_len_batch.append(len(df.loc[i, 'x']))
            dec_len_batch.append(len(df.loc[i, 'y']) + 1)
            
        for i in range(0, len(df)):
            input = [self.num_dic[n] for n in df.loc[i, 'x'].lower()]
            output = [self.num_dic[n] for n in ('S' + df.loc[i, 'y'].lower())]
            target = [self.num_dic[n] for n in (df.loc[i, 'y'].lower() + 'E')]
            
            target_weights_batch.extend([([1] * len(target)) + ([0] * (dec_max_len - len(target)))]) 
            
            #pad sentence with 'P'
            input = input + [2] * (enc_max_len - len(input))
            output = output + [2] * (dec_max_len - len(output))
            target = target + [2] * (dec_max_len - len(target))
            
            enc_input_batch.append(np.eye(self.dic_len)[input])
            dec_input_batch.append(np.eye(self.dic_len)[output])
            dec_output_batch.append(target)
                        
        # target_weights_batch = tf.squeeze(target_weights_batch)
        return enc_input_batch, dec_input_batch, dec_output_batch, target_weights_batch, enc_len_batch, dec_len_batch
        
        
    def train(self):
        x_train= self.df_train.x.tolist()
        y_train = self.df_train.y.tolist()
        train_batches = self.batch_iter(data = list(zip(x_train, y_train)), batch_size = self.batch_size, num_epochs = self.total_epoch)
        train_best_accuracy, val_best_accuracy, self.best_at_step = 0, 0, 0
        
        for train_batch in train_batches:
            current_step = tf.train.global_step(self.sess, self.global_step)
            train_enc_input_batch, train_dec_input_batch, train_dec_output_batch, train_target_weights_batch, train_enc_len_batch, train_dec_len_batch \
                = self.make_batch(pd.DataFrame(train_batch, columns = ['x','y']))
            feed_dict = {
                self.encoder_inputs: train_enc_input_batch,
                self.decoder_inputs: train_dec_input_batch,
                self.decoder_outputs: train_dec_output_batch,
                self.target_weights: train_target_weights_batch,
                self.encoder_length: train_enc_len_batch, 
                self.decoder_length: train_dec_len_batch
            }
            self.merged_summaries = tf.summary.merge_all()
            _, loss, accuracy, summary = self.sess.run([self.train_op, self.cost, self.accuracy, self.merged_summaries], feed_dict = feed_dict)
            self.train_writer.add_summary(summary = summary, global_step = current_step)
            
            print('current_step = ', '{}'.format(current_step), ', cost = ', '{:.6f}'.format(loss), ', accuracy = ', '{:.6f}'.format(accuracy))
            
            if current_step % self.n_eval == 0:
                val_enc_input_batch, val_dec_input_batch, val_dec_output_batch, val_target_weights_batch, val_enc_len_batch, val_dec_len_batch \
                    = self.make_batch(pd.DataFrame(train_batch, columns = ['x','y']))
                
                val_feed_dict = {
                    self.encoder_inputs: val_enc_input_batch,
                    self.decoder_inputs: val_dec_input_batch,
                    self.decoder_outputs: val_dec_output_batch,
                    self.target_weights: val_target_weights_batch,
                    self.encoder_length: val_enc_len_batch, 
                    self.decoder_length: val_dec_len_batch
                }

                val_loss, val_accuracy = self.sess.run([self.cost, self.accuracy], feed_dict = val_feed_dict)
                
                print('current_step = ', '{}'.format(current_step), ', val_cost = ', '{:.6f}'.format(val_loss), ', val_accuracy = ', '{:.6f}'.format(val_accuracy))
                
                if accuracy > train_best_accuracy and val_accuracy > val_best_accuracy:
                    train_best_accuracy, val_best_accuracy, self.best_at_step = accuracy, val_accuracy, current_step
                    
                    path = self.saver.save(self.sess, self.checkpoint_prefix, global_step=current_step)
                    print('Saved model {} at step {}'.format(path, self.best_at_step))
                    print('Best accuracy {} and {} at step {}'.format(train_best_accuracy, val_best_accuracy, self.best_at_step))
                    
                # test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test')
    
    def test(self):
        self.saver.restore(self.sess, self.checkpoint_prefix + '-' + str(self.best_at_step))

        enc_input_batch, dec_input_batch, dec_output_batch, target_weights_batch, enc_len_batch, dec_len_batch = self.make_batch(self.df_train)
        
        feed_dict = {
            self.encoder_inputs: enc_input_batch, 
            self.decoder_inputs: dec_input_batch, 
            self.decoder_outputs: dec_output_batch, 
            self.target_weights: target_weights_batch, 
            self.encoder_length: enc_len_batch,  
            self.decoder_length: dec_len_batch
        }
        
        results, loss, accuracy = self.sess.run([self.prediction, self.cost, self.accuracy], feed_dict = feed_dict)
        print('cost = ', '{:.6f}'.format(loss), ', accuracy = ', '{:.6f}'.format(accuracy))

        decoded = []
        for result in results:
            decoded.append([self.char_arr[i] for i in result]) 
        
        self.translated = []
        for result in decoded:
            try:
                end = result.index('E')
                self.translated.append([''.join(result[:end])])
            except:
                self.translated.append([''.join(result)])
        return self.translated

In [3]:
config = {}
config['lr'] = 0.003
config['n_hidden'] = 128
config['total_epoch'] = 3
config['batch_size'] = 256
config['n_eval'] = 20
spell_checker = SpellChecker(config)

ValueError: Variable encode/rnn/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-1-006efd64969a>", line 57, in __init__
    self.outputs, self.enc_states = tf.nn.dynamic_rnn(cell = self.enc_cell, inputs = self.encoder_inputs , dtype = tf.float32, sequence_length = self.encoder_length)
  File "<ipython-input-2-e682d7b01ec8>", line 7, in <module>
    spell_checker = SpellChecker(config)
  File "/Users/joonyoungjeon/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
spell_checker.train()
# np.array(spell_checker.dec_len_batch).shape
# np.array(spell_checker.enc_len_batch).shape
# tf.squeeze(spell_checker.dec_len_batch).shape


current_step =  0 , cost =  1.986304 , accuracy =  0.011029


current_step =  0 , val_cost =  1.968843 , val_accuracy =  0.071232


Saved model ./checkpoints_1526002576/model-0 at step 0
Best accuracy 0.011029412038624287 and 0.07123161852359772 at step 0


current_step =  1 , cost =  2.108317 , accuracy =  0.078369


current_step =  2 , cost =  2.137313 , accuracy =  0.099365


current_step =  3 , cost =  1.833881 , accuracy =  0.083333


current_step =  4 , cost =  1.789638 , accuracy =  0.082465


current_step =  5 , cost =  1.773541 , accuracy =  0.081572


current_step =  6 , cost =  1.630220 , accuracy =  0.078125


current_step =  7 , cost =  1.278034 , accuracy =  0.071162


current_step =  8 , cost =  1.742894 , accuracy =  0.107077


current_step =  9 , cost =  1.462426 , accuracy =  0.101768


current_step =  10 , cost =  1.693642 , accuracy =  0.108686


current_step =  11 , cost =  1.460389 , accuracy =  0.097656


current_step =  12 , cost =  1.511595 , accuracy =  0.108073


current_step =  13 , cost =  1.677967 , accuracy =  0.109707


current_step =  14 , cost =  1.566065 , accuracy =  0.113051


current_step =  15 , cost =  1.671605 , accuracy =  0.121582


current_step =  16 , cost =  1.698094 , accuracy =  0.123535


current_step =  17 , cost =  1.478436 , accuracy =  0.108073


current_step =  18 , cost =  1.498668 , accuracy =  0.110243


current_step =  19 , cost =  1.548044 , accuracy =  0.122243


current_step =  20 , cost =  1.472781 , accuracy =  0.105252
current_step =  20 , val_cost =  1.460033 , val_accuracy =  0.106771


Saved model ./checkpoints_1526002576/model-20 at step 20
Best accuracy 0.1052517369389534 and 0.1067708358168602 at step 20


current_step =  21 , cost =  1.179594 , accuracy =  0.090863


current_step =  22 , cost =  1.617843 , accuracy =  0.117877


current_step =  23 , cost =  1.377453 , accuracy =  0.104441


current_step =  24 , cost =  1.600677 , accuracy =  0.127298


current_step =  25 , cost =  1.402129 , accuracy =  0.108758


current_step =  26 , cost =  1.460074 , accuracy =  0.120009


current_step =  27 , cost =  1.623811 , accuracy =  0.123005


current_step =  28 , cost =  1.528417 , accuracy =  0.135570


current_step =  29 , cost =  1.623811 , accuracy =  0.144287


current_step =  30 , cost =  1.662039 , accuracy =  0.137695


current_step =  31 , cost =  1.432808 , accuracy =  0.125651


current_step =  32 , cost =  1.460444 , accuracy =  0.126953


current_step =  33 , cost =  1.511550 , accuracy =  0.141085


current_step =  34 , cost =  1.429419 , accuracy =  0.125217


current_step =  35 , cost =  1.153000 , accuracy =  0.103431


current_step =  36 , cost =  1.558729 , accuracy =  0.139246


current_step =  37 , cost =  1.330115 , accuracy =  0.120888


current_step =  38 , cost =  1.553973 , accuracy =  0.144761


current_step =  39 , cost =  1.346335 , accuracy =  0.134663


current_step =  40 , cost =  1.414260 , accuracy =  0.141710
current_step =  40 , val_cost =  1.403905 , val_accuracy =  0.146267
Saved model ./checkpoints_1526002576/model-40 at step 40
Best accuracy 0.1417100727558136 and 0.1462673544883728 at step 40


current_step =  41 , cost =  1.567531 , accuracy =  0.166888


In [6]:
test_result 

[['aiss'],
 ['aaaaaiaaee'],
 ['aaaaaltt'],
 ['esessesse'],
 ['eeeieinnn'],
 ['eeeeeeee'],
 ['eeeeenee'],
 ['cmmmciiiinnen'],
 ['eseseteeee'],
 ['prsseereeesn'],
 ['eeee'],
 ['miaiiaiiitiil'],
 ['desiiitte'],
 ['aciiiiii'],
 ['moiiniinne'],
 ['cmmniinnitn'],
 ['asrrsasees'],
 ['aeeee'],
 ['spsssssseee'],
 ['aeaaaaael'],
 ['iiiiis'],
 ['eeeeee'],
 ['cmneineiiiitn'],
 ['oareee'],
 ['ceiiile'],
 ['eeeeees'],
 ['eeeeeeee'],
 ['eeeee'],
 ['eeeeere'],
 ['eeeeeeeine'],
 ['ppnrriaiitie'],
 ['isreie'],
 ['eeeeneeee'],
 ['eeeeee'],
 ['eeeereeee'],
 ['eceeaeeae'],
 ['aaarraa'],
 ['ceiiiie'],
 ['pereeseeeee'],
 ['ssaupteisee'],
 ['srssetit'],
 ['aaaaaiaaae'],
 ['oiiiiine'],
 ['eeeee'],
 ['apppupssstes'],
 ['ssseseeess'],
 ['aaaaaaaae'],
 ['caaaaaiiitl'],
 ['eeeeeeeeeee'],
 ['ippstotnttte'],
 ['puaaaaae'],
 ['pspeeseeese'],
 ['ssee'],
 ['uarrrsete'],
 ['eeeeeeee'],
 ['aaaaaaaaaaeae'],
 ['aealie'],
 ['eeee'],
 ['eeereees'],
 ['eneniennn'],
 ['ereeeeeee'],
 ['aaaaaael'],
 ['eoiiiiete'],
 ['ppirssiiiin

In [7]:
pd.DataFrame(test_result).to_csv("./train_result.csv", index = False)

In [7]:
spell_checker.dec_output_batch

[[27, 17, 23, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [5, 14, 3, 21, 21, 11, 5, 3, 14, 14, 27, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [3, 24, 3, 11, 14, 3, 4, 14, 7, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [7, 21, 21, 7, 16, 22, 11, 3, 14, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [6, 7, 8, 11, 16, 11, 16, 9, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [20, 7, 14, 7, 24, 3, 16, 22, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [16, 7, 5, 7, 21, 21, 3, 20, 27, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [15, 7, 22, 3, 18, 10, 17, 20, 11, 5, 3, 14, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [18, 7, 16, 11, 16, 21, 23, 14, 3, 20, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [20,
  7,
  21,
  23,
  20,
  20,
  7,
  5,
  22,
  11,
  17,
  16,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 [15, 17, 6, 7, 14, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [5,
  17,
  15,
  18,
  3,
  22,
  11,
  4,
  11,
  14,
  11,
  22,
  27,
  1,
  2,
  2,
  2,
  2,
  

In [9]:
spell_checker.target_weights_batch

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [6]:
np.array(spell_checker.target_weights_batch).shape 

(3422, 51)

In [9]:
spell_checker.enc_input_batch, spell_checker.dec_input_batch, spell_checker.dec_output_batch, spell_checker.target_weights_batch, spell_checker.enc_len_batch, spell_checker.dec_len_batch \
    = spell_checker.make_batch(spell_checker.df_test)

spell_checker.results, loss = spell_checker.sess.run([spell_checker.prediction, spell_checker.cost],
                              feed_dict = {spell_checker.encoder_inputs: spell_checker.enc_input_batch,
                                    spell_checker.decoder_inputs: spell_checker.dec_input_batch,
                                    spell_checker.decoder_outputs: spell_checker.dec_output_batch,
                                    spell_checker.target_weights: spell_checker.target_weights_batch,
                                    spell_checker.encoder_length: spell_checker.enc_len_batch, 
                                    spell_checker.decoder_length: spell_checker.dec_len_batch})
print('cost = ', '{:.6f}'.format(loss))

# decoded = [char_arr[i] for i in result[0]]
decoded = []
for result in spell_checker.results:
    decoded.append([spell_checker.char_arr[i] for i in result]) 
# 
spell_checker.translated = []
for result in decoded:
    try:
        end = result.index('E')
        # translated.append()
        spell_checker.translated.append([''.join(result[:end])])
    except:
                spell_checker.translated.append([''.join(result)])

cost =  0.698517


In [10]:
decoded_len = [len(x) for x in decoded] 

In [23]:
np.argmax(spell_checker.dec_len_batch)

1802

In [11]:
pd.DataFrame(spell_checker.translated).to_csv("./test_result.csv")

In [None]:

'''
embedding
attention
regularisation
'''