In [1]:
import tensorflow as tf
import os

#from tensorflow.contrib.rnn.python.ops.rnn_cell import AttentionCellWrapper
from tfa.seq2seq import AttentionWrapper
#from tensorflow.contrib.tensorboard.plugins import projector


def dropout(x, keep_prob):
    """Apply dropout to ``x``, forwarding ``keep_prob`` unchanged to ``tf.nn.dropout``."""
    dropped = tf.nn.dropout(x, keep_prob)
    return dropped


def lstm_cell(cell_dim, layer_num, keep_prob):
    """Build a stacked LSTM cell with dropout applied to every layer's output.

    :param cell_dim: number of units in each LSTM layer
    :param layer_num: number of stacked layers
    :param keep_prob: value handed to ``DropoutWrapper`` as ``output_keep_prob``

    :return:
        a ``MultiRNNCell`` wrapping ``layer_num`` separately constructed layers
    """
    def single_layer():
        # One fresh cell object per layer: repeating a single instance with
        # ``[cell] * layer_num`` makes all layers share that object, which
        # TensorFlow >= 1.1 refuses for multi-layer stacks.
        cell = tf.contrib.rnn.BasicLSTMCell(cell_dim, forget_bias=1.0, activation=tf.tanh, state_is_tuple=True)
        return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

    with tf.variable_scope('LSTM_Cell'):
        layers = [single_layer() for _ in range(layer_num)]
        return tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)


def rnn_reshape(inputs, input_dim, max_time_step):
    """
    Convert a batch-major tensor into the per-time-step list used by static RNNs.

    :param inputs: inputs of shape [batch_size, max_time_step, input_dim]
    :param input_dim: dimension of input
    :param max_time_step: max of time step

    :return:
        list of max_time_step tensors, each of shape [batch_size, input_dim]
    """
    with tf.variable_scope('Reshape'):
        time_major = tf.transpose(inputs, [1, 0, 2])
        flattened = tf.reshape(time_major, [-1, input_dim])
        return tf.split(axis=0, num_or_size_splits=max_time_step, value=flattened)


def rnn_model(inputs, input_len, cell, params):
    """Run a static RNN and return, per example, the output at its last valid step.

    :param inputs: list of per-time-step tensors (as produced by ``rnn_reshape``)
    :param input_len: int tensor with the valid length of every example
    :param cell: RNN cell to unroll
    :param params: dict providing ``max_time_step`` and ``dim_rnn_cell``

    :return:
        tensor with one row of size ``dim_rnn_cell`` per example
    """
    max_time_step = params['max_time_step']
    dim_rnn_cell = params['dim_rnn_cell']
    with tf.variable_scope('RNN') as scope:
        outputs, _ = tf.contrib.rnn.static_rnn(cell, inputs, sequence_length=input_len, dtype=tf.float32, scope=scope)
        # stack gives [time, batch, dim]; transpose to [batch, time, dim]
        outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
        # Clamp at step 0: a length of zero or less would otherwise yield a
        # negative offset and select the previous example's row (or index -1).
        last_step = tf.maximum(input_len - 1, 0)
        # Row index of (example i, last_step[i]) after flattening batch and time.
        flat_index = tf.range(0, tf.shape(input_len)[0]) * max_time_step + last_step
        gathered_outputs = tf.gather(tf.reshape(outputs, [-1, dim_rnn_cell]), flat_index)
        return gathered_outputs


def bi_rnn_model(inputs, input_len, fw_cell, bw_cell):
    """Run a static bidirectional RNN and return its outputs batch-major.

    :param inputs: list of per-time-step tensors
    :param input_len: int tensor with the valid length of every example
    :param fw_cell: cell for the forward direction
    :param bw_cell: cell for the backward direction

    :return:
        the stacked per-step outputs transposed with permutation [1, 0, 2]
    """
    with tf.variable_scope('Bi-RNN') as scope:
        # ``tf.nn.bidirectional_rnn`` and ``tf.pack`` were removed in TF 1.0;
        # use the 1.x names, matching the ``static_rnn``/``tf.stack`` calls in rnn_model.
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs,
                sequence_length=input_len, dtype=tf.float32, scope=scope)
        outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
        return outputs


def embedding_lookup(inputs, voca_size, embedding_dim, visual_dir, config, draw=False,
        initializer=None, trainable=True, scope='Embedding'):
    """Look up embeddings for ``inputs`` and optionally register them with the projector.

    :param inputs: integer id tensor to embed
    :param voca_size: number of rows of the table (used only without ``initializer``)
    :param embedding_dim: number of columns of the table (used only without ``initializer``)
    :param visual_dir: directory used to build the projector metadata path
    :param config: projector config whose ``embeddings`` list receives a new entry when ``draw`` is set
    :param draw: when True, register the table for the TensorBoard projector
    :param initializer: optional initial value for the table; when given it defines the shape
    :param trainable: whether the table is trainable
    :param scope: variable scope (name or scope object) holding the ``embed`` variable

    :return:
        ``(embedded_inputs, projector_module)`` when ``draw`` is True,
        otherwise ``(embedded_inputs, None)``
    """
    with tf.variable_scope(scope) as embed_scope:
        if initializer is not None:
            embedding_table = tf.get_variable("embed",
                    initializer=initializer, trainable=trainable, dtype=tf.float32)
        else:
            embedding_table = tf.get_variable("embed", [voca_size, embedding_dim],
                    dtype=tf.float32, trainable=trainable)
        inputs_embed = tf.nn.embedding_lookup(embedding_table, inputs)
        print(inputs_embed)

        if not draw:
            return inputs_embed, None

        # The module-level projector import is commented out, so the name has
        # to be brought into scope here before it can be used and returned.
        from tensorflow.contrib.tensorboard.plugins import projector

        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_table.name
        embedding.metadata_path = os.path.join(visual_dir, '%s_metadata.tsv'%embed_scope.name)
        return inputs_embed, projector


def mask_by_index(batch_size, input_len, max_time_step):
    """Build a flattened 0/1 weight vector marking the valid time steps of each example.

    :param batch_size: kept for interface compatibility; the number of rows now
        follows from ``input_len``
    :param input_len: int tensor with the valid length of every example
    :param max_time_step: number of time steps per example

    :return:
        float32 tensor of ``batch * max_time_step`` entries, 1.0 where
        ``t < input_len[i]`` and 0.0 elsewhere
    """
    with tf.variable_scope('Masking'):
        # The previous version compared each time step against the *flattened*
        # index ``i * max_time_step + len - 1``, which left every row after the
        # first fully unmasked, and relied on ``tf.select`` (removed in TF 1.0).
        weight = tf.sequence_mask(input_len, maxlen=max_time_step, dtype=tf.float32)
        return tf.reshape(weight, [-1])


def linear(inputs, output_dim, dropout_rate=1.0, regularize_rate=0, activation=None, scope='Linear'):
    """Fully connected layer: ``dropout(activation(inputs . W + b))``.

    :param inputs: tensor whose last dimension is the feature size; it is
        flattened to two dimensions before the matrix product
    :param output_dim: number of output units
    :param dropout_rate: forwarded to ``dropout`` as its ``keep_prob`` argument
    :param regularize_rate: accepted but not used by this function
    :param activation: optional callable applied before dropout
    :param scope: variable scope holding ``Weights`` and ``Biases``

    :return:
        2-D tensor with ``output_dim`` columns
    """
    with tf.variable_scope(scope) as layer_scope:
        feature_dim = inputs.get_shape().as_list()[-1]
        flat_inputs = tf.reshape(inputs, [-1, feature_dim])

        weights = tf.get_variable('Weights', [feature_dim, output_dim],
                                  initializer=tf.random_normal_initializer())
        variable_summaries(weights, layer_scope.name + '/Weights')

        biases = tf.get_variable('Biases', [output_dim],
                                 initializer=tf.constant_initializer(0.0))
        variable_summaries(biases, layer_scope.name + '/Biases')

        projected = tf.matmul(flat_inputs, weights) + biases
        if activation is not None:
            projected = activation(projected)
        return dropout(projected, dropout_rate)


def variable_summaries(var, name):
    """Register mean, stddev, max and min scalar summaries plus a histogram for ``var``."""
    with tf.name_scope('summaries'):
        average = tf.reduce_mean(var)
        tf.summary.scalar('mean/' + name, average)

        with tf.name_scope('stddev'):
            squared_deviation = tf.square(var - average)
            spread = tf.sqrt(tf.reduce_mean(squared_deviation))
        tf.summary.scalar('stddev/' + name, spread)

        largest = tf.reduce_max(var)
        tf.summary.scalar('max/' + name, largest)
        smallest = tf.reduce_min(var)
        tf.summary.scalar('min/' + name, smallest)

        tf.summary.histogram(name, var)


ModuleNotFoundError: No module named 'tensorflow.contrib'

In [2]:
import tensorflow as tf
import os

from ops import *


class RNN(object):
    """LSTM classifier over unigram/bigram/trigram id sequences (TensorFlow 1.x graph mode).

    Constructing an instance opens a ``tf.Session``, creates the input
    placeholders, builds the graph through :meth:`build_model` and runs the
    global variable initializer.
    """

    def __init__(self, params, initializer):
        """
        :param params: dict of settings; every key read below must be present
        :param initializer: sequence whose items 0, 1 and 2 are passed as the
            unigram, bigram and trigram embedding initializers
        """
        # No visible module-level import provides ``projector`` (the one in the
        # ops code is commented out), so import it where it is needed.
        from tensorflow.contrib.tensorboard.plugins import projector

        # session settings
        config = tf.ConfigProto(device_count={'GPU':1})
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.5
        self.session = tf.Session(config=config)
        self.params = params
        self.model_name = params['model_name']

        # hyper parameters
        self.learning_rate = params['learning_rate']
        self.decay_rate = params['decay_rate']
        self.decay_step = params['decay_step']
        self.min_grad = params['min_grad']
        self.max_grad = params['max_grad']

        # rnn parameters
        self.max_time_step = params['max_time_step']
        self.cell_layer_num = params['lstm_layer']
        self.dim_embed_unigram = params['dim_embed_unigram']
        self.dim_embed_bigram = params['dim_embed_bigram']
        self.dim_embed_trigram = params['dim_embed_trigram']
        self.dim_hidden = params['dim_hidden']
        self.dim_rnn_cell = params['dim_rnn_cell']
        self.dim_unigram = params['dim_unigram']
        self.dim_bigram = params['dim_bigram']
        self.dim_trigram = params['dim_trigram']
        self.dim_output = params['dim_output']
        self.ngram = params['ngram']
        self.ensemble = params['ensemble']
        self.embed = params['embed']
        self.embed_trainable = params['embed_trainable']
        self.checkpoint_dir = params['checkpoint_dir']
        self.initializer = initializer

        # input data placeholders
        self.unigram = tf.placeholder(tf.int32, [None, self.max_time_step])
        self.bigram = tf.placeholder(tf.int32, [None, self.max_time_step])
        self.trigram = tf.placeholder(tf.int32, [None, self.max_time_step])
        self.lengths = tf.placeholder(tf.int32, [None])
        self.labels = tf.placeholder(tf.int32, [None])
        self.lstm_dropout = tf.placeholder(tf.float32)
        self.hidden_dropout = tf.placeholder(tf.float32)

        # model settings
        self.global_step = tf.Variable(0, name="step", trainable=False)
        # from here on ``learning_rate`` is the decayed tensor, not the raw float
        self.learning_rate = tf.train.exponential_decay(
                self.learning_rate, self.global_step,
                self.decay_step, self.decay_rate, staircase=True)
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.optimize = None
        self.saver = None
        self.losses = None
        self.logits = None

        # model build
        self.merged_summary = None
        self.embed_writer = tf.summary.FileWriter(self.checkpoint_dir)
        self.embed_config = projector.ProjectorConfig()
        self.projector = None
        self.build_model()
        self.session.run(tf.global_variables_initializer())

    def ngram_logits(self, inputs, length, dim_input, dim_embed=None,
            initializer=None, trainable=True, scope='ngram'):
        """Encode one n-gram id sequence with an LSTM.

        :param inputs: int placeholder of n-gram ids
        :param length: int tensor of valid lengths for this n-gram order
        :param dim_input: vocabulary size of this n-gram order
        :param dim_embed: embedding size, or None to feed one-hot vectors instead
        :param initializer: optional embedding initializer
        :param trainable: whether the embedding table is trainable
        :param scope: variable scope name for this n-gram order

        :return:
            the value returned by ``rnn_model`` for these inputs
        """
        with tf.variable_scope(scope) as scope:
            # Only a forward cell is built: ``rnn_model`` is unidirectional, so
            # the backward cell that used to be created here was never used.
            fw_cell = lstm_cell(self.dim_rnn_cell, self.cell_layer_num, self.lstm_dropout)

            if dim_embed is not None:
                inputs_embed, self.projector = embedding_lookup(inputs,
                        dim_input, dim_embed, self.checkpoint_dir, self.embed_config,
                        draw=True, initializer=initializer, trainable=trainable, scope=scope)
                inputs_reshape = rnn_reshape(inputs_embed, dim_embed, self.max_time_step)
                self.projector.visualize_embeddings(self.embed_writer, self.embed_config)
            else:
                inputs_reshape = rnn_reshape(tf.one_hot(inputs, dim_input), dim_input, self.max_time_step)

            outputs = rnn_model(inputs_reshape, length, fw_cell, self.params)
            return outputs

    def build_model(self):
        """Assemble the n-gram encoders, output layer, loss, optimizer step and saver.

        :raises ValueError: when ``ensemble`` is false and ``ngram`` is not 1, 2 or 3
        """
        print("## Building an RNN model")

        unigram_logits = self.ngram_logits(inputs=self.unigram,
                length=self.lengths,
                dim_input=self.dim_unigram,
                dim_embed=self.dim_embed_unigram if self.embed else None,
                initializer=self.initializer[0],
                trainable=self.embed_trainable,
                scope='Unigram')

        # a name of n characters has n-1 bigrams and n-2 trigrams
        bigram_logits = self.ngram_logits(inputs=self.bigram,
                length=self.lengths-1,
                dim_input=self.dim_bigram,
                dim_embed=self.dim_embed_bigram if self.embed else None,
                initializer=self.initializer[1],
                trainable=self.embed_trainable,
                scope='Bigram')

        trigram_logits = self.ngram_logits(inputs=self.trigram,
                length=self.lengths-2,
                dim_input=self.dim_trigram,
                dim_embed=self.dim_embed_trigram if self.embed else None,
                initializer=self.initializer[2],
                trainable=self.embed_trainable,
                scope='Trigram')

        if self.ensemble:
            total_logits = tf.concat([unigram_logits, bigram_logits, trigram_logits], axis=1)
        elif self.ngram == 1:
            total_logits = unigram_logits
        elif self.ngram == 2:
            total_logits = bigram_logits
        elif self.ngram == 3:
            total_logits = trigram_logits
        else:
            # ``assert True`` could never fail, which left ``total_logits`` unbound.
            raise ValueError('No specific ngram %s' % (self.ngram,))

        # NOTE(review): ``hidden1`` is built but the output layer below still
        # reads ``total_logits``, so this layer does not influence the logits.
        # Left unchanged to keep existing checkpoints loadable -- confirm intent.
        hidden1 = linear(inputs=total_logits,
                output_dim=self.dim_hidden,
                dropout_rate=self.hidden_dropout,
                activation=tf.nn.relu,
                scope='Hidden1')

        logits = linear(inputs=total_logits,
            output_dim=self.dim_output,
            scope='Output')

        self.logits = logits
        self.losses = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
            labels=self.labels))

        tf.summary.scalar('Loss', self.losses)
        self.variables = tf.trainable_variables()

        # element-wise gradient clipping; variables without a gradient keep None
        grads = []
        for grad in tf.gradients(self.losses, self.variables):
            if grad is not None:
                grads.append(tf.clip_by_value(grad, self.min_grad, self.max_grad))
            else:
                grads.append(grad)
        self.optimize = self.optimizer.apply_gradients(zip(grads, self.variables), global_step=self.global_step)

        model_vars = list(tf.global_variables())
        print('model variables', [model_var.name for model_var in tf.trainable_variables()])
        self.saver = tf.train.Saver(model_vars)
        self.merged_summary = tf.summary.merge_all()

    @staticmethod
    def reset_graph():
        """Clear the default TensorFlow graph."""
        tf.reset_default_graph()

    def save(self, checkpoint_dir, step):
        """Write a checkpoint named ``<model_name>.model`` into ``checkpoint_dir``.

        :param checkpoint_dir: target directory
        :param step: accepted for interface compatibility; not part of the file name
        """
        file_name = "%s.model" % self.model_name
        self.saver.save(self.session, os.path.join(checkpoint_dir, file_name))
        print("Model saved", file_name)

    def load(self, checkpoint_dir):
        """Restore variables from a checkpoint in ``checkpoint_dir``.

        The step-10800 checkpoint is tried first, as before. When it is absent
        but the unsuffixed file written by :meth:`save` exists, that one is
        restored instead, so a save/load round trip works.
        """
        base_name = "%s.model" % self.model_name
        file_name = base_name + "-10800"
        legacy_missing = not tf.train.checkpoint_exists(os.path.join(checkpoint_dir, file_name))
        if legacy_missing and tf.train.checkpoint_exists(os.path.join(checkpoint_dir, base_name)):
            file_name = base_name
        self.saver.restore(self.session, os.path.join(checkpoint_dir, file_name))
        print("Model loaded", file_name)



ModuleNotFoundError: No module named 'tensorflow.contrib'

In [4]:
from __future__ import absolute_import

import tensorflow as tf
import numpy as np
import time
import sys
import os
import re
import operator
import gensim

from random import shuffle
from utils import *


def _data_shape(data_set):
    """Describe a ``[unigrams, bigrams, trigrams, lengths, labels]`` bundle as a shape tuple.

    Replaces ``np.array(data_set).shape``: the n-gram lists are ragged, which
    makes NumPy warn and, from NumPy 1.24 on, raise.
    """
    if not data_set:
        return (0,)
    return (len(data_set), len(data_set[0]))


def _num_examples(data_set):
    """Number of examples in a data bundle; 0 when the bundle was never filled."""
    return len(data_set[0]) if data_set else 0


def get_ethnicity_data(data_dir, params):
    """Read the index tables and name data sets found under ``data_dir``.

    Files are processed in sorted name order, so the ``0_``/``1_``/``2_``
    n-gram tables and the ``country_*`` tables are loaded before any
    ``data_*`` file. Each data line is ``name<TAB>nationality``.

    :param data_dir: directory walked with ``os.walk``; state is reset for each
        directory visited, so the last one walked determines the result
    :param params: dict; ``params['ethnicity']`` selects ethnicity labels
        (and skips names whose ethnicity id is negative) instead of
        nationality labels

    :return:
        ``(train_set, valid_set, test_set, dictionary)`` where each set is
        ``[unigrams, bigrams, trigrams, lengths, labels]`` and ``dictionary`` is
        ``[idx2unigram, unigram2idx, idx2country, country2ethnicity, idx2bigram, idx2trigram]``

    :raises IOError: if walking ``data_dir`` yields no directory at all
    """
    is_ethnicity = params['ethnicity']
    visited = False

    for root, _dirs, files in os.walk(data_dir):
        visited = True

        unigram2idx, idx2unigram = {}, {}
        bigram2idx, idx2bigram = {}, {}
        trigram2idx, idx2trigram = {}, {}
        country2idx, idx2country = {}, {}
        country2ethnicity = {}
        name_max_len = 0

        train_set = []
        valid_set = []
        test_set = []

        # the three n-gram tables share one format: ``token<TAB>index``
        ngram_tables = {
            '0_unigram_to_idx.txt': (unigram2idx, idx2unigram),
            '1_bigram_to_idx.txt': (bigram2idx, idx2bigram),
            '2_trigram_to_idx.txt': (trigram2idx, idx2trigram),
        }

        for file_name in sorted(files):
            file_len = 0

            # UTF-8 is what the BOM clean-up below expects; ``with`` closes the handle.
            with open(os.path.join(root, file_name), encoding='utf-8') as data:
                if file_name in ngram_tables:
                    token2idx, idx2token = ngram_tables[file_name]
                    for k, line in enumerate(data):
                        file_len = k + 1
                        # rstrip instead of [:-1]: the last line may lack a newline
                        token, index = line.rstrip('\n').split('\t')
                        token2idx[token] = int(index)
                        idx2token[int(index)] = token
                elif file_name == 'country_to_idx.txt':
                    for k, line in enumerate(data):
                        file_len = k + 1
                        country, index = line.rstrip('\n').split('\t')
                        if not is_ethnicity:
                            index = k       # Change to index when testing nationality
                        country2idx[country] = int(index)
                        idx2country[int(index)] = country
                elif file_name == 'country_to_ethnicity.txt':
                    for k, line in enumerate(data):
                        file_len = k + 1
                        country, eth1, eth2 = line.rstrip('\n').split('\t')
                        country2ethnicity[int(country)] = [int(eth1), int(eth2)]
                elif 'data_' in file_name:
                    unigram_set = []
                    bigram_set = []
                    trigram_set = []
                    length_set = []
                    labels = []

                    for k, line in enumerate(data):
                        name, nationality = line.rstrip('\n').split('\t')
                        name = re.sub(r'\ufeff', '', name)    # delete BOM

                        # unknown n-grams map to index 0
                        unigram_vector = [unigram2idx.get(c, 0) for c in name]
                        bigram_vector = [bigram2idx.get(c1 + c2, 0)
                                for c1, c2 in zip(name, name[1:])]
                        trigram_vector = [trigram2idx.get(c1 + c2 + c3, 0)
                                for c1, c2, c3 in zip(name, name[1:], name[2:])]

                        # label vector
                        nationality = country2idx[nationality]
                        if is_ethnicity:
                            ethnicity = country2ethnicity[nationality][1]
                            if ethnicity < 0:
                                continue
                        name_length = len(name)
                        name_max_len = max(name_max_len, name_length)

                        unigram_set.append(unigram_vector)
                        bigram_set.append(bigram_vector)
                        trigram_set.append(trigram_vector)
                        length_set.append(name_length)
                        labels.append(ethnicity if is_ethnicity else nationality)
                        file_len = k + 1

                    bundle = [unigram_set, bigram_set, trigram_set, length_set, labels]
                    if 'train_ch' in file_name:
                        train_set = bundle
                    elif 'val' in file_name:
                        valid_set = bundle
                    elif 'ijcai' in file_name: # test
                        test_set = bundle
                    # Any other data_* file is parsed but not kept. The check
                    # that used to sit here was ``assert True`` and never fired.
                else:
                    print('ignoring file', file_name)

            print('reading', file_name, 'of length', file_len)

    if not visited:
        raise IOError('data directory not found or not readable: %s' % data_dir)

    print('total data length:', _num_examples(train_set), _num_examples(valid_set), _num_examples(test_set))
    print('shape of data:', _data_shape(train_set), _data_shape(valid_set), _data_shape(test_set))
    print('name max length:', name_max_len)

    return (train_set, valid_set, test_set,
            [idx2unigram, unigram2idx, idx2country, country2ethnicity, idx2bigram, idx2trigram])


def get_char2vec(train_set, dim_embed, idx2char):
    """Train Word2Vec on id sequences and return the vectors as an embedding table.

    :param train_set: iterable of id sequences
    :param dim_embed: embedding dimension
    :param idx2char: mapping from id to token; ids ``0 .. len(idx2char) - 1`` are looked up

    :return:
        float32 array of shape ``(len(idx2char), dim_embed)``; rows of tokens
        the model does not contain stay zero
    """
    sentences = [[idx2char[c] for c in sentence] for sentence in train_set]

    model = gensim.models.Word2Vec(sentences, size=dim_embed, window=5, min_count=0, iter=10)

    table_size = len(idx2char)
    initializer = np.zeros((table_size, dim_embed), dtype=np.float32)
    for idx in range(table_size):
        token = idx2char[idx]
        if token in model:
            initializer[idx] = model[token]

    return initializer


def _describe_shape(data_set):
    """Shape-like tuple for a ``[unigrams, bigrams, trigrams, lengths, labels]`` bundle.

    Used instead of ``np.array(data_set).shape`` because the n-gram lists are
    ragged, which NumPy warns about and newer releases reject.
    """
    if len(data_set) == 0:
        return (0,)
    return (len(data_set), len(data_set[0]))


def get_data(params):
    """Load the data sets and, when validation is disabled, fold the validation split into training.

    :param params: dict providing ``data_dir`` and ``is_valid`` (and whatever
        ``get_ethnicity_data`` reads from it)

    :return:
        ``(train_set, valid_set, test_set, dictionary)`` as produced by
        ``get_ethnicity_data``; with ``is_valid`` false, each of the five
        training fields is a list extended by the matching validation field
    """
    ethnicity_dir = params['data_dir']
    is_valid = params['is_valid']
    train_set, valid_set, test_set, dictionary = get_ethnicity_data(ethnicity_dir, params)

    # show the first training example: unigram, bigram, trigram ids, length and label
    print(train_set[0][0])
    print(train_set[1][0])
    print(train_set[2][0])
    print(train_set[3][0], train_set[4][0])

    if not is_valid:
        # Plain list concatenation: ``np.append`` would first convert the
        # ragged n-gram lists to arrays, which fails on current NumPy.
        for field in range(5):
            train_set[field] = list(train_set[field]) + list(valid_set[field])
    print('shape of data:', _describe_shape(train_set), _describe_shape(valid_set), _describe_shape(test_set))
    print('preprocessing done\n')

    return train_set, valid_set, test_set, dictionary


def experiment(model, dataset, params):
    """Train ``model`` epoch by epoch with periodic validation/testing and early stopping.

    :param model: object exposing ``session``, ``global_step``, ``load``, ``save`` and ``reset_graph``
    :param dataset: indexable as ``[train, valid, test, dictionary]``
    :param params: dict providing ``checkpoint_dir``, ``continue_train``,
        ``train_epoch``, ``save`` and ``is_valid`` (plus what ``run`` reads)

    :return:
        ``(best validation top-1, its top-5, epoch at which it occurred)``;
        all zeros when validation is disabled
    """
    print('## Training')
    valid_epoch = 1
    test_epoch = 1
    early_stop = 5

    best_top1 = 0
    best_top5 = 0
    best_top1_epoch = 0
    lowest_loss = 99999
    stale_epochs = 0

    checkpoint_dir = params['checkpoint_dir']
    continue_train = params['continue_train']
    train_epoch = params['train_epoch']
    is_save = params['save']
    is_valid = params['is_valid']
    sess = model.session

    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if continue_train is not False:
        model.load(checkpoint_dir)

    for epoch_idx in range(train_epoch):
        is_last_epoch = epoch_idx == train_epoch - 1

        train_cost, train_acc, train_acc5 = run(model, params, dataset[0], is_train=True)
        print("\nTraining loss: %.3f, acc1: %.3f, acc5: %.3f, ep: %d" % (train_cost, train_acc,
            train_acc5, epoch_idx))

        validation_due = epoch_idx % valid_epoch == 0 or is_last_epoch
        if validation_due and is_valid:
            valid_cost, valid_acc, valid_acc5 = run(model, params, dataset[1], is_valid=is_valid)
            print("\nValidation loss: %.3f, acc1: %.3f, acc5: %.3f, ep: %d" % (valid_cost, valid_acc,
                valid_acc5, epoch_idx))
            if valid_acc > best_top1:
                best_top1, best_top5, best_top1_epoch = valid_acc, valid_acc5, epoch_idx
                stale_epochs = 0
            else:
                stale_epochs += 1
        elif not is_valid:
            # without validation, early stopping tracks the training loss instead
            if train_cost < lowest_loss:
                lowest_loss = train_cost
                stale_epochs = 0
            else:
                stale_epochs += 1

        if epoch_idx % test_epoch == 0 or is_last_epoch:
            test_cost, test_acc, test_acc5 = run(model, params, dataset[2], dataset[3], is_test=True)
            print("Testing loss: %.3f, acc1: %.3f, acc5: %.3f" % (test_cost, test_acc,
                test_acc5))
            print()
            if is_save:
                model.save(checkpoint_dir, sess.run(model.global_step))

        if stale_epochs == early_stop:
            print("Early stopping applied\n")
            test_cost, test_acc, test_acc5 = run(model, params, dataset[2], dataset[3], is_test=True)
            print("Testing loss: %.3f, acc1: %.3f, acc5: %.3f" % (test_cost, test_acc,
                test_acc5))
            break

    model.reset_graph()
    return best_top1, best_top5, best_top1_epoch


def _pad_to_length(sequence, target_len):
    """Return a new list holding ``sequence`` followed by zeros up to ``target_len`` items.

    :raises ValueError: if ``sequence`` already has more than ``target_len`` items
    """
    missing = target_len - len(sequence)
    if missing < 0:
        raise ValueError('sequence of length %d exceeds max_time_step %d'
                         % (len(sequence), target_len))
    return list(sequence) + [0] * missing


def run(model, params, dataset, dictionary=None, is_train=False, is_valid=False, is_test=False):
    """Run one pass over ``dataset`` in mini-batches, optionally training.

    :param model: object exposing ``session``, the input placeholders,
        ``logits``, ``losses`` and ``optimize``
    :param params: dict providing ``batch_size``, ``lstm_dropout``,
        ``hidden_dropout``, ``max_time_step``, ``ethnicity`` and, for test
        runs, ``pred_result_path`` or ``detail_result_path``
    :param dataset: ``[unigrams, bigrams, trigrams, lengths, labels]``
    :param dictionary: lookup tables forwarded to the result writers on test runs
    :param is_train: also apply the optimizer step for every batch
    :param is_valid: disable dropout (both dropout values are fed as 1.0)
    :param is_test: disable dropout and write a result file after the pass

    :return:
        ``(loss, top-1 accuracy, top-5 accuracy)`` averaged over the sampled
        batches (every fifth batch plus the last one); all zeros for an empty dataset

    :raises ValueError: if a sequence is longer than ``max_time_step``
    """
    batch_size = params['batch_size']
    lstm_dropout = params['lstm_dropout']
    hidden_dropout = params['hidden_dropout']
    max_time_step = params['max_time_step']
    sess = model.session
    cnt = 0.0
    total_cost = 0.0
    total_acc = 0.0
    total_acc5 = 0.0
    batch_preds = []

    unigram_set, bigram_set, trigram_set, lengths, labels = dataset
    if is_valid or is_test:
        lstm_dropout = 1.0
        hidden_dropout = 1.0

    num_examples = len(unigram_set)
    for datum_idx in range(0, num_examples, batch_size):
        batch_end = datum_idx + batch_size
        batch_lengths = lengths[datum_idx:batch_end]
        batch_labels = labels[datum_idx:batch_end]

        # Pad copies: appending to the stored sequences would permanently
        # alter the caller's dataset, and the old ``while len != max`` loop
        # never terminated for a sequence longer than max_time_step.
        batch_unigram = [_pad_to_length(seq, max_time_step) for seq in unigram_set[datum_idx:batch_end]]
        batch_bigram = [_pad_to_length(seq, max_time_step) for seq in bigram_set[datum_idx:batch_end]]
        batch_trigram = [_pad_to_length(seq, max_time_step) for seq in trigram_set[datum_idx:batch_end]]

        feed_dict = {model.unigram: batch_unigram, model.bigram: batch_bigram,
                model.trigram: batch_trigram,
                model.lengths: batch_lengths, model.labels: batch_labels,
                model.lstm_dropout: lstm_dropout, model.hidden_dropout: hidden_dropout}

        # One session call per batch: when training, the update is fetched
        # together with the predictions instead of re-running the forward pass.
        fetches = [model.logits, model.losses]
        if is_train:
            fetches.append(model.optimize)
        fetched = sess.run(fetches, feed_dict=feed_dict)
        pred, cost = fetched[0], fetched[1]

        if (datum_idx % (batch_size*5) == 0) or (batch_end >= num_examples):
            acc = accuracy_score(batch_labels, pred)
            acc5 = top_n_acc(batch_labels, pred, 5)
            _progress = progress(batch_end / float(num_examples))
            _progress += " tr loss: %.3f, acc1: %.3f, acc5: %.3f" % (cost,
                    acc, acc5)
            if is_train:
                sys.stdout.write(_progress)
                sys.stdout.flush()
            cnt += 1
            total_cost += cost
            total_acc += acc
            total_acc5 += acc5

        batch_preds.append(pred)

    if not batch_preds:
        # nothing was evaluated; avoid dividing by a zero batch count
        return 0.0, 0.0, 0.0
    total_pred = np.concatenate(batch_preds, axis=0)

    is_ethnicity = params['ethnicity']
    if is_test and not is_ethnicity:
        save_result(total_pred, lengths, labels, unigram_set, dictionary, params['pred_result_path'])
    if is_test and is_ethnicity:
        save_detail_result(total_pred, labels, lengths, unigram_set, dictionary, params['detail_result_path'])

    return total_cost / cnt, total_acc / cnt, total_acc5 / cnt


def accuracy_score(labels, logits):
    """Return the fraction of rows in ``logits`` whose arg-max column equals the label."""
    predicted = np.argmax(logits, 1)
    hits = np.equal(labels, predicted).astype(float)
    return np.mean(hits)


def top_n_acc(labels, logits, top):
    """Return the fraction of rows whose label is among the ``top`` highest-scoring columns."""
    ranked = []
    for logit in logits:
        # ascending argsort: the last ``top`` entries are the best, reversed to best-first
        ranked.append(logit.argsort()[-top:][::-1])

    hits = np.array([(label in candidates) for label, candidates in zip(labels, ranked)])
    return np.mean(hits.astype(float))


def save_result(logits, indexes, labels, inputs, dictionary, path):
    """Write the five best predictions for every example to a text file.

    :param logits: 2-D array of scores, one row per example
    :param indexes: per-example name length; only that many leading ids of the
        input are turned back into characters
    :param labels: true class id per example
    :param inputs: unigram id sequence per example (may be zero-padded)
    :param dictionary: six lookup tables; items 0 (``idx2unigram``) and 2
        (``idx2country``) are used
    :param path: output file, written as UTF-8
    """
    idx2unigram, _, idx2country, _, _, _ = dictionary
    top_n_logits = [logit.argsort()[-5:][::-1] for logit in logits]
    end = '--------------------------------------------'

    # ``with`` closes the file even when a lookup below raises
    with open(path, 'w', encoding='utf-8') as f:
        for top_classes, name_length, label, char_ids in zip(top_n_logits, indexes, labels, inputs):
            # slice before mapping so padding ids are never looked up
            name = ''.join(idx2unigram[char] for char in char_ids[:name_length])
            pred = ''.join('pred => ' + str(class_id) + ':' + idx2country[class_id] + '\n'
                           for class_id in top_classes)
            corr = 'real => ' + str(label) + ':' + idx2country[label]
            result = '[correct]' if top_classes[0] == label else '[wrong]'
            f.write(result + '\n' + name + '\n' + pred + '\n' + corr + '\n' + end + '\n')


def save_detail_result(logits, labels, indexes, inputs, dictionary, path, num_classes=13):
    """Write per-class precision/recall/F1 and the overall accuracy to a text file.

    For every class id in ``range(num_classes)`` the confusion counts are
    computed one-vs-rest; classes without a true positive get no line. The
    final ``acc`` line pools true positives and true negatives over all classes.

    :param logits: 2-D array of scores, one row per example
    :param labels: true class id per example
    :param indexes: not used by this function
    :param inputs: not used by this function
    :param dictionary: not used by this function
    :param path: output file
    :param num_classes: number of class ids to report (default 13, the value
        that used to be hard-coded)
    """
    tp = dict()
    fp = dict()
    fn = dict()
    tn = dict()

    # arg-max once per example instead of once per (class, example) pair
    pairs = list(zip(logits, labels))
    predicted = np.array([np.argmax(logit, 0) for logit, _ in pairs])
    actual = np.array([label for _, label in pairs])

    with open(path, 'w') as f:
        for key in range(num_classes):
            predicted_key = predicted == key
            actual_key = actual == key
            tp[key] = float(np.sum(predicted_key & actual_key))
            fp[key] = float(np.sum(predicted_key & ~actual_key))
            fn[key] = float(np.sum(~predicted_key & actual_key))
            tn[key] = float(np.sum(~predicted_key & ~actual_key))

            if tp[key] == 0:
                continue
            pr = tp[key] / (tp[key] + fp[key])
            rc = tp[key] / (tp[key] + fn[key])
            f1 = 2*pr*rc / (pr+rc)

            f.write(str(key) + '\t%.2f\t%.2f\t%.2f'% (pr, rc, f1) + '\n')

        correct = np.sum(list(tp.values())) + np.sum(list(tn.values()))
        total = correct + np.sum(list(fp.values())) + np.sum(list(fn.values()))
        f.write('acc %.2f\n'% (correct / total))

In [5]:
import gensim

from dataset import get_ethnicity_data


data_dir = './data/raw'
params = {'ethnicity': False}
train_set, valid_set, test_set, dictionary = get_ethnicity_data(data_dir, params)
vec = 2     # position of the trigram sequences inside each data set
dic = 5     # position of idx2trigram inside ``dictionary``

# Corpus: the trigram sequences of all three splits, mapped from ids back to strings.
sentences = []
for data_set in (train_set, valid_set, test_set):
    for sentence in data_set[vec]:
        sentences.append([dictionary[dic][c] for c in sentence])

model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=0, iter=100)

for alphabet in dictionary[dic].values():
    print('most similar to', alphabet, end=' is ')
    try:
        print(' '.join([(s) for s, _ in model.most_similar(positive=[alphabet], topn=5)]))
    except KeyError:
        # the token never occurs in the corpus, so the model holds no vector for it
        print('no values', alphabet)


reading 0_unigram_to_idx.txt of length 82
reading 1_bigram_to_idx.txt of length 1876
reading 2_trigram_to_idx.txt of length 14767
reading country_to_ethnicity.txt of length 127
reading country_to_idx.txt of length 127
reading data_ijcai_authors of length 2408
reading data_raw_test of length 3543
reading data_raw_train of length 10633
reading data_raw_train_ch of length 10754
reading data_raw_valid of length 3545
total data length: 10754 3545 2408
shape of data: (5, 10754) (5, 3545) (5, 2408)
name max length: 47


  print('shape of data:', np.array(train_set).shape, np.array(valid_set).shape, np.array(test_set).shape)


most similar to   B is nba rui hou Zho Zha
most similar to   H is k   HOY T H HOO 'T 
most similar to   M is e   .M. J.M .E. F.D
most similar to   R is RED n   DGR MFR EDG
most similar to  "F is no values  "F
most similar to  "H is "Ha n " rro arr Nix
most similar to  "L is e " "Li "Le i"  ni"
most similar to  "O is "Ol f " Oll le" e" 
most similar to  "W is "Wi m " ly" m ( ly)
most similar to  'T is R ' 'T  T H HOY OFM
most similar to  (A is (An Zia a ( aia zia
most similar to  (B is (BA L ( ON)  BA -BA
most similar to  (D is l ( (Da an) n)  ani
most similar to  (E is (Ed Edy dy) üse win
most similar to  (F is (Fr Fri tz) z)  h (
most similar to  (G is (Ge rd) erd erh  VÖ
most similar to  (H is (Ha (He s ( Irm Yer
most similar to  (I is z ( (In noz Inn no)
most similar to  (J is (Jo (Ju h ( n ( Jut
most similar to  (K is (Ki (Ko Kik ki) Koo
most similar to  (L is (Le Leo eop d ( eo)
most similar to  (M is fei  Me k)  i U xei
most similar to  (N is (Ne l)  el) ) V Nel
most similar to  



 AE is no values  AE
most similar to  AF is AFO i A AKD FOL dji
most similar to  AG is AGU UIA Y A GUI UY 
most similar to  AH is HLQ : A AHM AAH . A
most similar to  AI is AIT ITI SAI AIN NT 
most similar to  AJ is JIL auq uqe Tau qee
most similar to  AK is AKA ehb II- AKD Veh
most similar to  AL is LEK PTE LPT LBU LEX
most similar to  AM is AMA S A RJI AMY MBR
most similar to  AN is  An AAN SAN -AN HAN
most similar to  AO is UIT AOU OUI Omi hid
most similar to  AP is PPS APP PPA APA APE
most similar to  AQ is IZQ e Q c Q AQU PIQ
most similar to  AR is IAR TAR UAR baz PAR
most similar to  AS is w A ASA TBU AAD STB
most similar to  AT is ATT TTR ATI LAT TTW
most similar to  AU is UDU FFR AUF AUD ZUN
most similar to  AV is AVE AVD JO  VDE LAV
most similar to  AW is no values  AW
most similar to  AX is AXE XEL pij PPS  AA
most similar to  AY is YIK YAC AYI AYV AYK
most similar to  AZ is ZCU AZC ZEV E A AZG
most similar to  Aa is Aal Aag age  Ak Tag
most similar to  Ab is Abb Abd bdo Abe 

In [3]:
import tensorflow as tf
import numpy as np
import pprint

from time import gmtime, strftime
from dataset import get_data, experiment, get_char2vec
from model import RNN


# Command-line flag registry; every tunable of the experiment is declared here
# and later read back as a plain dict.
flags = tf.app.flags

# Default parameters
flags.DEFINE_integer("train_epoch", 3000, "Epoch to train")
flags.DEFINE_integer("dim_unigram", 82, "Dimension of input, 42 or 82")
flags.DEFINE_integer("dim_bigram", 1876, "Dimension of input, 925 or 1876")
flags.DEFINE_integer("dim_trigram", 14767, "Dimension of input, 8573 or 14767")
flags.DEFINE_integer("dim_output", 127, "Dimension of output, 95 or 127")
flags.DEFINE_integer("max_time_step", 60, "Maximum time step of RNN")
flags.DEFINE_integer("min_grad", -5, "Minimum gradient to clip")
flags.DEFINE_integer("max_grad", 5, "Maximum gradient to clip")
flags.DEFINE_integer("batch_size", 300, "Size of batch")
flags.DEFINE_integer("ngram", 3, "Ngram feature when ensemble = False.")
flags.DEFINE_float("decay_rate", 0.99, "Decay rate of learning rate")
flags.DEFINE_float("decay_step", 100, "Decay step of learning rate")

# Validation hyper parameters.
# Each tunable comes as a triple: <name> is the value used when
# default_params is True, <name>_min / <name>_max bound the random search
# performed by sample_parameters() otherwise.
flags.DEFINE_integer("valid_iteration", 250, "Number of validation iteration.")
flags.DEFINE_integer("dim_rnn_cell", 200, "Dimension of RNN cell")
flags.DEFINE_integer("dim_rnn_cell_min", 200, "Minimum dimension of RNN cell")
flags.DEFINE_integer("dim_rnn_cell_max", 399, "Maximum dimension of RNN cell")
flags.DEFINE_integer("dim_hidden", 200, "Dimension of hidden layer")
flags.DEFINE_integer("dim_hidden_min", 200, "Minimum dimension of hidden layer")
flags.DEFINE_integer("dim_hidden_max", 399, "Maximum dimension of hidden layer")
flags.DEFINE_integer("dim_embed_unigram", 30, "Dimension of unigram embedding")
flags.DEFINE_integer("dim_embed_unigram_min", 10, "Minimum dimension of unigram embedding")
flags.DEFINE_integer("dim_embed_unigram_max", 100, "Maximum dimension of unigram embedding")
flags.DEFINE_integer("dim_embed_bigram", 100, "Dimension of bigram embedding")
flags.DEFINE_integer("dim_embed_bigram_min", 30, "Minimum dimension of bigram embedding")
flags.DEFINE_integer("dim_embed_bigram_max", 200, "Maximum dimension of bigram embedding")
flags.DEFINE_integer("dim_embed_trigram", 130, "Dimension of trigram embedding")
flags.DEFINE_integer("dim_embed_trigram_min", 30, "Minimum dimension of trigram embedding")
flags.DEFINE_integer("dim_embed_trigram_max", 320, "Maximum dimension of trigram embedding")
flags.DEFINE_integer("lstm_layer", 1, "Layer number of RNN")
flags.DEFINE_integer("lstm_layer_min", 1, "Minimum layer number of RNN")
flags.DEFINE_integer("lstm_layer_max", 1, "Maximum layer number of RNN")
flags.DEFINE_float("lstm_dropout", 0.5, "Dropout of RNN cell")
flags.DEFINE_float("lstm_dropout_min", 0.3, "Minimum dropout of RNN cell")
flags.DEFINE_float("lstm_dropout_max", 0.8, "Maximum dropout of RNN cell")
flags.DEFINE_float("hidden_dropout", 0.5, "Dropout rate of hidden layer")
flags.DEFINE_float("hidden_dropout_min", 0.3, "Minimum dropout rate of hidden layer")
flags.DEFINE_float("hidden_dropout_max", 0.8, "Maximum dropout rate of hidden layer")
flags.DEFINE_float("learning_rate", 0.01, "Learning rate of the optimizer")
flags.DEFINE_float("learning_rate_min", 5e-3, "Minimum learning rate of the optimizer")
flags.DEFINE_float("learning_rate_max", 5e-2, "Maximum learning rate of the optimizer")

# Model settings
flags.DEFINE_boolean("default_params", True, "True to use default params")
flags.DEFINE_boolean("ensemble", True, "True to use ensemble ngram")
flags.DEFINE_boolean("embed", True, "True to use embedding table")
flags.DEFINE_boolean("embed_trainable", False, "True to make the embedding table trainable")
flags.DEFINE_boolean("ethnicity", False, "True to test on ethnicity")
flags.DEFINE_boolean("is_train", True, "True for training, False for testing")
flags.DEFINE_boolean("is_valid", True, "True for validation, False for testing")
flags.DEFINE_boolean("continue_train", False, "True to continue training from saved checkpoint. False for restarting.")
flags.DEFINE_boolean("save", False, "True to save")
flags.DEFINE_string("model_name", "default", "Model name, auto saved as YMDHMS")
flags.DEFINE_string("checkpoint_dir", "./checkpoint/", "Directory name to save the checkpoints [checkpoint]")
flags.DEFINE_string("data_dir", "data/raw", "Directory name of input data")
flags.DEFINE_string("valid_result_path", "result/validation", "Validation result save path")
flags.DEFINE_string("pred_result_path", "result/pred.txt", "Prediction result save path")
flags.DEFINE_string("detail_result_path", "result/detail.txt", "Detailed result save path")

FLAGS = flags.FLAGS


def sample_parameters(params):
    """Pick the hyper-parameter combination for one validation run.

    The nine tunables are reported in a fixed order: dim_hidden,
    dim_rnn_cell, learning_rate, lstm_dropout, lstm_layer, hidden_dropout,
    dim_embed_unigram, dim_embed_bigram, dim_embed_trigram.

    When ``params['default_params']`` is truthy the current values are
    reported unchanged.  Otherwise each tunable is redrawn with
    ``np.random.uniform(params[name + '_min'], params[name + '_max'])``,
    post-processed (see the table below) and written back into ``params``.

    :param params: dict of settings; it is modified in place when sampling.

    :return:
        ``(params, combination)`` where ``params`` is the same dict object
        and ``combination`` is the list of the nine chosen values.
    """
    def floor_to_multiple(step):
        # Round the raw draw down to a multiple of `step`.
        return lambda raw: int(raw // step) * step

    def five_decimals(raw):
        # Keep five decimal places by round-tripping through a string.
        return float('{0:.5f}'.format(raw))

    # (name, post-processing) pairs; the order fixes both the layout of the
    # returned list and the order in which random numbers are consumed.
    tunables = (
        ('dim_hidden', floor_to_multiple(50)),
        ('dim_rnn_cell', floor_to_multiple(50)),
        ('learning_rate', five_decimals),
        ('lstm_dropout', five_decimals),
        ('lstm_layer', int),
        ('hidden_dropout', five_decimals),
        ('dim_embed_unigram', floor_to_multiple(10)),
        ('dim_embed_bigram', floor_to_multiple(10)),
        ('dim_embed_trigram', floor_to_multiple(10)),
    )

    combination = [params[name] for name, _ in tunables]

    if params['default_params']:
        return params, combination

    for slot, (name, convert) in enumerate(tunables):
        raw = np.random.uniform(params[name + '_min'], params[name + '_max'])
        chosen = convert(raw)
        combination[slot] = chosen
        params[name] = chosen

    return params, combination


# Save default params and set scope
# NOTE(review): this reads the private `__flags` attribute and treats it as a
# plain name -> value dict; confirm that holds for the installed TensorFlow.
saved_params = FLAGS.__flags
if saved_params['ensemble']:
    model_name = 'ensemble'
elif saved_params['ngram'] == 1:
    model_name = 'unigram'
elif saved_params['ngram'] == 2:
    model_name = 'bigram'
elif saved_params['ngram'] == 3:
    model_name = 'trigram'
else:
    # The previous `assert True, ...` could never fail, so an unsupported
    # ngram fell through to a NameError on `model_name`; fail explicitly.
    raise ValueError('Not supported ngram %d' % saved_params['ngram'])
model_name += '_embedding' if saved_params['embed'] else '_no_embedding'
saved_params['model_name'] = '%s' % model_name
saved_params['checkpoint_dir'] += model_name
pprint.PrettyPrinter().pprint(saved_params)
saved_dataset = get_data(saved_params)

# The context manager closes the result file even if an iteration raises.
with open(saved_params['valid_result_path'], 'a') as validation_writer:
    validation_writer.write(model_name + "\n")
    # Header mirrors the nine entries of `combination`, in order.
    validation_writer.write("[dim_hidden, dim_rnn_cell, learning_rate, lstm_dropout, lstm_layer, hidden_dropout, "
                            "dim_embed_unigram, dim_embed_bigram, dim_embed_trigram]\n")
    validation_writer.write("combination\ttop1\ttop5\tepoch\n")

    # Run the model
    for _ in range(saved_params['valid_iteration']):
        # Sample parameter sets on a copy so the saved defaults stay intact
        params, combination = sample_parameters(saved_params.copy())
        dataset = saved_dataset[:]

        # Initialize embeddings
        uni_init = get_char2vec(dataset[0][0][:], params['dim_embed_unigram'], dataset[3][0])
        bi_init = get_char2vec(dataset[0][1][:], params['dim_embed_bigram'], dataset[3][4])
        tri_init = get_char2vec(dataset[0][2][:], params['dim_embed_trigram'], dataset[3][5])

        print(model_name, 'Parameter sets: ', end='')
        pprint.PrettyPrinter().pprint(combination)

        rnn_model = RNN(params, [uni_init, bi_init, tri_init])
        top1, top5, ep = experiment(rnn_model, dataset, params)

        # One tab-separated line per run: combination, top1, top5, epoch
        validation_writer.write(str(combination) + '\t')
        validation_writer.write(str(top1) + '\t' + str(top5) + '\tEp:' + str(ep) + '\n')

# NOTE(review): no `main` function is defined in this cell; confirm that
# calling tf.app.run() here is still intended.
tf.app.run()



ModuleNotFoundError: No module named 'tensorflow.contrib'