# 2019/12/03 CoE 202 Activity 5

### **Name Classification**<br/>

**Professor: Yong Hoon, Lee**<br/>

**TA: Seungjun Moon, Beomgu Kang**

In [1]:
import tensorflow as tf
import numpy as np
import os.path
import string

# Checkpoint path prefix handed to saver.save() once training finishes.
model_save_path = 'tmp/model.ckpt'
# Clear the default TF1 graph so the ops defined below start from an empty graph.
tf.reset_default_graph()

### Hyperparameters

In [0]:
# Step size passed to the Adam optimizer.
learning_rate       = 0.005
# The training loop runs n_epoch + 1 full-batch steps (range(n_epoch+1)).
n_epoch             = 200
# Only used to size out_weights; the active DNN layers use their own widths.
n_hidden            = 128 # hidden layer features
# Names are padded with zero rows (or truncated at load time) to this length.
max_sequence_length = 19 # maximum number of characters is 19

In [0]:
# Character vocabulary: ASCII letters followed by the punctuation allowed in names.
all_letters = string.ascii_letters + " .,;'"
alphabet    = all_letters
n_input     = len(alphabet)  # width of one per-character one-hot row

# Class labels; a label's position in this list is its class index.
ethnicities = [
    'Chinese', 'Japanese', 'Vietnamese', 'Korean', 'Arabic', 'Czech',
    'Dutch', 'English', 'French', 'German', 'Greek', 'Irish',
    'Italian', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish',
]
n_classes = len(ethnicities)  # the number of classes

# Accumulators filled by the data-loading cell (each one is its own list object).
name_strings = []
ethnicity_strings = []
str_list = []
names_list = []
ethnicity_list = []

## Define functions

In [0]:
def weight_variable(shape):
    """Return a tf.Variable of the given shape initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

In [0]:
def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

In [0]:
def name_one_hot(name, max_sequence_length):
    """Encode a name as a zero-padded matrix of per-character one-hot rows.

    Args:
        name: string whose characters must all occur in the module-level
            ``alphabet`` (a space counts as a character).
        max_sequence_length: minimum number of rows in the result; shorter
            names are padded with all-zero rows.

    Returns:
        Integer array of shape ``(max(len(name), max_sequence_length), n_input)``
        where row ``i`` has a single 1 at ``alphabet.index(name[i])``.
        Names longer than ``max_sequence_length`` are not truncated here.

    Raises:
        ValueError: if ``name`` contains a character that is not in ``alphabet``.
    """
    n_rows = max(len(name), max_sequence_length)
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    encoded = np.zeros((n_rows, n_input), dtype=int)
    for position, char in enumerate(name):
        column = alphabet.find(char)
        if column < 0:
            raise ValueError(
                "character %r in name %r is not in the alphabet" % (char, name)
            )
        encoded[position, column] = 1
    return encoded

In [0]:
def ethnicity_one_hot(ethnicity):
    """Return a length-``n_classes`` integer one-hot vector for an ethnicity label.

    Args:
        ethnicity: a label that appears in the module-level ``ethnicities`` list.

    Returns:
        1-D integer array with a 1 at ``ethnicities.index(ethnicity)``.

    Raises:
        ValueError: if ``ethnicity`` is not in ``ethnicities`` (from ``list.index``).
    """
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    v = np.zeros(n_classes, dtype=int)
    v[ethnicities.index(ethnicity)] = 1
    return v

## Data load 

In [0]:
# Read "name,ethnicity" rows and keep only those whose ethnicity is one of the
# known classes. The raw strings and their one-hot encodings are appended to
# the module-level lists in the same order.
with open('names_revised.csv', 'r') as names_file:
    for line in names_file:
        fields = [s.strip() for s in line.split(',')]  # expected: [name, ethnicity, ...]
        if len(fields) < 2:
            # Blank or malformed row (e.g. a trailing empty line): nothing to parse.
            continue
        name, ethnicity = fields[0], fields[1]
        if ethnicity in ethnicities:
            name_strings.append(name)  # original, untruncated name
            ethnicity_strings.append(ethnicity)
            # Only the first max_sequence_length characters are encoded.
            names_list.append(name_one_hot(name[:max_sequence_length], max_sequence_length))  # one-hot rows, one per character
            ethnicity_list.append(ethnicity_one_hot(ethnicity))  # one-hot vector of ethnicity

## Training - Test Separation

In [0]:
# Shuffle inputs and labels in place with the same permutation: capture the
# global NumPy RNG state, shuffle the first list, restore the state, then
# shuffle the second list (both lists have one entry per kept CSV row).
# NOTE(review): no seed is set in the visible cells, so the resulting
# train/test split presumably differs between kernel runs.
rng_state = np.random.get_state() # use the same random number generator state
np.random.shuffle(names_list)     # when shuffling the two lists
np.random.set_state(rng_state)    # they are effectively shuffled in parallel so that inputs still correspond to outputs after shuffling
np.random.shuffle(ethnicity_list)

In [0]:
# Split the (already shuffled) samples: the first two thirds are used for
# training and the remainder for testing.
size = len(names_list)
# np.int was removed in NumPy 1.24; integer floor division yields the same count.
train_size = (size * 2) // 3

training_X = np.array(names_list[:train_size])
training_y = np.array(ethnicity_list[:train_size])
testing_X = np.array(names_list[train_size:])
testing_y = np.array(ethnicity_list[train_size:])

## Build a model

In [0]:
# Graph inputs; the leading dimension is None so any batch size can be fed.
# X: one-hot encoded names, (batch, max_sequence_length, n_input).
X = tf.placeholder(tf.float32, [None, max_sequence_length, n_input])
# y: one-hot encoded labels, (batch, n_classes).
y = tf.placeholder(tf.float32, [None, n_classes])

In [0]:
# Output projection for the RNN variant. These are only referenced by the
# commented-out RNN output line further down; the active DNN model computes its
# logits from its own weight3 variable instead.
out_weights = weight_variable([n_hidden, n_classes])
out_biases = bias_variable([n_classes])

In [0]:
# Basic RNN
# cells = tf.contrib.rnn.BasicRNNCell(num_units = 128)
# LSTM
# cells = tf.contrib.rnn.BasicLSTMCell(num_units = 128)
# GRU
# cells = tf.contrib.rnn.GRUCell(num_units = 128)
# Modified DNN
# Flatten each (max_sequence_length, n_input) one-hot name into a single feature vector.
x = tf.reshape(X, [-1, max_sequence_length * n_input])

n_dnn_units = 256  # width of both hidden layers

w_init = tf.variance_scaling_initializer()
b_init = tf.constant_initializer(0.)

## 1st hidden layer
w1 = tf.get_variable('weight1', [max_sequence_length * n_input, n_dnn_units], initializer=w_init)  # weight for 1st hidden layer
b1 = tf.get_variable('biases1', [n_dnn_units], initializer=b_init)                                 # bias for 1st hidden layer
h1  = tf.matmul(x, w1) + b1                                                                        # matrix multiplication
h1  = tf.nn.relu(h1)                                                                               # relu activation

## 2nd hidden layer
w2 = tf.get_variable('weight2', [n_dnn_units, n_dnn_units], initializer=w_init)                    # weight for 2nd hidden layer
b2 = tf.get_variable('biases2', [n_dnn_units], initializer=b_init)                                 # bias for 2nd hidden layer
h2  = tf.matmul(h1, w2) + b2                                                                       # matrix multiplication
h2  = tf.nn.relu(h2)                                                                               # relu activation

## output layer
# Sized from n_classes rather than a literal 18 so the logits always match the
# label placeholder y, even if the ethnicities list changes.
w3 = tf.get_variable('weight3', [n_dnn_units, n_classes], initializer=w_init)                      # weight for output layer

# Logits (pre-softmax scores); the output layer has no bias term.
y_ = tf.matmul(h2, w3)

In [0]:
# y_ = tf.matmul(outputs[:,-1,:], out_weights) + out_biases # predict y based on final rnn output

In [0]:
# Mean softmax cross-entropy between the logits y_ and the one-hot labels y.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_, labels=y))
# Op that applies one Adam update minimising that loss.
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

In [0]:
# Evaluation
# A sample counts as correct when the arg-max of the logits equals the arg-max
# of its one-hot label; accuracy is the mean of those 0/1 outcomes over the batch.
correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [0]:
# Softmax
# Turn the logits into per-class probabilities; used by the interactive
# prediction cell to rank ethnicities for a typed name.
pred = tf.nn.softmax(y_)

In [0]:
# Op that initialises every variable defined so far in the graph.
init = tf.global_variables_initializer()
# Saver used after training to write the checkpoint to model_save_path.
saver = tf.train.Saver()

## Train a model

In [0]:
# An InteractiveSession installs itself as the default session, which is what
# lets the later cells call accuracy.eval() / pred.eval() without passing one.
sess = tf.InteractiveSession()
sess.run(init)

In [20]:


# Full-batch training: every step feeds the whole training set once. Accuracy
# on the training and test sets is reported every 10 steps.
# The loop variable is named `epoch` because it is read inside the loop; `_`
# conventionally marks an unused value and is also IPython's last-output name.
for epoch in range(n_epoch+1):
    sess.run(train_step, feed_dict={X: training_X, y: training_y})
    if epoch % 10 == 0:
        train_accuracy = accuracy.eval(feed_dict={X:training_X, y:training_y})
        print("step %d, training accuracy %g"%(epoch, train_accuracy))
        test_accuracy = accuracy.eval(feed_dict={X:testing_X, y:testing_y})
        print("testing accuracy", test_accuracy)

# Make sure the checkpoint directory exists so saving does not fail on a
# fresh working directory.
save_dir = os.path.dirname(model_save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
saver.save(sess, model_save_path)
print("Model saved in file: %s" % model_save_path)

step 0, training accuracy 0.471898
testing accuracy 0.46196383
step 10, training accuracy 0.479596
testing accuracy 0.47018382
step 20, training accuracy 0.654709
testing accuracy 0.63682556
step 30, training accuracy 0.715321
testing accuracy 0.69406664
step 40, training accuracy 0.760688
testing accuracy 0.729039
step 50, training accuracy 0.803812
testing accuracy 0.75683755
step 60, training accuracy 0.844768
testing accuracy 0.78314155
step 70, training accuracy 0.885501
testing accuracy 0.7868779
step 80, training accuracy 0.91719
testing accuracy 0.7850844
step 90, training accuracy 0.941704
testing accuracy 0.78538334
step 100, training accuracy 0.957474
testing accuracy 0.7834405
step 110, training accuracy 0.967339
testing accuracy 0.7806008
step 120, training accuracy 0.972646
testing accuracy 0.7785084
step 130, training accuracy 0.976158
testing accuracy 0.77611715
step 140, training accuracy 0.977803
testing accuracy 0.7740248
step 150, training accuracy 0.978625
testing 

In [21]:
# Interactive demo: ask for five last names and print the three most probable
# ethnicities for each, using the trained softmax output `pred`.
n_queries = 5
for query in range(n_queries):
    input_name = input('Enter a last name (max 19 letters):')

    # Re-prompt until the name is non-empty, fits in max_sequence_length and
    # uses only characters the encoder knows (anything else would make
    # name_one_hot raise). Python 3 has no raw_input, so input() is used here too.
    while (len(input_name) == 0
           or len(input_name) > max_sequence_length
           or any(ch not in alphabet for ch in input_name)):
        input_name = input('Invalid input. Enter a last name (max 19 letters):')

    # Add a leading batch dimension of 1 to match the X placeholder.
    encoded = np.expand_dims(name_one_hot(input_name, max_sequence_length), axis=0)
    result = pred.eval(feed_dict={X: encoded})[0]
    idx = np.argsort(result)[::-1]  # class indices, most probable first
    print("\n(%s): %.4f" % (ethnicities[idx[0]], result[idx[0]]))
    for rank in (1, 2):
        print("(%s): %.4f" % (ethnicities[idx[rank]], result[idx[rank]]))
    print("==========================================")

Enter a last name (max 19 letters):Kim

(Korean): 0.5832
(Vietnamese): 0.3729
(English): 0.0190
Enter a last name (max 19 letters):Han

(Vietnamese): 0.4924
(Korean): 0.4850
(Chinese): 0.0139
Enter a last name (max 19 letters):blanc

(English): 0.9708
(Czech): 0.0205
(Spanish): 0.0075
Enter a last name (max 19 letters):sebastian

(Russian): 0.9997
(Greek): 0.0003
(Italian): 0.0000
Enter a last name (max 19 letters):andrew

(English): 0.9592
(Scottish): 0.0323
(Dutch): 0.0082


## 4. Report

### a. Use the GRU, LSTM, and simple RNN cells for training, and compare their results.

### b. Replace the RNN with DNN as below.

In [0]:
# Reference DNN for part (b) of the report.
# The variable names below ('weight1', 'biases1', ...) are the same ones the
# model cell above creates with tf.get_variable, so a plain second
# tf.get_variable call in the same graph would raise ValueError. Re-entering
# the current variable scope with reuse=tf.AUTO_REUSE returns the existing
# variables when they are present and creates them otherwise.
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    # Flatten each (max_sequence_length, n_input) one-hot name into one feature vector.
    x = tf.reshape(X, [-1, max_sequence_length * n_input])

    w_init = tf.variance_scaling_initializer()
    b_init = tf.constant_initializer(0.)

    ## 1st hidden layer
    w1 = tf.get_variable('weight1', [max_sequence_length * n_input, 256], initializer=w_init)  # weight for 1st hidden layer which has 256 units
    b1 = tf.get_variable('biases1', [256], initializer=b_init)                                 # bias for 1st hidden layer which has 256 units
    h1  = tf.matmul(x, w1) + b1                                                                # matrix multiplication
    h1  = tf.nn.relu(h1)                                                                       # relu activation

    ## 2nd hidden layer
    w2 = tf.get_variable('weight2', [256, 256], initializer=w_init)                            # weight for 2nd hidden layer which has 256 units
    b2 = tf.get_variable('biases2', [256], initializer=b_init)                                 # bias for 2nd hidden layer which has 256 units
    h2  = tf.matmul(h1, w2) + b2                                                               # matrix multiplication
    h2  = tf.nn.relu(h2)                                                                       # relu activation

    ## output layer
    # Sized from n_classes rather than a literal 18 so it matches the label placeholder y.
    w3 = tf.get_variable('weight3', [256, n_classes], initializer=w_init)                      # weight for output layer

    # Logits (pre-softmax scores); the output layer has no bias term.
    y_ = tf.matmul(h2, w3)

### Submission (Due: Dec. 10 Tue.)
Submit your report by Tuesday, December 10 to **"june1212@kaist.ac.kr"**