In [1]:
import tensorflow as tf
import pickle
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from konlpy.tag import Kkma

In [2]:
if 'data' not in globals():
    data = pickle.load(open("./train_val_test.pkl", "rb"))

In [3]:
for k, v in data.items():
    print(k, v.shape)

X_val (1741613,)
X_train (5224838,)
y_val (1741613,)
X_test (1741613,)
y_test (1741613,)
y_train (5224838,)


In [4]:
kkma = Kkma()
def space_tokenizer(string):    
    return kkma.nouns(string.strip())

In [5]:
D = 10000
N = 10000 #5224838 // 2

In [6]:
N_sample = N
small_data = {
    'X_train': data['X_train'][:N_sample],
    'y_train': data['y_train'][:N_sample],
    
    'X_val'  : data['X_val'][:N_sample//3+1],
    'y_val'  : data['y_val'][:N_sample//3+1],
    
    'X_test' : data['X_test'][:N_sample//3+1],
    'y_test' : data['y_test'][:N_sample//3+1]    
}

In [7]:
fh = CountVectorizer(tokenizer=space_tokenizer, ngram_range=(1,2))
X_train = fh.fit_transform(small_data['X_train']).toarray()
X_val = fh.transform(small_data['X_val']).toarray()

In [8]:
D = X_train.shape[1]

In [9]:
print(X_train.shape)
print(X_val.shape)

(10000, 52508)
(3334, 52508)


In [10]:
oh = OneHotEncoder()
oh_train = oh.fit_transform(small_data['y_train'].reshape(-1, 1)).toarray()
oh_val   = oh.transform(small_data['y_val'].reshape(-1, 1)).toarray()

In [11]:
tf.reset_default_graph()
x = tf.placeholder(tf.float32, shape=[None, D])
keep_prob = tf.placeholder(tf.float32)
reg = tf.placeholder(tf.float32, shape=None)

W1 = tf.get_variable("W1", shape=(D, 1024), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.zeros([1024]), dtype=tf.float32)
z2 = tf.matmul(x, W1) + b1
b2 = tf.contrib.layers.batch_norm(z2)
r2 = tf.nn.relu(b2)
a2 = tf.nn.dropout(r2, keep_prob)

W2 = tf.get_variable("W2", shape=(1024, 10), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.zeros([10]), dtype=tf.float32)
z3 = tf.matmul(a2, W2) + b2
b3 = tf.contrib.layers.batch_norm(z3)
a3 = tf.nn.dropout(b3, keep_prob)             
y = tf.nn.softmax(a3) # Predict

y_ = tf.placeholder(tf.float32, [None, 10]) # Real

cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]) + 0.5*reg * tf.nn.l2_loss(W1) + 0.5*reg * tf.nn.l2_loss(W2))
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [13]:
HOW_MANY_PRINT = 20
batch_size = 64
epochs = 10
iterations_per_epoch = X_train.shape[0] // batch_size
TOTAL_ITERATIONS = epochs * iterations_per_epoch
PRINT_EVERY = TOTAL_ITERATIONS // HOW_MANY_PRINT
saver = tf.train.Saver()
#saver.restore(sess, "NN_model_1")
kfolds_params = dict()
kfolds_params = {
    "keep_prob": np.random.uniform(0.5, 1, size=5),
    "reg": np.random.uniform(0, 2, size=5),
    "best_params": kfolds_params.get("best_params", dict()),
    "best_acc": kfolds_params.get('best_acc', 0),
    "verbose": False,
}

for rg in kfolds_params['reg']:
    for k_p in kfolds_params['keep_prob']:        
        current_epoch = 1
        init = tf.initialize_all_variables()
        sess = tf.Session()
        sess.run(init)
        for i in range(TOTAL_ITERATIONS):    
            idx = np.random.choice(np.arange(N_sample), size=batch_size)
            batch_x, batch_y = X_train[idx], oh_train[idx]

            f_dict =  {
                        x : batch_x,
                        y_: batch_y,
                        keep_prob: k_p,
                        reg : rg
                      }


            if (i % PRINT_EVERY == 0 or i == TOTAL_ITERATIONS - 1) and kfolds_params['verbose']:
                #f_dict[keep_prob] = 1.0        
                loss = sess.run(cross_entropy, feed_dict = f_dict)
                print("step {}/{} loss: {}".format(i+1, TOTAL_ITERATIONS, loss))

            if i % iterations_per_epoch == 0 and kfolds_params['verbose']:

                train_acc = sess.run(accuracy, feed_dict={x: X_train, y_: oh_train, keep_prob: 1})
                val_acc = sess.run(accuracy, feed_dict={x: X_val, y_: oh_val, keep_prob: 1})
                print("[epoch {}/{}] train_acc: {:.4%} val_acc: {:.4%}".format(current_epoch, epochs, train_acc, val_acc))
                current_epoch += 1


            #f_dict[keep_prob] = 0.5
            sess.run(train_step, feed_dict = f_dict)


        val_acc = sess.run(accuracy, feed_dict={x: X_val, y_: oh_val, keep_prob: 1})
        if kfolds_params['verbose']:
            print("")
            print("final valid acc: {:.4%}".format(val_acc))
        
        if val_acc > kfolds_params['best_acc']:
            kfolds_params['best_acc'] = val_acc
            kfolds_params['best_params']['reg'] = rg
            kfolds_params['best_params']['keep_prob'] = k_p
            print("{} with Acc: {}".format(kfolds_params['best_params'], kfolds_params['best_acc']))

print("complete")

{'keep_prob': 0.74255695110058939, 'reg': 1.0153681480131223} with Acc: 0.5020995736122131
{'keep_prob': 0.67490416503738726, 'reg': 1.0153681480131223} with Acc: 0.5092981457710266
{'keep_prob': 0.74255695110058939, 'reg': 1.6204904327810024} with Acc: 0.5101979374885559
{'keep_prob': 0.98622708263671588, 'reg': 1.6204904327810024} with Acc: 0.52219557762146
complete


In [14]:
HOW_MANY_PRINT = 20
batch_size = 64
epochs = 10
iterations_per_epoch = X_train.shape[0] // batch_size
TOTAL_ITERATIONS = epochs * iterations_per_epoch
PRINT_EVERY = TOTAL_ITERATIONS // HOW_MANY_PRINT
saver = tf.train.Saver()
current_epoch = 1
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
kfolds_params['verbose'] = True
for i in range(TOTAL_ITERATIONS):    
    idx = np.random.choice(np.arange(N_sample), size=batch_size)
    batch_x, batch_y = X_train[idx], oh_train[idx]

    f_dict =  {
                x : batch_x,
                y_: batch_y,
                keep_prob: kfolds_params['best_params']['keep_prob'],
                reg : kfolds_params['best_params']['reg']/0.5
              }


    if (i % PRINT_EVERY == 0 or i == TOTAL_ITERATIONS - 1) and kfolds_params['verbose']:
        #f_dict[keep_prob] = 1.0        
        loss = sess.run(cross_entropy, feed_dict = f_dict)
        print("step {}/{} loss: {}".format(i+1, TOTAL_ITERATIONS, loss))

    if i % iterations_per_epoch == 0 and kfolds_params['verbose']:

        train_acc = sess.run(accuracy, feed_dict={x: X_train, y_: oh_train, keep_prob: 1})
        val_acc = sess.run(accuracy, feed_dict={x: X_val, y_: oh_val, keep_prob: 1})
        print("[epoch {}/{}] train_acc: {:.4%} val_acc: {:.4%}".format(current_epoch, epochs, train_acc, val_acc))
        current_epoch += 1


    #f_dict[keep_prob] = 0.5
    sess.run(train_step, feed_dict = f_dict)


val_acc = sess.run(accuracy, feed_dict={x: X_val, y_: oh_val, keep_prob: 1})
if kfolds_params['verbose']:
    print("")
    print("final valid acc: {:.4%}".format(val_acc))

step 1/1560 loss: 1645.2509765625
[epoch 1/10] train_acc: 9.2900% val_acc: 8.5783%
step 79/1560 loss: 6.90858268737793
step 157/1560 loss: 7.355433464050293
[epoch 2/10] train_acc: 47.5600% val_acc: 42.6815%
step 235/1560 loss: 6.942873954772949
step 313/1560 loss: 6.266216278076172
[epoch 3/10] train_acc: 50.5000% val_acc: 45.8608%
step 391/1560 loss: 6.470705032348633
step 469/1560 loss: 6.837285995483398
[epoch 4/10] train_acc: 51.7600% val_acc: 46.0708%
step 547/1560 loss: 5.033098220825195
step 625/1560 loss: 5.0623955726623535
[epoch 5/10] train_acc: 53.7500% val_acc: 46.7307%
step 703/1560 loss: 4.340278625488281
step 781/1560 loss: 4.201669692993164
[epoch 6/10] train_acc: 56.1700% val_acc: 49.6101%
step 859/1560 loss: 3.692446708679199
step 937/1560 loss: 3.4516656398773193
[epoch 7/10] train_acc: 56.5500% val_acc: 49.6101%
step 1015/1560 loss: 3.197599411010742
step 1093/1560 loss: 2.942538261413574
[epoch 8/10] train_acc: 56.6800% val_acc: 50.2999%
step 1171/1560 loss: 2.924

In [15]:
saver.save(sess, "NN_model_1")

'NN_model_1'

In [16]:
test_string = np.array([
        "꿀잼",
        "이영화 별로에요",
        "아오 진심 재미없네 개 쓰레기 영화 1점 주기도 아깝다 ㅡㅡ"
    ])
test = fh.transform(test_string).toarray()
pred = (sess.run(tf.argmax(y, 1)+1, feed_dict={x: test, keep_prob: 1}))
print(pred)

[10 10  1]


In [17]:
X_test = fh.transform(small_data['X_test']).toarray()

In [18]:
y_test = oh.transform(small_data['y_test'].reshape(-1, 1)).toarray()

In [19]:
sess.run(accuracy, feed_dict={x:X_test, y_: y_test, keep_prob: 1})

0.5233953

In [20]:
sess.close()