In [1]:
import re

import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

from nltk import word_tokenize
from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models import Doc2Vec

import tensorflow as tf

import matplotlib.pyplot as plt

In [4]:
train_title = np.load('./train_title_obesity.npy')
train_abstract = np.load('./train_abstract_obesity.npy')
train_claim = np.load('./train_claim_obesity.npy')

test_title = np.load('./test_title_obesity.npy')
test_abstract = np.load('./test_abstract_obesity.npy')
test_claim = np.load('./test_claim_obesity.npy')

In [9]:
print(train_title.shape)
print(train_abstract.shape)
print(train_claim.shape)

(104, 101)
(104, 101)
(104, 101)


In [10]:
train_label = train_title[:, -1]
test_label = test_title[:, -1]

In [13]:
train_label.shape ##104
test_label.shape ##44

(44,)

In [14]:
train_title = train_title[:, :-1].astype('float32')
train_abstract = train_abstract[:, :-1].astype('float32')
train_claim = train_claim[:, :-1].astype('float32')

test_title = test_title[:, :-1].astype('float32')
test_abstract = test_abstract[:, :-1].astype('float32')
test_claim = test_claim[:, :-1].astype('float32')

In [15]:
train_title.shape

(104, 100)

In [16]:
train_text = np.concatenate([train_title, train_abstract, train_claim], axis=1) ##(104,300 )

In [17]:
train_text.shape

(104, 300)

In [20]:
test_text = np.concatenate([test_title, test_abstract, test_claim], axis=1) ##(44,300)

In [21]:
test_text.shape

(44, 300)

In [26]:
tf.reshape(test_text, [-1, 1, 100, 1]).shape

TensorShape([Dimension(132), Dimension(1), Dimension(100), Dimension(1)])

In [22]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(train_text, train_label)



In [23]:
le = preprocessing.LabelEncoder()
le.fit(np.unique(train_label))

LabelEncoder()

In [28]:
le

LabelEncoder()

In [29]:
y_train_n = le.transform(y_train_res)
test_y = le.transform(test_label)

In [30]:
def inception(X, input_channel, first_channel):
    """
    :param X: np array([None, 300])
    :return:
    """
    # X reshape 필요한가
    if input_channel == 1:
        X = tf.reshape(X, [-1, 1, 100, 1])
    
    l1_size = [1, 1, input_channel, first_channel*2]
    l2_size = [1, 3, input_channel, first_channel*4]
    l3_size = [1, 5, input_channel, first_channel*1]
    p_size = [1, 1, input_channel, first_channel*1]
    
    W1 = tf.Variable(tf.random_normal(l1_size, stddev=0.01))
    L1 = tf.nn.conv2d(X, W1, strides=[1, 1, 1, 1], padding='SAME')
    L1 = tf.nn.relu(L1)

    W2 = tf.Variable(tf.random_normal(l2_size, stddev=0.01))
    L2 = tf.nn.conv2d(X, W2, strides=[1, 1, 1, 1], padding='SAME')
    L2 = tf.nn.relu(L2)

    W3 = tf.Variable(tf.random_normal(l3_size, stddev=0.01))
    L3 = tf.nn.conv2d(X, W3, strides=[1, 1, 1, 1], padding='SAME')
    L3 = tf.nn.relu(L3)

    P = tf.Variable(tf.random_normal(p_size, stddev=0.01))
    P = tf.nn.conv2d(X, P, strides=[1, 1, 1, 1], padding='SAME')
    P = tf.nn.relu(P)
    P = tf.nn.max_pool(P, ksize=[1, 1, 3, 1], strides=[1, 1, 1, 1], padding='SAME')

    return tf.concat([L1, L2, L3, P], axis=3)

In [31]:
def CNN(X, input_shape, filter_size, pool_size=[1, 1, 3, 1]):
    X = tf.reshape(X, input_shape)
    W = tf.Variable(tf.random_normal(filter_size, stddev=0.01))
    L = tf.nn.conv2d(X, W, strides=[1, 1, 1, 1], padding='SAME')
    L = tf.nn.relu(L)
    # POOLING
    L = tf.nn.max_pool(L, ksize=pool_size, strides=[1, 1, 1, 1], padding='SAME')
    return L

In [32]:
tf.reset_default_graph()

In [39]:
from collections import Counter as counter
counter(y_train_n)

Counter({0: 42, 1: 42, 2: 42, 3: 42, 4: 42, 5: 42})

In [41]:
y_one_hot

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [52]:
title_value = tf.placeholder(tf.float32, [None, 100])
abstract_value = tf.placeholder(tf.float32, [None, 100])
claim_value = tf.placeholder(tf.float32, [None, 100])

y_one_hot = np.eye(6)[y_train_n]
test_y_one_hot = np.eye(6)[test_y]

train_epochs = 10000
keep_prob = tf.placeholder(tf.float32)
label = tf.placeholder(tf.int32, [None, 6])


# Inception
title = inception(title_value, 1, 32)
abstract = inception(abstract_value, 1, 32)
claim = inception(claim_value, 1, 32)

ob_text = tf.concat([title, abstract, claim], axis=3) # Shape : [246, 1, 100 768]  # axis=3으로 해보자

In [53]:
ob_text

<tf.Tensor 'concat_19:0' shape=(?, 1, 100, 768) dtype=float32>

In [54]:
# CNN
L1 = CNN(ob_text, input_shape=[-1, 1, 100, 768], filter_size=[1, 3, 768, 512])
L2 = CNN(L1, input_shape=[-1, 1, 100, 512], filter_size=[1, 2, 512, 256])

# Inception
L3 = inception(L2, 256, 32)

# CNN
L4 = CNN(L3, [-1, 1, 100, 256], filter_size=[1, 3, 256, 128])
L5 = CNN(L4, [-1, 1, 100, 128], filter_size=[1, 3, 128, 64])



text_flatten = tf.reshape(L5, [-1, 1*100*64])

O_W1 = tf.get_variable('O_W1', shape=[1*100*64, 2048], initializer=tf.contrib.layers.xavier_initializer())
# SH_W1 = tf.Variable(tf.random_normal([900, 512]))
O_B1 = tf.Variable(tf.random_normal([2048]))
L1 = tf.nn.relu(tf.matmul(text_flatten, O_W1) + O_B1)
L1 = tf.nn.dropout(L1, keep_prob=keep_prob)


O_W2 = tf.get_variable('O_W2', shape=[2048, 1024], initializer=tf.contrib.layers.xavier_initializer())
# SH_W2 = tf.Variable(tf.random_normal([512, 256]))
O_B2 = tf.Variable(tf.random_normal([1024]))
L2 = tf.nn.relu(tf.matmul(L1, O_W2) + O_B2)
L2 = tf.nn.dropout(L2, keep_prob=keep_prob)


O_W3 = tf.get_variable('O_W3', shape=[1024, 256], initializer=tf.contrib.layers.xavier_initializer())
O_B3 = tf.Variable(tf.random_normal([256]))
L3 = tf.nn.relu(tf.matmul(L2, O_W3) + O_B3)
L3 = tf.nn.dropout(L3, keep_prob=keep_prob)

O_W4 = tf.get_variable('O_W4', shape=[256, 64], initializer=tf.contrib.layers.xavier_initializer())
# O_W3 = tf.Variable(tf.random_normal([128, 32]))
O_B4 = tf.Variable(tf.random_normal([64]))
L4 = tf.nn.relu(tf.matmul(L3, O_W4) + O_B4)
L4 = tf.nn.dropout(L4, keep_prob=keep_prob)

O_W5 = tf.get_variable('O_W5', shape=[64, 6], initializer=tf.contrib.layers.xavier_initializer())
# O_W3 = tf.Variable(tf.random_normal([128, 32]))
O_B5 = tf.Variable(tf.random_normal([6]))


#hypo = tf.nn.relu(tf.add(tf.matmul(feature, O_W1), O_B))
hypo = tf.matmul(L4, O_W5) + O_B5

In [55]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypo, labels=label))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)
prediction = tf.argmax(hypo, 1)
correct_prediction = tf.equal(prediction, tf.argmax(y_one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
#     sess.run(init)
    sess.run(tf.initialize_all_variables())

    for step in range(100):
        sess.run(optimizer, feed_dict={title_value: x_train_res[:, :100], abstract_value: x_train_res[:, 100:200], 
                                       claim_value: x_train_res[:, 200:], label: y_one_hot, keep_prob:0.7})
        if step % 5 == 0: # Test Accuracy 찍어보기!
            loss, acc = sess.run([cost, accuracy], feed_dict={title_value: x_train_res[:, :100], abstract_value: x_train_res[:, 100:200], 
                                       claim_value: x_train_res[:, 200:], label: y_one_hot, keep_prob:0.7})
            
            pred_val = sess.run(prediction, feed_dict={title_value: test_title,
                                                       abstract_value: test_abstract,
                                                       claim_value: test_claim, keep_prob: 1})  # claim, ab 순서 바뀜
            correct = sum(pred_val == test_y)
            test_acc = round((correct / len(pred_val)), 2)
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}\tTest ACC: {:%}".format(
                step, loss, acc, test_acc))

    # Let's see if we can predict
    pred = sess.run(prediction, feed_dict={title_value: test_title,
                                                       abstract_value: test_abstract,
                                                       claim_value: test_claim, keep_prob: 1})
    # y_data: (N,1) = flatten => (N, ) matches pred.shape
    total_cnt = 0
    right_cnt = 0
    for p, y in zip(pred, test_y):
        total_cnt+=1
        if p == int(y):
            right_cnt+=1
        print("[{}] Prediction: {} True Y: {}".format(p == int(y), p, int(y)))

print('Accuracy :', round(right_cnt/total_cnt, 3))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Step:     0	Loss: 3.952	Acc: 15.08%	Test ACC: 7.000000%
Step:     5	Loss: 2.199	Acc: 18.65%	Test ACC: 7.000000%
Step:    10	Loss: 2.090	Acc: 19.05%	Test ACC: 7.000000%
Step:    15	Loss: 1.909	Acc: 17.06%	Test ACC: 9.000000%
Step:    20	Loss: 1.860	Acc: 23.02%	Test ACC: 14.000000%
Step:    25	Loss: 1.740	Acc: 26.59%	Test ACC: 14.000000%
Step:    30	Loss: 1.620	Acc: 31.35%	Test ACC: 16.000000%
Step:    35	Loss: 1.383	Acc: 49.21%	Test ACC: 14.000000%
Step:    40	Loss: 1.131	Acc: 54.76%	Test ACC: 14.000000%
Step:    45	Loss: 0.942	Acc: 67.46%	Test ACC: 16.000000%
Step:    50	Loss: 0.665	Acc: 78.17%	Test ACC: 18.000000%
Step:    55	Loss: 0.531	Acc: 82.14%	Test ACC: 18.000000%
Step:    60	Loss: 0.468	Acc: 83.73%	Test ACC: 18.000000%
Step:    65	Loss: 0.328	Acc: 89.29%	Test ACC: 23.000000%
Step:    70	Loss: 0.215	Acc: 90.87%	Test ACC: 32.000000%
Step:    75	Loss: 0.147	Acc: 95.63%	Test ACC: 32.000000%
Step:    80	Loss: 

In [56]:
tf.reset_default_graph()

In [57]:
train_epochs = 10000
keep_prob = tf.placeholder(tf.float32)
feature = tf.placeholder(tf.float32, [None, 300])
label = tf.placeholder(tf.int32, [None, 6])


OB_W1 = tf.get_variable('OB_W1', shape=[300, 512], initializer=tf.contrib.layers.xavier_initializer())
# OB_W1 = tf.Variable(tf.random_normal([900, 512]))
OB_B = tf.Variable(tf.random_normal([512]))
L1 = tf.nn.relu(tf.matmul(feature, OB_W1) + OB_B)
L1 = tf.nn.dropout(L1, keep_prob=keep_prob)


OB_W2 = tf.get_variable('OB_W2', shape=[512, 256], initializer=tf.contrib.layers.xavier_initializer())
# OB_W2 = tf.Variable(tf.random_normal([512, 256]))
OB_B2 = tf.Variable(tf.random_normal([256]))
L2 = tf.nn.relu(tf.matmul(L1, OB_W2) + OB_B2)
L2 = tf.nn.dropout(L2, keep_prob=keep_prob)


OB_W3 = tf.get_variable('OB_W3', shape=[256, 32], initializer=tf.contrib.layers.xavier_initializer())
# OB_W3 = tf.Variable(tf.random_normal([256, 32]))
OB_B3 = tf.Variable(tf.random_normal([32]))
L3 = tf.nn.relu(tf.matmul(L2, OB_W3) + OB_B3)
L3 = tf.nn.dropout(L3, keep_prob=keep_prob)


OB_W4 = tf.get_variable('OB_W4', shape=[32, 6], initializer=tf.contrib.layers.xavier_initializer())
# OB_W4 = tf.Variable(tf.random_normal([32, 6]))
OB_B4 = tf.Variable(tf.random_normal([6]))
#L4 = tf.nn.relu(tf.matmul(feature, OB_W1) + OB_B4)

#hypo = tf.nn.relu(tf.add(tf.matmul(feature, OB_W1), OB_B))
hypo = tf.matmul(L3, OB_W4) + OB_B4
# hypo = tf.add(tf.matmul(), B)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypo, labels=label))
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
prediction = tf.argmax(hypo, 1)
correct_prediction = tf.equal(prediction, tf.argmax(y_one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [58]:
with tf.Session() as sess:
#     sess.run(init)
    sess.run(tf.initialize_all_variables())

    for step in range(1000):
        sess.run(optimizer, feed_dict={feature: x_train_res, label: y_one_hot, keep_prob:1})
        if step % 50 == 0: # Test Accuracy 찍어보기!
            loss, acc = sess.run([cost, accuracy], feed_dict={feature: x_train_res, label: y_one_hot, keep_prob:1})
            pred_val = sess.run(prediction, feed_dict={feature: test_text, keep_prob: 1})
            test_acc = round(sum(pred_val == test_y)/len(pred_val), 2)
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}\tTest ACC: {:%}".format(
                step, loss, acc, test_acc))

    # Let's see if we can predict
    pred = sess.run(prediction, feed_dict={feature: test_text, keep_prob:1})
    # y_data: (N,1) = flatten => (N, ) matches pred.shape
    total_cnt = 0
    right_cnt = 0
    for p, y in zip(pred, test_y):
        total_cnt+=1
        if p == int(y):
            right_cnt+=1
        print("[{}] Prediction: {} True Y: {}".format(p == int(y), p, int(y)))

print('Accuracy :', round(right_cnt/total_cnt, 3))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Step:     0	Loss: 2.297	Acc: 16.67%	Test ACC: 25.000000%
Step:    50	Loss: 0.538	Acc: 98.02%	Test ACC: 36.000000%
Step:   100	Loss: 0.107	Acc: 99.60%	Test ACC: 34.000000%
Step:   150	Loss: 0.032	Acc: 99.60%	Test ACC: 34.000000%
Step:   200	Loss: 0.014	Acc: 100.00%	Test ACC: 32.000000%
Step:   250	Loss: 0.007	Acc: 100.00%	Test ACC: 34.000000%
Step:   300	Loss: 0.004	Acc: 100.00%	Test ACC: 34.000000%
Step:   350	Loss: 0.003	Acc: 100.00%	Test ACC: 30.000000%
Step:   400	Loss: 0.002	Acc: 100.00%	Test ACC: 32.000000%
Step:   450	Loss: 0.001	Acc: 100.00%	Test ACC: 32.000000%
Step:   500	Loss: 0.001	Acc: 100.00%	Test ACC: 32.000000%
Step:   550	Loss: 0.001	Acc: 100.00%	Test ACC: 32.000000%
Step:   600	Loss: 0.001	Acc: 100.00%	Test ACC: 34.000000%
Step:   650	Loss: 0.000	Acc: 100.00%	Test ACC: 36.000000%
Step:   700	Loss: 0.000	Acc: 100.00%	Test ACC: 36.000000%
Step:   750	Loss: 0.000	Acc: 100.00%	Test ACC: 34.000000%
St

In [74]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [75]:
patent_tree = RandomForestClassifier(criterion='entropy', max_depth=3, random_state=0)
patent_tree.fit( x_train_res, y_one_hot)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [76]:
from sklearn.metrics import accuracy_score

y_pred_tr = patent_tree.predict(test_text)

print('Accuracy: %.2f' % accuracy_score(test_y_one_hot, y_pred_tr))

Accuracy: 0.00
