# 第8回講義 宿題

## 課題
RNNを用いてIMDbのsentiment analysisを実装してみましょう。

ネットワークの形などは特に制限を設けませんし、今回のLessonで扱った内容以外の工夫も組み込んでもらって構いません。

## 目標値
F値：0.9

## ルール
- 以下のセルで指定されている`x_train, t_train`以外の学習データは使わないでください。

## 提出方法
- 2つのファイルを提出していただきます。
  1. テストデータ (x_test) に対する予測ラベルを`submission_pred.csv`として保存し、Homeworkタブから**chap08**を選択して提出してください。
  2. それに対応するpythonのコードを`submission_code.py`として保存し、Homeworkタブから**chap08 (code)**を選択して提出してください。
    - セルに書いたコードを.py形式で保存するためには%%writefileコマンドなどを利用してください。
    - writefileコマンドではファイルの保存のみが行われセル内のpythonコード自体は実行されません。そのため、実際にコードを走らせる際にはwritefileコマンドをコメントアウトしてください


- コードの内容を変更した場合は、1と2の両方を提出し直してください。

- なお、採点は1で行い、2はコードの確認用として利用します。(成績優秀者はコード内容を公開させていただくかもしれません。)

- **宿題の締め切りは【出題週の翌週水曜日24時】です。**

## 評価方法

- 予測ラベルの（`t_test`に対する）F値で評価します。
- 毎日24時にテストデータの一部に対するF値でLeader Boardを更新します。
- 締切日の夜24時にテストデータ全体に対するF値でLeader Boardを更新します。これを最終的な評価とします。

## データの読み込み（このセルは修正しないでください）

In [23]:
import numpy as np

def load_dataset():
    # 学習データ
    x_train = np.load('/root/userspace/public/chap08/data/x_train.npy')
    t_train = np.load('/root/userspace/public/chap08/data/t_train.npy')
    
    # テストデータ
    x_test = np.load('/root/userspace/public/chap08/data/x_test.npy')

    return (x_train, x_test, t_train)

x_train, x_test, t_train = load_dataset()

## 実装

In [2]:
# %%writefile /root/userspace/chap08/materials/submission_code.py
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 関数定義 ###
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))

### レイヤー定義 ###
class Embedding:
    def __init__(self, vocab_size, emb_dim, scale = 0.08):
        self.V = tf.Variable(tf.random_normal(shape = [vocab_size, emb_dim], stddev = scale), name = "V")
        
    def __call__(self, x):
        return tf.nn.embedding_lookup(self.V, x)
    
class RNN:
    def __init__(self, hid_dim, seq_len = None, initial_state = None):
        self.cell = tf.nn.rnn_cell.BasicRNNCell(hid_dim)
        self.initial_state = initial_state
        self.seq_len = seq_len
    
    def __call__(self, x):
        if self.initial_state is None:
            self.initial_state = self.cell.zero_state(tf.shape(x)[0], tf.float32)
            
        # outputsは各系列長分以降は0になるので注意
        outputs, state = tf.nn.dynamic_rnn(self.cell, x, self.seq_len, self.initial_state)
        return tf.gather_nd(outputs, indices = tf.stack([tf.range(tf.shape(x)[0]), self.seq_len-1], axis = 1 ))

### グラフ構築 ###
tf.reset_default_graph()

emb_dim = 100
hid_dim = 50
# np.hstackで横方向に並べるので謎だが、要は
# x_train, x_test全てのなかで最もidが大きい単語は何かを抽出している?
num_words = max([max(s) for s in np.hstack((x_train, x_test))])
pad_index = 0

x = tf.placeholder(tf.int32, [None, None], name='x')
t = tf.placeholder(tf.float32, [None, None], name='t')

seq_len = tf.reduce_sum(tf.cast(tf.not_equal(x, pad_index), tf.int32), axis=1)

h = Embedding(num_words, emb_dim)(x)
h = RNN(hid_dim, seq_len)(h)
y = tf.layers.Dense(1, tf.nn.sigmoid)(h)

cost = -tf.reduce_mean(t*tf_log(y) + (1 - t)*tf_log(1 - y))
optimizer = tf.train.AdamOptimizer()
train = optimizer.minimize(cost)

test = tf.round(y)

### データの準備 ###
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train)

### 学習 ###
n_epochs = 5
batch_size = 100 # バッチサイズが大きいと、ResourceExhaustedErrorになることがあります
n_batches_train = len(x_train) // batch_size
n_batches_valid = len(x_valid) // batch_size
n_batches_test = len(x_test) // batch_size

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Train
        train_costs = []
        for i in range(n_batches_train):
            start = i * batch_size
            end = start + batch_size
            
            x_train_batch = np.array(pad_sequences(x_train[start:end], padding='post', value=pad_index))
            t_train_batch = np.array(t_train[start:end])[:, None]

            _, train_cost = sess.run([train, cost], feed_dict={x: x_train_batch, t: t_train_batch})
            train_costs.append(train_cost)
        
        # Valid
        valid_costs = []
        y_pred = []
        for i in range(n_batches_valid):
            start = i * batch_size
            end = start + batch_size
            
            x_valid_pad = np.array(pad_sequences(x_valid[start:end], padding='post', value=pad_index))
            t_valid_pad = np.array(t_valid[start:end])[:, None]
            
            pred, valid_cost = sess.run([test, cost], feed_dict={x: x_valid_pad, t: t_valid_pad})
            y_pred += pred.flatten().tolist()
            valid_costs.append(valid_cost)
        print('EPOCH: {}, Training Cost: {:.3f}, Validation Cost: {:.3f}, Validation F1: {:.3f}'.format(epoch+1, np.mean(train_costs), np.mean(valid_costs), f1_score(t_valid, y_pred, average='macro')))
    
    # Test
    test_costs = []
    test_y_pred = []
    for i in range(n_batches_test):
        start = i * batch_size
        end = start + batch_size
        x_test_pad = np.array(pad_sequences(x_test[start:end], padding = 'post', value = pad_index))
        _pred = sess.run(test, feed_dict = {x:x_test_pad})
        test_y_pred += _pred.flatten().tolist()
        
    ### 出力 ###
    submission = pd.Series(test_y_pred, name='label')
    submission.to_csv('/root/userspace/chap08/submission/submission_pred.csv', header=True, index_label='id')

Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
EPOCH: 1, Training Cost: 0.529, Validation Cost: 0.555, Validation F1: 0.709
EPOCH: 2, Training Cost: 0.452, Validation Cost: 0.647, Validation F1: 0.600
EPOCH: 3, Training Cost: 0.470, Validation Cost: 0.451, Validation F1: 0.810
EPOCH: 4, Training Cost: 0.291, Validation Cost: 0.467, Validation F1: 0.799
EPOCH: 5, Training Cost: 0.340, Validation Cost: 0.585, Validation F1: 0.716


NameError: name 'pd' is not defined

In [10]:
# print(t_valid.shape)
type(x_train[1])

list

In [21]:
# %%writefile /root/userspace/chap08/submission/submission_code_LSTM.py
np.random.seed(34)
import math
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 関数定義 ###
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))

### レイヤー定義 ###
class Embedding:
    def __init__(self, vocab_size, emb_dim, scale = 0.08):
        self.V = tf.Variable(tf.random_normal(shape = [vocab_size, emb_dim], stddev = scale), name = "V")
        
    def __call__(self, x):
        return tf.nn.embedding_lookup(self.V, x)
    
class RNN:
    def __init__(self, hid_dim, seq_len = None, initial_state = None):
        self.cell = tf.nn.rnn_cell.BasicRNNCell(hid_dim)
        self.initial_state = initial_state
        self.seq_len = seq_len
    
    def __call__(self, x):
        if self.initial_state is None:
            self.initial_state = self.cell.zero_state(tf.shape(x)[0], tf.float32)
            
        # outputsは各系列長分以降は0になるので注意
        outputs, state = tf.nn.dynamic_rnn(self.cell, x, self.seq_len, self.initial_state)
        return tf.gather_nd(outputs, indices = tf.stack([tf.range(tf.shape(x)[0]), self.seq_len-1], axis = 1 ))
    
### LSTMによるレイヤー定義 ###
class LSTM:
    def __init__(self, in_dim, hid_dim, seq_len = None, initial_state = None):
        self.in_dim = in_dim
        self.hid_dim = hid_dim

        glorot = tf.cast(tf.sqrt(6/(in_dim + hid_dim*2)), tf.float32)
        
        # 入力ゲート
        self.W_i = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_i')
        self.b_i  = tf.Variable(tf.zeros([hid_dim]), name='b_i')
        
        # 忘却ゲート
        self.W_f = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_f')
        self.b_f  = tf.Variable(tf.zeros([hid_dim]), name='b_f')

        # 出力ゲート
        self.W_o = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_o')
        self.b_o  = tf.Variable(tf.zeros([hid_dim]), name='b_o')

        # セル
        self.W_c = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_c')
        self.b_c  = tf.Variable(tf.zeros([hid_dim]), name='b_c')

        # マスク
        self.seq_len = seq_len
        
        self.initial_state = initial_state

    def __call__(self, x):
        # tf.scanへの適用関数fn
        # WRITE ME
        def fn(prev_state, x_and_m):
            x_t, m_t = x_and_m
            c_prev, h_prev = prev_state[0], prev_state[1]
            inputs = tf.concat([x_t, h_prev], axis = -1)
            
            # 入力ゲート
            i_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_i) + self.b_i)
            # 忘却ゲート
            f_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_f) + self.b_f)
            # 出力ゲート
            o_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_o) + self.b_o)
            
            # セル
            c_t = tf.multiply(f_t, c_prev) + tf.nn.tanh(tf.matmul(inputs, self.W_c) + self.b_c)
            # 隠れ層
            h_t = tf.multiply(o_t, tf.nn.tanh(c_t))
            
            # マスクの適用
            c_t = m_t*c_t + (1 - m_t)*c_prev
            h_t = m_t*h_t + (1 - m_t)*h_prev
            # 出力
            return tf.stack([c_t, h_t])

        # 入力の時間順化
        x_tmaj = tf.transpose(x, perm=[1, 0, 2])
        
        # マスクの生成＆時間順化
        mask = tf.cast(tf.sequence_mask(self.seq_len, tf.shape(x)[1]), tf.float32)
        mask_tmaj = tf.transpose(tf.expand_dims(mask, axis=-1), perm=[1, 0, 2])
        
        if self.initial_state is None:
            batch_size = tf.shape(x)[0]
            self.initial_state = tf.stack([tf.zeros([batch_size, self.hid_dim]), tf.zeros([batch_size, self.hid_dim])])

        state_seq = tf.scan(fn=fn, elems=[x_tmaj, mask_tmaj], initializer=self.initial_state)
        
        return state_seq[-1][1]

### グラフ構築 ###
tf.reset_default_graph()

emb_dim = 32
hid_dim = 100
# np.hstackで横方向に並べるので謎だが、要は
# x_train, x_test全てのなかで最もidが大きい単語は何かを抽出している?
num_words = max([max(s) for s in np.hstack((x_train, x_test))])
pad_index = 0

x = tf.placeholder(tf.int32, [None, None], name='x')
t = tf.placeholder(tf.float32, [None, None], name='t')

seq_len = tf.reduce_sum(tf.cast(tf.not_equal(x, pad_index), tf.int32), axis=1)

h = Embedding(num_words, emb_dim)(x)
# h = RNN(hid_dim, seq_len)(h)
h = LSTM(emb_dim, hid_dim, seq_len)(h)
y = tf.layers.Dense(1, tf.nn.sigmoid)(h)

cost = -tf.reduce_mean(t*tf_log(y) + (1 - t)*tf_log(1 - y))

# train = tf.train.AdamOptimizer().minimize(cost) を以下に置き換え
optimizer = tf.train.AdamOptimizer()
# grads = optimizer.compute_gradients(cost)
# clipped_grads = [(tf.clip_by_value(grad_val, -1., 1.), var) for grad_val, var in grads]
# train = optimizer.apply_gradients(clipped_grads)
train = optimizer.minimize(cost)

test = tf.round(y)

### データの準備 ###
top_words = 5000 # We only use most frequent 5000 words
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train)

### 頻度の高い単語のみを抽出 ###
def freq_words(x, top_words = 5000):
    n = len(x)
    for i in range(n):
        x[i] = [j for j in x[i] if j <= top_words]
    return x

### 抽出された軽いデータ ###
x_train_freq = freq_words(x_train)
x_valid_freq = freq_words(x_valid)
x_test_freq = freq_words(x_test)


### 学習 ###
n_epochs = 3
batch_size = 100 # バッチサイズが大きいと、ResourceExhaustedErrorになることがあります
n_batches_train = math.ceil(len(x_train) / batch_size)
n_batches_valid = math.ceil(len(x_valid) / batch_size)
n_batches_test = math.ceil(len(x_test) / batch_size)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Train
        train_costs = []
        for i in range(n_batches_train):
            start = i * batch_size
            end = min(start + batch_size, len(x_train))
            
            # x_train_batch = np.array(pad_sequences(x_train[start:end], padding='post', value=pad_index))
            x_train_batch = np.array(pad_sequences(x_train_freq[start:end], padding='post', value=pad_index))
            t_train_batch = np.array(t_train[start:end])[:, None]

            _, train_cost = sess.run([train, cost], feed_dict={x: x_train_batch, t: t_train_batch})
            train_costs.append(train_cost)
        
        # Valid
        valid_costs = []
        y_pred = []
        for i in range(n_batches_valid):
            start = i * batch_size
            end = min(start + batch_size, len(x_valid))
            # x_valid_pad = np.array(pad_sequences(x_valid[start:end], padding='post', value=pad_index))
            x_valid_pad = np.array(pad_sequences(x_valid_freq[start:end], padding='post', value=pad_index))
            t_valid_pad = np.array(t_valid[start:end])[:, None]
            
            pred, valid_cost = sess.run([test, cost], feed_dict={x: x_valid_pad, t: t_valid_pad})
            y_pred += pred.flatten().tolist()
            valid_costs.append(valid_cost)
        print('EPOCH: {}, Training Cost: {:.3f}, Validation Cost: {:.3f}, Validation F1: {:.3f}'.format(epoch+1, np.mean(train_costs), np.mean(valid_costs), f1_score(t_valid, y_pred, average='macro')))
        if (f1_score(t_valid, y_pred, average='macro')>0.85):
            break
        
    
    # Test
    test_costs = []
    test_y_pred = []
    for i in range(n_batches_test):
        start = i * batch_size
        end = start + batch_size
        # x_test_pad = np.array(pad_sequences(x_test[start:end], padding = 'post', value = pad_index))
        x_test_pad = np.array(pad_sequences(x_test_freq[start:end], padding = 'post', value = pad_index))
        _pred = sess.run(test, feed_dict = {x:x_test_pad})
        test_y_pred += _pred.flatten().tolist()
        
    ### 出力 ###
    submission = pd.Series(test_y_pred, name='label')
    submission.to_csv('/root/userspace/chap08/submission/submission_pred_LSTM_freq.csv', header=True, index_label='id')



KeyboardInterrupt: 

In [15]:
# %%writefile /root/userspace/chap08/submission/submission_code_LSTM.py
import math
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences

### 関数定義 ###
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))

### レイヤー定義 ###
class Embedding:
    def __init__(self, vocab_size, emb_dim, scale = 0.08):
        self.V = tf.Variable(tf.random_normal(shape = [vocab_size, emb_dim], stddev = scale), name = "V")
        
    def __call__(self, x):
        return tf.nn.embedding_lookup(self.V, x)
    
class RNN:
    def __init__(self, hid_dim, seq_len = None, initial_state = None):
        self.cell = tf.nn.rnn_cell.BasicRNNCell(hid_dim)
        self.initial_state = initial_state
        self.seq_len = seq_len
    
    def __call__(self, x):
        if self.initial_state is None:
            self.initial_state = self.cell.zero_state(tf.shape(x)[0], tf.float32)
            
        # outputsは各系列長分以降は0になるので注意
        outputs, state = tf.nn.dynamic_rnn(self.cell, x, self.seq_len, self.initial_state)
        return tf.gather_nd(outputs, indices = tf.stack([tf.range(tf.shape(x)[0]), self.seq_len-1], axis = 1 ))
    
### LSTMによるレイヤー定義 ###
class LSTM:
    def __init__(self, in_dim, hid_dim, seq_len = None, initial_state = None):
        self.in_dim = in_dim
        self.hid_dim = hid_dim

        glorot = tf.cast(tf.sqrt(6/(in_dim + hid_dim*2)), tf.float32)
        
        # 入力ゲート
        self.W_i = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_i')
        self.b_i  = tf.Variable(tf.zeros([hid_dim]), name='b_i')
        
        # 忘却ゲート
        self.W_f = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_f')
        self.b_f  = tf.Variable(tf.zeros([hid_dim]), name='b_f')

        # 出力ゲート
        self.W_o = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_o')
        self.b_o  = tf.Variable(tf.zeros([hid_dim]), name='b_o')

        # セル
        self.W_c = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_c')
        self.b_c  = tf.Variable(tf.zeros([hid_dim]), name='b_c')

        # マスク
        self.seq_len = seq_len
        
        self.initial_state = initial_state

    def __call__(self, x):
        # tf.scanへの適用関数fn
        # WRITE ME
        def fn(prev_state, x_and_m):
            x_t, m_t = x_and_m
            c_prev, h_prev = prev_state[0], prev_state[1]
            inputs = tf.concat([x_t, h_prev], axis = -1)
            
            # 入力ゲート
            i_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_i) + self.b_i)
            # 忘却ゲート
            f_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_f) + self.b_f)
            # 出力ゲート
            o_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_o) + self.b_o)
            
            # セル
            c_t = tf.multiply(f_t, c_prev) + tf.nn.tanh(tf.matmul(inputs, self.W_c) + self.b_c)
            # 隠れ層
            h_t = tf.multiply(o_t, tf.nn.tanh(c_t))
            
            # マスクの適用
            c_t = m_t*c_t + (1 - m_t)*c_prev
            h_t = m_t*h_t + (1 - m_t)*h_prev
            # 出力
            return tf.stack([c_t, h_t])

        # 入力の時間順化
        x_tmaj = tf.transpose(x, perm=[1, 0, 2])
        
        # マスクの生成＆時間順化
        mask = tf.cast(tf.sequence_mask(self.seq_len, tf.shape(x)[1]), tf.float32)
        mask_tmaj = tf.transpose(tf.expand_dims(mask, axis=-1), perm=[1, 0, 2])
        
        if self.initial_state is None:
            batch_size = tf.shape(x)[0]
            self.initial_state = tf.stack([tf.zeros([batch_size, self.hid_dim]), tf.zeros([batch_size, self.hid_dim])])

        state_seq = tf.scan(fn=fn, elems=[x_tmaj, mask_tmaj], initializer=self.initial_state)
        
        return state_seq[-1][1]

### 畳み込み層定義 ###
class Conv1D:
    def __init__(self, filter_shape, function = lambda x: x, strides = 1, padding = 'VALID'):
        # He initializationを使う
        # filter_shape = Height * Width * Num of input_channels * Num of output_channels
        fun_in = np.prod(filter_shape[:3])
        fun_out = np.prod(filter_shape[:2]) * filter_shape[3]
        self.W = tf.Variable(rng.uniform(
                low = -np.sqrt(6/ fun_in),
                high = np.sqrt(6/ fun_out),
                size = filter_shape
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((filter_shape[2]), dtype = 'float32'), name = 'b')
        self.function = function
        self.strides = strides
        self.padding = padding
    
    def __call__(self, x):
        u = tf.nn.conv1d(x, self.W, strides = self.strides, padding = self.padding) + self.b
        return self.function(u)

### グラフ構築 ###
tf.reset_default_graph()

emb_dim = 32
hid_dim = 100
# np.hstackで横方向に並べるので謎だが、要は
# x_train, x_test全てのなかで最もidが大きい単語は何かを抽出している?
num_words = max([max(s) for s in np.hstack((x_train, x_test))])
pad_index = 0

x = tf.placeholder(tf.int32, [None, None], name='x')
t = tf.placeholder(tf.float32, [None, None], name='t')

seq_len = tf.reduce_sum(tf.cast(tf.not_equal(x, pad_index), tf.int32), axis=1)

h = Embedding(num_words, emb_dim)(x)
# h = Conv1D(filter_shape = [3,1,1], padding = "SAME")(h)
h = tf.layers.Conv1D(filters = 32, kernel_size = 3, strides = 1, padding = "SAME")(h)
h = tf.layers.MaxPooling1D(pool_size = 2, strides = 2)(h)
h = LSTM(emb_dim, hid_dim, seq_len)(h)


y = tf.layers.Dense(1, tf.nn.sigmoid)(h)

cost = -tf.reduce_mean(t*tf_log(y) + (1 - t)*tf_log(1 - y))

# train = tf.train.AdamOptimizer().minimize(cost) を以下に置き換え
optimizer = tf.train.AdamOptimizer()
# grads = optimizer.compute_gradients(cost)
# clipped_grads = [(tf.clip_by_value(grad_val, -1., 1.), var) for grad_val, var in grads]
# train = optimizer.apply_gradients(clipped_grads)
train = optimizer.minimize(cost)

test = tf.round(y)

### データの準備 ###
top_words = 5000 # We only use most frequent 5000 words
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train)

### 頻度の高い単語のみを抽出 ###
def freq_words(x, top_words = 5000):
    n = len(x)
    for i in range(n):
        x[i] = [j for j in x[i] if j <= top_words]
    return x

### 抽出された軽いデータ ###
x_train_freq = freq_words(x_train)
x_valid_freq = freq_words(x_valid)
x_test_freq = freq_words(x_test)


### 学習 ###
n_epochs = 5
batch_size = 100 # バッチサイズが大きいと、ResourceExhaustedErrorになることがあります
n_batches_train = math.ceil(len(x_train) / batch_size)
n_batches_valid = math.ceil(len(x_valid) / batch_size)
n_batches_test = math.ceil(len(x_test) / batch_size)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Train
        train_costs = []
        for i in range(n_batches_train):
            start = i * batch_size
            end = min(start + batch_size, len(x_train))
            
            # x_train_batch = np.array(pad_sequences(x_train[start:end], padding='post', value=pad_index))
            x_train_batch = np.array(pad_sequences(x_train_freq[start:end], padding='post', value=pad_index))
            t_train_batch = np.array(t_train[start:end])[:, None]

            _, train_cost = sess.run([train, cost], feed_dict={x: x_train_batch, t: t_train_batch})
            train_costs.append(train_cost)
        
        # Valid
        valid_costs = []
        y_pred = []
        for i in range(n_batches_valid):
            start = i * batch_size
            end = min(start + batch_size, len(x_valid))
            # x_valid_pad = np.array(pad_sequences(x_valid[start:end], padding='post', value=pad_index))
            x_valid_pad = np.array(pad_sequences(x_valid_freq[start:end], padding='post', value=pad_index))
            t_valid_pad = np.array(t_valid[start:end])[:, None]
            
            pred, valid_cost = sess.run([test, cost], feed_dict={x: x_valid_pad, t: t_valid_pad})
            y_pred += pred.flatten().tolist()
            valid_costs.append(valid_cost)
        print('EPOCH: {}, Training Cost: {:.3f}, Validation Cost: {:.3f}, Validation F1: {:.3f}'.format(epoch+1, np.mean(train_costs), np.mean(valid_costs), f1_score(t_valid, y_pred, average='macro')))
        if (f1_score(t_valid, y_pred, average='macro')>0.9):
            break
        
    
    # Test
    test_costs = []
    test_y_pred = []
    for i in range(n_batches_test):
        start = i * batch_size
        end = start + batch_size
        # x_test_pad = np.array(pad_sequences(x_test[start:end], padding = 'post', value = pad_index))
        x_test_pad = np.array(pad_sequences(x_test_freq[start:end], padding = 'post', value = pad_index))
        _pred = sess.run(test, feed_dict = {x:x_test_pad})
        test_y_pred += _pred.flatten().tolist()
        
    ### 出力 ###
    submission = pd.Series(test_y_pred, name='label')
    submission.to_csv('/root/userspace/chap08/submission/submission_pred_LSTM_freq_Conv.csv', header=True, index_label='id')



EPOCH: 1, Training Cost: 0.693, Validation Cost: 0.693, Validation F1: 0.336
EPOCH: 2, Training Cost: 0.693, Validation Cost: 0.693, Validation F1: 0.350
EPOCH: 3, Training Cost: 0.693, Validation Cost: 0.693, Validation F1: 0.340


KeyboardInterrupt: 

In [29]:
# %%writefile /root/userspace/chap08/submission/submission_code_Keras_conv.py

# LSTM for sequence classification in the IMDB dataset
import numpy
import tensorflow as tf
import math
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

# load the dataset but only keep the top n words, zero the rest
### データの準備 ###
top_words = 5000 # We only use most frequent 5000 words
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train)

### 頻度の高い単語のみを抽出 ###
def freq_words(x, top_words = 5000):
    n = len(x)
    for i in range(n):
        x[i] = [j for j in x[i] if j <= top_words]
    return x

### 抽出された軽いデータ ###
x_train_freq = freq_words(x_train)
x_valid_freq = freq_words(x_valid)
x_test_freq = freq_words(x_test)

# truncate and pad input sequences
max_review_length = 500
x_train_pad = sequence.pad_sequences(x_train_freq, maxlen=max_review_length)
x_valid_pad = sequence.pad_sequences(x_valid_freq, maxlen=max_review_length)
x_test_pad = sequence.pad_sequences(x_test_freq, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train_pad, t_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(x_valid_pad, t_valid, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

y_pred = model.predict(x_test_pad)
y_pred = np.round(y_pred).reshape(-1,)
submission = pd.Series(y_pred, name='label')
submission.to_csv('/root/userspace/chap08/submission/submission_pred_LSTM_keras.csv', header=True, index_label='id')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
conv1d (Conv1D)              (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 250, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 216,405
Trainable params: 216,405
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 89.54%


Exception: Data must be 1-dimensional

In [37]:
# y_pred_round = round(y_pred)
# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/chap08/submission/submission_pred_LSTM_freq_Conv.csv', header=True, index_label='id')
print(y_pred[0:9])

[[0.2649611 ]
 [0.05151569]
 [0.01493361]
 [0.98068684]
 [0.98333436]
 [0.0089713 ]
 [0.9959    ]
 [0.9520119 ]
 [0.02216182]]


In [34]:
y_pred_round = np.round(y_pred).reshape(-1,)
submission = pd.Series(y_pred_round, name = 'label')
submission.to_csv('/root/userspace/chap08/submission/submission_pred_LSTM_keras.csv', header=True, index_label='id')

In [35]:
print(y_pred_round[1:10])

[0. 0. 1. 1. 0. 1. 1. 0. 1.]


In [36]:
print(submission)

0       0.0
1       0.0
2       0.0
3       1.0
4       1.0
5       0.0
6       1.0
7       1.0
8       0.0
9       1.0
10      1.0
11      0.0
12      0.0
13      1.0
14      0.0
15      1.0
16      1.0
17      0.0
18      1.0
19      1.0
20      1.0
21      1.0
22      1.0
23      0.0
24      1.0
25      1.0
26      1.0
27      1.0
28      1.0
29      0.0
       ... 
9970    0.0
9971    0.0
9972    0.0
9973    1.0
9974    0.0
9975    1.0
9976    0.0
9977    0.0
9978    0.0
9979    0.0
9980    0.0
9981    0.0
9982    1.0
9983    0.0
9984    1.0
9985    1.0
9986    1.0
9987    0.0
9988    0.0
9989    0.0
9990    0.0
9991    0.0
9992    1.0
9993    0.0
9994    1.0
9995    0.0
9996    1.0
9997    0.0
9998    1.0
9999    1.0
Name: label, Length: 10000, dtype: float32
