In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
#データの読み込み
with open('pg2265.txt', 'r', encoding='utf-8') as f: 
    text=f.read()

text = text[15858:]
#テキストデータで使われている文字を抽出
chars = set(text)
#文字を整数に変更
char2int = {ch:i for i,ch in enumerate(chars)}
#整数を文字に変更
int2char = dict(enumerate(chars))

#char2intを元にテキストデータを整数に変更
text_ints = np.array([char2int[ch] for ch in text], 
                     dtype=np.int32)

In [5]:
def reshape_data(sequence, batch_size, num_steps):
    mini_batch_length = batch_size * num_steps
    num_batches = int(len(sequence) / mini_batch_length)
    if num_batches*mini_batch_length + 1 > len(sequence):
        num_batches = num_batches - 1
    
    #シーケンスの最後の部分から完全なバッチにならない半端な文字を削除
    x = sequence[0 : num_batches*mini_batch_length]
    y = sequence[1 : num_batches*mini_batch_length + 1]
    #xとyをシーケンスのバッチのリストに分割
    x_batch_splits = np.split(x, batch_size)
    y_batch_splits = np.split(y, batch_size)
    #それらのバッチを結合
    x = np.stack(x_batch_splits)
    y = np.stack(y_batch_splits)
    
    return x, y


train_x, train_y = reshape_data(text_ints, 64, 10)
print(train_x.shape)
print(train_y.shape)
print(train_x[0, :10])
print(train_y[0, :10])
print(''.join(int2char[i] for i in train_x[0, :50]))

(64, 2540)
(64, 2540)
[ 1 27 18 47  1 52 15 36 18 30]
[27 18 47  1 52 15 36 18 30 61]
The Tragedie of Hamlet

Actus Primus. Scoena Prima


In [8]:
np.random.seed(123)
#ミニバッチを順番に処理をしていくためのコード
def create_batch_generator(data_x, data_y, num_steps):
    batch_size, tot_batch_length = data_x.shape    
    num_batches = int(tot_batch_length/num_steps)
    for b in range(num_batches):
        yield (data_x[:, b*num_steps: (b+1)*num_steps], 
               data_y[:, b*num_steps: (b+1)*num_steps])
        
bgen = create_batch_generator(train_x[:,:100], train_y[:,:100], 15)
for b in bgen:
    print(b[0].shape, b[1].shape, end='  ')
    print(''.join(int2char[i] for i in b[0][0,:]).replace('\n', '*'), '    ',
          ''.join(int2char[i] for i in b[1][0,:]).replace('\n', '*'))

(64, 15) (64, 15)  The Tragedie of      he Tragedie of 
(64, 15) (64, 15)   Hamlet**Actus       Hamlet**Actus P
(64, 15) (64, 15)  Primus. Scoena       rimus. Scoena P
(64, 15) (64, 15)  Prima.**Enter B      rima.**Enter Ba
(64, 15) (64, 15)  arnardo and Fra      rnardo and Fran
(64, 15) (64, 15)  ncisco two Cent      cisco two Centi


# ・コンストラクタ
   ##  学習パラメータを設定し、計算グラフを作成する。さらに、トレーニングモードとサンプリングモードに基づいて計算グラフを作成する
# ・buildメソッド
   ##    データを供給するためのプレースホルダを定義し、LSTMセルを使ってRNNを作成する。
# ・trainメソッド
   ##   ミニバッチを順番に処理しながら、指定された数のエポックでRNNのトレーニングを行う。
# ・sampleメソッド
   ##   与えられた文字列を元に文字列を作成  

In [22]:
import tensorflow as tf
import os

class CharRNN(object):
    #コンストラクタ
    def __init__(self, num_classes, batch_size=64, 
                 num_steps=100, lstm_size=128, 
                 num_layers=1, learning_rate=0.001, 
                 keep_prob=0.5, grad_clip=5, 
                 sampling=False):
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)

            self.build(sampling=sampling)
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
     
    #buildメソッド
    def build(self, sampling):
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size = self.batch_size
            num_steps = self.num_steps

        tf_x = tf.placeholder(tf.int32, 
                              shape=[batch_size, num_steps], 
                              name='tf_x')
        tf_y = tf.placeholder(tf.int32, 
                              shape=[batch_size, num_steps], 
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, 
                              name='tf_keepprob')

        # One-hot エンコーディングを適用
        x_onehot = tf.one_hot(tf_x, depth=self.num_classes)
        y_onehot = tf.one_hot(tf_y, depth=self.num_classes)

        #多層RNNのセルを構築
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size), 
                output_keep_prob=tf_keepprob) 
            for _ in range(self.num_layers)])
        
        #初期状態を定義
        self.initial_state = cells.zero_state(
                    batch_size, tf.float32)

        #RNNで各シーケンスステップを実行
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                    cells, x_onehot, 
                    initial_state=self.initial_state)
        
        print('  << lstm_outputs  >>', lstm_outputs)
　　　　#２次元テンソルに変形　
        seq_output_reshaped = tf.reshape(
                    lstm_outputs, 
                    shape=[-1, self.lstm_size],
                    name='seq_output_reshaped')
　　　　#総入力を取得　
        logits = tf.layers.dense(
                    inputs=seq_output_reshaped, 
                    units=self.num_classes,
                    activation=None,
                    name='logits')
　　　　#次の文字バッチの確率を計算
        proba = tf.nn.softmax(
                    logits, 
                    name='probabilities')
        print(proba)
　　　　#コスト関数を定義　
        y_reshaped = tf.reshape(
                    y_onehot, 
                    shape=[-1, self.num_classes],
                    name='y_reshaped')
        cost = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, 
                        labels=y_reshaped),
                    name='cost')

        # 勾配発散問題を回避するための勾配刈り込み
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
                    tf.gradients(cost, tvars), 
                    self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.apply_gradients(
                    zip(grads, tvars),
                    name='train_op')
    #trainメソッド    
    def train(self, train_x, train_y, 
              num_epochs, ckpt_dir='./model/'):
        #チェックポイントディレクトリがまだ存在しない場合は作成
        if not os.path.exists(ckpt_dir):
            os.mkdir(ckpt_dir)
            
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)

            n_batches = int(train_x.shape[1]/self.num_steps)
            iterations = n_batches * num_epochs
            for epoch in range(num_epochs):

                # ネットワークをトレーニング
                new_state = sess.run(self.initial_state)
                loss = 0
                #ミニバッチジェネレータ
                bgen = create_batch_generator(
                        train_x, train_y, self.num_steps)
                for b, (batch_x, batch_y) in enumerate(bgen, 1):
                    iteration = epoch*n_batches + b
                    
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': self.keep_prob,
                            self.initial_state : new_state}
                    batch_cost, _, new_state = sess.run(
                            ['cost:0', 'train_op', 
                                self.final_state],
                            feed_dict=feed)
                    if iteration % 10 == 0:
                        print('Epoch %d/%d Iteration %d'
                              '| Training loss: %.4f' % (
                              epoch + 1, num_epochs, 
                              iteration, batch_cost))

                #トレーニングモデルを保存
                self.saver.save(
                        sess, os.path.join(
                            ckpt_dir, 'language_modeling.ckpt'))
                              
                              
    #　sampleメソッド            
    def sample(self, output_length, 
               ckpt_dir, starter_seq="The "):
        observed_seq = [ch for ch in starter_seq]        
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(
                sess, 
                tf.train.latest_checkpoint(ckpt_dir))
            #　starter_seqを使ってモデルを実行
            new_state = sess.run(self.initial_state)
            for ch in starter_seq:
                x = np.zeros((1, 1))
                x[0,0] = char2int[ch]
                feed = {'tf_x:0': x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: new_state}
                proba, new_state = sess.run(
                        ['probabilities:0', self.final_state], 
                        feed_dict=feed)

            ch_id = get_top_char(proba, len(chars))
            observed_seq.append(int2char[ch_id])
            
            #　更新されたobserved_seqを使ってモデルを実行
            for i in range(output_length):
                x[0,0] = ch_id
                feed = {'tf_x:0': x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: new_state}
                proba, new_state = sess.run(
                        ['probabilities:0', self.final_state], 
                        feed_dict=feed)

                ch_id = get_top_char(proba, len(chars))
                observed_seq.append(int2char[ch_id])

        return ''.join(observed_seq)

In [23]:
def get_top_char(probas, char_size, top_n=5):
    p = np.squeeze(probas)
    p[np.argsort(p)[:-top_n]] = 0.0
    p = p / np.sum(p)
    ch_id = np.random.choice(char_size, 1, p=p)[0]
    return ch_id

## CharRNNモデルの作成とトレーニング

In [24]:
batch_size = 64
num_steps = 100 
train_x, train_y = reshape_data(text_ints, 
                                batch_size, 
                                num_steps)

rnn = CharRNN(num_classes=len(chars), batch_size=batch_size)
rnn.train(train_x, train_y, 
          num_epochs=100,
          ckpt_dir='./model-100/')

AttributeError: module 'tensorflow' has no attribute 'contrib'

## サンプリングモードのCharRNNモデル

In [16]:
del rnn

np.random.seed(123)
rnn = CharRNN(len(chars), sampling=True)
print(rnn.sample(ckpt_dir='./model-200/', 
                 output_length=500))

  << lstm_outputs  >> Tensor("rnn/transpose_1:0", shape=(1, 1, 128), dtype=float32)
Tensor("probabilities:0", shape=(1, 65), dtype=float32)
INFO:tensorflow:Restoring parameters from ./model-200/language_modeling.ckpt
The dast,
With a with a peote sintist and theng theres
To be to me to that theng in a mattes ast the

   Ham. Where he somier doue as these make seaker
To the Pattangonst they out his wilce,
To sere the Songut is wordd bode arate

   Ham. Houst

   Hor. The serull hus shall broone on him

   Ophe. Woo then  of the bost to hounds, we tound and thend it,
Wish whore thought to that that that shinge and Madestiess

   Ham. What meane since the mathirg alleste in that, wall not
Whole warks and aruale, and
