In [None]:
# Load the raw IMDb training reviews and build the matching label list
import os

# Directory that contains the IMDb dataset
imdb_dir = "./"

train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []

# Reviews under 'neg' get label 0, reviews under 'pos' get label 1.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        # The context manager closes the handle even if read() raises, and the
        # explicit encoding keeps decoding independent of the OS locale.
        # NOTE(review): assumes the review files are UTF-8 encoded - confirm.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())

        labels.append(0 if label_type == 'neg' else 1)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.prerocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
# Tokenize the IMDb review texts

max_len = 100               # every sequence is padded/truncated to this length
training_samples = 200      # number of samples used for training
validation_samples = 10000  # number of samples used for validation
max_words = 10000           # vocabulary size passed to the tokenizer

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # list of integer sequences, one per text

word_index = tokenizer.word_index  # dictionary mapping each word to its index
print('Found {} unique tokens.'.format(len(word_index)))

data = pad_sequences(sequences, maxlen=max_len)  # pad so that all sequences share one length

labels = np.asarray(labels)
print('Shape of data tensor: ', data.shape)
print('Shape of label tensor: ', labels.shape)

# Split the data into a training set and a validation set.
# The samples are ordered (all negative reviews first, then all positive ones),
# so shuffle them first; data and labels use the same permutation to stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
# Validation targets must come from `labels`, not from `data`.
y_val = labels[training_samples: training_samples + validation_samples]

In [None]:
# Parse the GloVe word-embedding file

# Directory that contains the GloVe embedding file
glove_dir = './'

# Maps each word (first field of a line) to its coefficient vector
# (remaining fields, parsed as float32).
embeddings_index = {}
# Explicit encoding keeps decoding independent of the OS locale; the context
# manager closes the file even if parsing a line raises.
# NOTE(review): assumes the GloVe file is UTF-8 encoded - confirm.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found {} word vector. '.format(len(embeddings_index)))

In [None]:
# Prepare the GloVe embedding matrix: row i receives the vector of the word
# whose tokenizer index is i.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for token, token_id in word_index.items():
    vector = embeddings_index.get(token)
    # Indices at or beyond max_words are ignored; words with no entry in
    # embeddings_index keep their all-zero row.
    if token_id < max_words and vector is not None:
        embedding_matrix[token_id] = vector

In [None]:
# Define the model: embedding -> flatten -> dense(32, relu) -> dense(1, sigmoid)

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Load the prepared word embeddings into the Embedding layer (the first layer)
model.layers[0].set_weights([embedding_matrix])
# Freeze the layer so the loaded weights are not updated during training.
model.layers[0].trainable = False

In [None]:
# Train the model and validate it after every epoch
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

# Persist the trained model to disk.
model.save('pre_trained_glove_model.h5')

In [None]:
# Inspect the results: plot training vs. validation accuracy and loss

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric:
# (name used in legend/title, y-axis label, training curve, validation curve)
curves = [
    ('acc', 'Acc', acc, val_acc),
    ('loss', 'Loss', loss, val_loss),
]

for position, (metric, y_label, train_curve, val_curve) in enumerate(curves):
    if position:
        # The first metric draws on the current figure; later ones open a new one.
        plt.figure()
    # 'bo' draws blue dots, 'b' draws a solid blue line
    plt.plot(epochs, train_curve, 'bo', label='Training ' + metric)
    plt.plot(epochs, val_curve, 'b', label='Validation ' + metric)
    plt.title('Training and validation ' + metric)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Train the same architecture without the pre-trained word embeddings
# (the Embedding layer keeps its default initialization and stays trainable).
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['acc'])

history = model.fit(x_train, y_train,
                   epochs=10,
                   batch_size=32,
                   validation_data=(x_val, y_val))

In [None]:
# Inspect the results: plot training vs. validation accuracy and loss

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric:
# (name used in legend/title, y-axis label, training curve, validation curve)
curves = [
    ('acc', 'Acc', acc, val_acc),
    ('loss', 'Loss', loss, val_loss),
]

for position, (metric, y_label, train_curve, val_curve) in enumerate(curves):
    if position:
        # The first metric draws on the current figure; later ones open a new one.
        plt.figure()
    # 'bo' draws blue dots, 'b' draws a solid blue line
    plt.plot(epochs, train_curve, 'bo', label='Training ' + metric)
    plt.plot(epochs, val_curve, 'b', label='Validation ' + metric)
    plt.title('Training and validation ' + metric)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Tokenize the test data

test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

# Reviews under 'neg' get label 0, reviews under 'pos' get label 1.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in os.listdir(dir_name):
        # The context manager closes the handle even if read() raises, and the
        # explicit encoding keeps decoding independent of the OS locale.
        # NOTE(review): assumes the review files are UTF-8 encoded - confirm.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())

        labels.append(0 if label_type == 'neg' else 1)

# Reuse the tokenizer fitted on the training texts so word indices match.
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=max_len)
y_test = np.asarray(labels)

In [None]:
# Evaluate the model on the test set, using the weights stored in the saved file
saved_model_path = 'pre_trained_glove_model.h5'
model.load_weights(saved_model_path)
model.evaluate(x=x_test, y=y_test)