In [21]:
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Root of the extracted IMDB dataset. Reviews are read from
# <imdb_dir>/train/neg and <imdb_dir>/train/pos, one review per .txt file.
imdb_dir = './datas/imdb/'

train_dir = os.path.join(imdb_dir, 'train')
labels = []  # 0 for a review from 'neg', 1 for a review from 'pos' (parallel to `texts`)
texts = []   # raw review text, one entry per file

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    print(dir_name)
    # os.listdir() returns entries in arbitrary, filesystem-dependent order;
    # sorting makes the sample order identical on every run and machine.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

# Sanity check: show the first sample of each class.
print(texts[0])
print(labels[0])
# 'neg' files are loaded before 'pos' files, so the first label equal to 1 marks
# where the positive reviews start. Looking it up (instead of hard-coding 12500)
# keeps the check correct when the number of negative files differs.
first_pos = labels.index(1)
print(texts[first_pos])
print(labels[first_pos])

./datas/imdb/train/neg
./datas/imdb/train/pos
Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.
0
For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.
1


In [3]:
# Preprocessing settings (the names below are reused by later cells).
max_words = 10000            # passed to Tokenizer(num_words=...) to cap the vocabulary
maxlen = 200                 # pad_sequences pads/truncates every review to this many tokens
training_samples = 10000     # number of reviews sliced off for training
validations_samples = 10000  # number of reviews sliced off for validation

# Build the vocabulary from the raw reviews and report its full size.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each review as a list of word indices.
sequences = tokenizer.texts_to_sequences(texts)

# Convert labels to an array and bring all sequences to a common length.
labels = np.asarray(labels)
data = pad_sequences(sequences, maxlen=maxlen)
print('Size of Data Tensor:', data.shape)
print('size of Label Tensor:', labels.shape)

Found 88582 unique tokens.
Size of Data Tensor: (25000, 200)
size of Label Tensor: (25000,)


In [20]:
# Shuffle samples and labels with the SAME permutation so they stay aligned.
# A fixed seed makes the train/validation split reproducible from a fresh kernel.
# NOTE: `data`/`labels` are overwritten, so re-running only this cell permutes
# the already-shuffled arrays again; re-run the preprocessing cell first.
indices = np.random.default_rng(seed=42).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# First `training_samples` rows for training, the next `validations_samples`
# rows for validation; any remaining rows are left unused here.
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validations_samples]
y_val = labels[training_samples: training_samples + validations_samples]

# Two stacked bidirectional LSTMs on top of a learned 64-dim word embedding,
# ending in a single sigmoid unit for the binary (0/1) label.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only triggers a warning), so it is
# omitted; the sequence length is taken from the data when the model is fitted.
model.add(Embedding(max_words, 64))
# return_sequences=True so the next recurrent layer receives the full sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Both callbacks track validation loss: one stops training after 5 epochs
# without improvement (restoring the best weights), the other writes the best
# model seen so far to disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('./model/imdb/best_model.keras', save_best_only=True, monitor='val_loss')

history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 345ms/step - acc: 0.5874 - loss: 0.6599 - val_acc: 0.7573 - val_loss: 0.5259
Epoch 2/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 343ms/step - acc: 0.8244 - loss: 0.4282 - val_acc: 0.7279 - val_loss: 0.5300
Epoch 3/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 360ms/step - acc: 0.8619 - loss: 0.3510 - val_acc: 0.8194 - val_loss: 0.4076
Epoch 4/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 371ms/step - acc: 0.9136 - loss: 0.2411 - val_acc: 0.7695 - val_loss: 0.4910
Epoch 5/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 348ms/step - acc: 0.8975 - loss: 0.2625 - val_acc: 0.8115 - val_loss: 0.4779
Epoch 6/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 361ms/step - acc: 0.9357 - loss: 0.1825 - val_acc: 0.8134 - val_loss: 0.4690
Epoch 7/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [30]:
# Hand-written sample reviews used to spot-check the trained model.
test_texts = [  # The sentences below have been posted to the lecture channel.
    "I hated this movie. It was terrible and the acting was horrible.",
    "This was the worst film I have ever seen. Not worth the time.",
    "I loved this movie. It was fantastic and the acting was great.",
    "This was the best film I have seen in a long time. Totally worth it.",
    "I had high hopes for this movie, but it was a complete letdown. The plot made no sense and the characters were flat.",
    "This film was a disaster from start to finish. The dialogue was awkward, and the pacing was painfully slow.",
    "What an amazing film! The plot was deeply engaging, and the cinematography was stunning from beginning to end.",
    "I was thoroughly impressed by this film. The direction, the music, and the performances all came together perfectly."
]

# Encode with the same tokenizer and pad to the same length (`maxlen`) as the
# training data so the inputs match what the model was trained on.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=maxlen)

# Load the checkpoint file written during training.
best_model = load_model('./model/imdb/best_model.keras', custom_objects=None, compile=True)
predictions = best_model.predict(test_data)

# `predictions` holds one row per text with the model's single sigmoid output in
# column 0. Training labels were 0 = negative, 1 = positive, so that value is
# the predicted probability of a positive review -- not an accuracy -- and is
# labelled accordingly. Scalars are compared instead of 1-element arrays.
for test_text, score in zip(test_texts, predictions[:, 0]):
    print(f"Text: {test_text[:40]}...")
    print(f"Prediction: {'Positive' if score > 0.5 else 'Negative'} (P(positive): {score:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588ms/step
Text: I hated this movie. It was terrible and ...
Prediction: Negative (Acc: 0.1136)
Text: This was the worst film I have ever seen...
Prediction: Positive (Acc: 0.8603)
Text: I loved this movie. It was fantastic and...
Prediction: Positive (Acc: 0.9566)
Text: This was the best film I have seen in a ...
Prediction: Positive (Acc: 0.9512)
Text: I had high hopes for this movie, but it ...
Prediction: Negative (Acc: 0.1199)
Text: This film was a disaster from start to f...
Prediction: Negative (Acc: 0.1145)
Text: What an amazing film! The plot was deepl...
Prediction: Positive (Acc: 0.9367)
Text: I was thoroughly impressed by this film....
Prediction: Positive (Acc: 0.9570)
