In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU,LSTM, Input, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import sentence_bleu

In [3]:
''' Reading Data '''
df = pd.read_csv("/content/wiki_movie_plots_deduped - wiki_movie_plots_deduped.csv")
df = df[(df['Origin/Ethnicity'] == 'American') | (df['Origin/Ethnicity'] == 'Chinese') |
        (df['Origin/Ethnicity'] == 'British') | (df['Origin/Ethnicity'] == 'Japanese') |
        (df['Origin/Ethnicity'] == 'Bollywood')]
df_txt = df.loc[:len(df)//2, :]

''' Tokenization & Padding '''
token = Tokenizer()
token.fit_on_texts(df_txt['Plot'])
seq = token.texts_to_sequences(df_txt['Plot'])
max_len = max([len(x) for x in seq])
pad = pad_sequences(seq, maxlen=max_len, padding='post')

token2 = Tokenizer()
token2.fit_on_texts(df_txt['Title'])
seq2 = token2.texts_to_sequences(df_txt['Title'])
max_len_2 = max([len(y) for y in seq2])
pad2 = pad_sequences(seq2, maxlen=max_len_2, padding='post')

voc_size = len(token.word_index) + 1
voc_size2 = len(token2.word_index) + 1

''' Clear session and Model Definition '''
K.clear_session()

latent_dim = 128  # Dimension of hidden state
embedding_dim = 100  # Coba 50, 100, atau 200 untuk dimensi embedding

# Encoder with 3 layers of GRU
encoder_inputs = Input(shape=(pad.shape[1],))
encoder_embedding = Embedding(voc_size, embedding_dim)(encoder_inputs)
encoder_gru1 = GRU(latent_dim, return_state=True)
encoder_outputs1, state_h1 = encoder_gru1(encoder_embedding)

encoder_gru2 = GRU(latent_dim, return_state=True)
encoder_outputs2, state_h2 = encoder_gru2(encoder_outputs1)

encoder_gru3 = GRU(latent_dim, return_state=True)
encoder_outputs3, state_h3 = encoder_gru3(encoder_outputs2)

encoder_states = [state_h3]

# Decoder
decoder_inputs = Input(shape=(pad2.shape[1],))
decoder_embedding = Embedding(voc_size2, embedding_dim)(decoder_inputs)
decoder_gru = GRU(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_states)

decoder_dense = Dense(voc_size2, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

''' Compile Model '''
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

''' EarlyStopping Callback '''
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

''' Training Model '''
history = model.fit([pad, pad2], pad2.reshape(pad2.shape[0], pad2.shape[1], 1), epochs=10, batch_size=16,
                    callbacks=[es], validation_split=0.2)

''' Plot Training and Validation Loss and Accuracy '''
plt.figure(figsize=(12, 5))

# Plot Loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')

# Plot Accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')

plt.tight_layout()
plt.show()

''' Evaluasi Model '''
val_loss, val_accuracy = model.evaluate([pad, pad2[::-1]], pad2.reshape(pad2.shape[0], pad2.shape[1], 1), verbose=0)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

''' Calculate BLEU Score '''
def calculate_bleu(model, x_val, y_val):
    bleu_scores = []
    for i in range(len(x_val)):
        predicted_seq = model.predict(x_val[i:i+1])
        predicted_seq = np.argmax(predicted_seq, axis=-1)
        reference = [y_val[i].tolist()]
        candidate = predicted_seq.flatten().tolist()
        bleu_score = sentence_bleu(reference, candidate)
        bleu_scores.append(bleu_score)
    return np.mean(bleu_scores)

bleu_score = calculate_bleu(model, [pad, pad2[::-1]], pad2)
print(f"Validation BLEU Score: {bleu_score:.4f}")


ValueError: Input 0 of layer "gru_1" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)