# Penyelesaian

## Import Library Tensorflow

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU,LSTM, Input, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import sentence_bleu

## Prapemrosesan Data

In [2]:
# Load the movie-plots dataset from its CSV file into a DataFrame.
DATA_PATH = "/content/wiki_movie_plots_deduped - wiki_movie_plots_deduped.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

(34886, 8)

In [5]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
Release Year,0
Title,0
Origin/Ethnicity,0
Director,0
Cast,1422
Genre,28
Wiki Page,0
Plot,0


In [6]:
# Per-column summary: non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34858 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [7]:
# Keep only the films whose origin is one of the five selected values.
SELECTED_ORIGINS = ['American', 'Chinese', 'British', 'Japanese', 'Bollywood']
df = df[df['Origin/Ethnicity'].isin(SELECTED_ORIGINS)]

In [8]:
# Subset of the filtered frame used for tokenisation and training.
# NOTE(review): `.loc` slices by index *label*, not by position, and the
# filtered frame keeps its original (non-contiguous) labels, so this keeps
# every row whose label is <= half the row count rather than exactly the
# first half of the rows. Confirm this is intended (else use `.iloc`).
half_label = len(df) // 2
df_txt = df.loc[:half_label]

## Pemrosesan Data: Tokenizer

In [9]:
# Tokenizer for the plot texts (the encoder side of the model).
token = Tokenizer()

# Learn the word index from every plot in the subset.
token.fit_on_texts(df_txt['Plot'])

# Encode each plot as a list of integer word ids.
seq = token.texts_to_sequences(df_txt['Plot'])

# The longest encoded plot defines the common sequence length.
max_len = max(map(len, seq))

# Pad at the end of each sequence ('post') so all rows have length `max_len`.
pad = pad_sequences(seq, padding='post', maxlen=max_len)

In [10]:
# Report the padded length and the shape of the encoded plot matrix.
for label, value in (("Maximum ssequence lenght: ", max_len),
                     ("shape of  pad: ", pad.shape)):
    print(label, value)

Maximum ssequence lenght:  2965
shape of  pad:  (12815, 2965)


In [11]:
# Plot vocabulary size: one id per indexed word plus one extra slot
# (presumably so that id 0, used for padding, is also a valid index).
voc_size = 1 + len(token.word_index)

In [12]:
# Second tokenizer, fitted on the titles (the decoder side of the model).
token2 = Tokenizer()
token2.fit_on_texts(df_txt['Title'])

# Encode each title as integer word ids and pad at the end to the longest title.
seq2 = token2.texts_to_sequences(df_txt['Title'])
max_len_2 = max(map(len, seq2))
pad2 = pad_sequences(seq2, padding='post', maxlen=max_len_2)

In [13]:
# Report the padded title length and the shape of the encoded title matrix.
for label, value in (("max_len_2: ", max_len_2),
                     ('shape of pad2: ', pad2.shape)):
    print(label, value)

max_len_2:  15
shape of pad2:  (12815, 15)


In [14]:
# Title vocabulary size: one id per indexed word plus one extra slot
# (presumably so that id 0, used for padding, is also a valid index).
voc_size2 = 1 + len(token2.word_index)

# Modeling

Jika menggunakan LSTM:

In [15]:
# Encoder-decoder (seq2seq) model with a three-layer LSTM encoder and a
# single-layer LSTM decoder: plot tokens in, title tokens out.
K.clear_session()  # reset Keras state left over from any previously built model
latent_dim = 120   # number of units in every LSTM layer

# ----- Encoder -----
# One padded plot sequence of length `max_len` per sample.
enc_inp = Input(shape=(max_len,))

# Trainable 40-dimensional embedding over the plot vocabulary.
out = Embedding(voc_size, 40, trainable=True)(enc_inp)

# Three stacked LSTMs; each returns its full output sequence (fed to the next
# layer) together with its final hidden (h) and cell (c) states.
enc_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
enc_out1, state_h1, state_c1 = enc_lstm1(out)

enc_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
enc_out2, state_h2, state_c2 = enc_lstm2(enc_out1)

enc_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True)
enc_out, state_h, state_c = enc_lstm3(enc_out2)

# Only the final states of the last encoder layer are handed to the decoder.
enc_states = [state_h, state_c]

# ----- Decoder -----
# Variable-length title sequence.
dec_inp = Input(shape=(None,))

# Trainable 20-dimensional embedding over the title vocabulary.
dec_layer = Embedding(voc_size2, 20, trainable=True)
dec_emb = dec_layer(dec_inp)

# Decoder LSTM, initialised with the encoder's final states. With
# return_state=True the two extra return values are the hidden and cell state
# (the variable names below are kept for compatibility with later cells).
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_out, dec_fwd_state, dec_back_state = dec_lstm(dec_emb, initial_state=enc_states)

# Per-timestep distribution over the *title* vocabulary. The decoder predicts
# title tokens, so the layer is sized with voc_size2; it was previously sized
# with the plot vocabulary (voc_size), which does not match the targets and is
# inconsistent with the GRU variant of this model.
dec_out = Dense(voc_size2, activation='softmax')(dec_out)

# ----- Model -----
model = Model([enc_inp, dec_inp], dec_out)

# Integer-encoded targets, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Show the layer-by-layer architecture.
model.summary()

Jika menggunakan GRU:

In [18]:
# Encoder-decoder model built with a multi-layer GRU encoder.
K.clear_session()

# Model hyper-parameters
latent_dim = 128     # size of each GRU hidden state
embedding_dim = 100  # size of the word embeddings

# ----- Encoder -----
encoder_inputs = Input(shape=(pad.shape[1],))
encoder_embedding = Embedding(input_dim=voc_size, output_dim=embedding_dim)(encoder_inputs)

# First GRU layer
encoder_gru1 = GRU(units=latent_dim, return_state=True, return_sequences=True)
encoder_outputs1, state_h1 = encoder_gru1(encoder_embedding)

# Second GRU layer
encoder_gru2 = GRU(units=latent_dim, return_state=True, return_sequences=True)
encoder_outputs2, state_h2 = encoder_gru2(encoder_outputs1)

# Third GRU layer
encoder_gru3 = GRU(units=latent_dim, return_sequences=True, return_state=True)
encoder_outputs3, state_h3 = encoder_gru3(encoder_outputs2)

# The final state of the last encoder layer seeds the decoder.
encoder_states = [state_h3]

# ----- Decoder -----
decoder_inputs = Input(shape=(pad2.shape[1],))
decoder_embedding = Embedding(input_dim=voc_size2, output_dim=embedding_dim)(decoder_inputs)

# Decoder GRU, started from the encoder state; its own final state is discarded.
decoder_gru = GRU(units=latent_dim, return_state=True, return_sequences=True)
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_states)

# Softmax over the title vocabulary at every timestep.
decoder_dense = Dense(units=voc_size2, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# ----- Model -----
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Early stopping on the validation loss. No `patience` is given, so the Keras
# default applies.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Plots (encoder input) and titles (decoder input / target) must be row-aligned.
# This replaces a bare shape tuple that was evaluated here but never displayed.
assert pad.shape[0] == pad2.shape[0], "pad and pad2 must contain the same number of samples"

# Integer targets with a trailing singleton axis: (samples, timesteps, 1).
title_targets = np.expand_dims(pad2, axis=-1)

# Training. The titles are passed in their original order: the decoder is meant
# to predict the title tokens in sequence, and reversing them would disrupt that.
# NOTE(review): the decoder input and the target are the same, unshifted array,
# so at every timestep the decoder is shown the very token it must predict.
# Consider feeding a right-shifted copy (with a start token) as decoder input;
# the reported accuracy is likely optimistic until then.
history = model.fit([pad, pad2], title_targets, epochs=10, batch_size=1000,
                    callbacks=[es], validation_split=0.2)



# Plot the loss and accuracy curves side by side.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training and validation loss.
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Training and Validation Loss')

# Right panel: training and validation accuracy.
acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.set_title('Training and Validation Accuracy')

plt.tight_layout()
plt.show()

# Final evaluation on the validation data.
# With `validation_split`, Keras holds out the *last* fraction of the samples
# (taken before shuffling), so the same tail slice is rebuilt here. The 0.2
# below must match the `validation_split` passed to `model.fit`.
val_start = int(np.floor(pad.shape[0] * (1.0 - 0.2)))
val_plots = pad[val_start:]
val_titles = pad2[val_start:]

# The decoder input is passed in the same row order as the encoder input and
# the targets. It was previously `pad2[::-1]`, which reverses the order of the
# samples and pairs every plot with another film's title.
val_loss, val_accuracy = model.evaluate(
    [val_plots, val_titles],
    val_titles.reshape(val_titles.shape[0], val_titles.shape[1], 1),
    verbose=0,
)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/10


# Evaluasi

In [None]:
# Compute the BLEU score of the model's predictions.
def calculate_bleu(model, x_val, y_val, pad_id=0, batch_size=64):
    """Return the mean sentence-level BLEU of the model's greedy predictions.

    Parameters
    ----------
    model : object with a ``predict(x, verbose=0)`` method
        Must return per-timestep scores shaped (samples, timesteps, vocab).
    x_val : array or list/tuple of arrays
        Model input. A list/tuple is treated as one array per model input
        (multi-input model); every array is sliced along its first axis.
    y_val : array
        Reference token ids, one row per sample.
    pad_id : int, default 0
        Token id treated as padding; removed from references and candidates
        before scoring so padding does not count as matching tokens.
    batch_size : int, default 64
        Number of samples predicted per ``model.predict`` call.

    Returns
    -------
    float
        Mean BLEU over all samples; 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If ``x_val`` is an empty list of inputs, or if the inputs and
        ``y_val`` do not contain the same number of samples.
    """
    multi_input = isinstance(x_val, (list, tuple))
    if multi_input and len(x_val) == 0:
        raise ValueError("x_val must contain at least one input array")

    # For a multi-input model the sample count is the length of each input
    # array, not the length of the list that holds them.
    n_samples = len(x_val[0]) if multi_input else len(x_val)
    if n_samples != len(y_val):
        raise ValueError(
            f"x_val has {n_samples} samples but y_val has {len(y_val)}"
        )
    if n_samples == 0:
        return 0.0

    bleu_scores = []
    # Predict in small batches: far fewer predict() calls than one per sample,
    # without materialising the score tensor for the whole dataset at once.
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        if multi_input:
            batch_x = [arr[start:stop] for arr in x_val]
        else:
            batch_x = x_val[start:stop]

        probs = model.predict(batch_x, verbose=0)
        predicted_ids = np.argmax(probs, axis=-1)  # most probable id per timestep

        for ref_ids, cand_ids in zip(y_val[start:stop], predicted_ids):
            reference = [tok for tok in np.ravel(ref_ids).tolist() if tok != pad_id]
            candidate = [tok for tok in np.ravel(cand_ids).tolist() if tok != pad_id]
            if not candidate:
                # Nothing but padding was predicted: no n-gram can match.
                bleu_scores.append(0.0)
                continue
            # NOTE(review): sentence_bleu defaults to up-to-4-gram precision
            # without smoothing; very short sequences will presumably score
            # near zero. Consider explicit weights or a smoothing function.
            bleu_scores.append(sentence_bleu([reference], candidate))

    return float(np.mean(bleu_scores))

# BLEU on the validation data.
# With `validation_split`, Keras holds out the *last* fraction of the samples
# (taken before shuffling); the 0.2 below must match the value used in
# `model.fit`.
val_start = int(np.floor(pad.shape[0] * (1.0 - 0.2)))

# Encoder input, decoder input and references are passed in the same row
# order. The decoder input was previously `pad2[::-1]`, which reverses the
# order of the samples and pairs every plot with another film's title.
bleu_score = calculate_bleu(
    model,
    [pad[val_start:], pad2[val_start:]],
    pad2[val_start:],
)
print(f"Validation BLEU Score: {bleu_score:.4f}")


# Analisis
## Bagaimana perbandingan performa antara model multilayer GRU dan single layer GRU dalam prediksi judul film?