Friday, May 30th 2024

1. Import Library

In [2]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

2. Install gdown
---
Gdown adalah alat yang berguna untuk mengunduh file dari Google Drive
menggunakan terminal atau command line interface (CLI)

In [3]:
!pip install gdown==5.1.0



3. Download file yang akan digunakan sebagai dataset

In [4]:
# sonnets.txt
!gdown --id 108jAePKK4R3BVYBbYJZ32JWUwxeMg20K

Downloading...
From: https://drive.google.com/uc?id=108jAePKK4R3BVYBbYJZ32JWUwxeMg20K
To: /content/sonnets.txt
100% 93.6k/93.6k [00:00<00:00, 133MB/s]


4. Read file

In [5]:
FILE = './sonnets.txt'

# Read the whole corpus into a single string. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding.
with open(FILE, encoding='utf-8') as f:
  ds = f.read()

5. Convert ke huruf kecil dan bagi menjadi list

In [6]:
# Lowercase the whole corpus, then split it into a list with one entry per line.
# NOTE: this rebinds `ds` from a str to a list, so the cell is not idempotent:
# running it a second time without re-running the file-reading cell fails,
# because a list has no .lower() method.
ds = ds.lower().split('\n')

In [7]:
# Show the number of lines in the corpus, followed by its first two lines.
print(len(ds), ds[0], ds[1], sep='\n')

2159
from fairest creatures we desire increase,
that thereby beauty's rose might never die,


6. Ubah setiap kata menjadi indeks menggunakan Tokenizer
---
Tokenizer digunakan untuk mengubah teks menjadi urutan token atau indeks.


In [8]:
# Create a Tokenizer with default settings and fit it on every line of the
# corpus; the fitted word -> index mapping is read from tokenizer.word_index below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(ds)

In [9]:
total_words = len(tokenizer.word_index) + 1 # add 1 because Tokenizer word indices start at 1; index 0 is reserved (it is the padding value), so valid ids span 0..len(word_index)

In [21]:
# Vocabulary size including the extra slot for index 0.
print(total_words)

3211


7. Change text to sequences

In [10]:
# WARNING: texts_to_sequences expects a LIST of texts, not a bare string.
print(ds[0])

# WRONG: tokenizer.texts_to_sequences(ds[0])
#   ds[0] is a str, and iterating a str yields single characters, so every
#   character would be treated as a separate text.
# RIGHT: wrap the string in a list, then take element 0 of the result to get
#   the word indices of that one line.
tokenizer.texts_to_sequences([ds[0]])[0]

from fairest creatures we desire increase,


[34, 417, 877, 166, 213, 517]

In [11]:
def n_gram_seqs(ds, tokenizer):
  """Turn every line of text into its growing n-gram prefixes of word indices.

  Each line is converted with `tokenizer.texts_to_sequences`. For a line whose
  tokens are [t1, t2, ..., tn] the prefixes [t1, t2], [t1, t2, t3], ...,
  [t1, ..., tn] are emitted, in that order. Lines that yield fewer than two
  tokens contribute nothing.

  Args:
    ds: iterable of strings, one per line of the corpus.
    tokenizer: object exposing `texts_to_sequences(list_of_texts)`.

  Returns:
    A flat list holding the prefixes of all lines, in corpus order.
  """
  input_sequences = []
  for line in ds:
    # texts_to_sequences takes a list of texts and returns a list of
    # sequences, so wrap the single line and unwrap the single result.
    token_ids = tokenizer.texts_to_sequences([line])[0]
    # Prefix lengths run from 2 up to the full length of the line.
    input_sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))
  return input_sequences

In [12]:
# Expand every corpus line into its n-gram prefixes, then record the length of
# the longest prefix; that length is the target width for padding.
input_sequences = n_gram_seqs(ds, tokenizer)
max_seq_len = max(map(len, input_sequences))

In [13]:
# Number of n-gram sequences, then the length of the longest one.
print(len(input_sequences), max_seq_len, sep='\n')

15462
11


8. Add padding to sequences
---
Pad every sequence so that all of them have the same length. `max_seq_len` (the length of the longest sequence) is used as the target length, and any shorter sequence is padded with zeros on the left (`padding='pre'`).

In [14]:
def pad_seqs(input_sequences, max_seq_len):
  """Pad all sequences to a common length of `max_seq_len`.

  Args:
    input_sequences: list of sequences of word indices.
    max_seq_len: length every sequence should have after padding.

  Returns:
    The result of `pad_sequences` with `padding='pre'`, i.e. the padding is
    placed in front of each sequence.
  """
  return pad_sequences(input_sequences, padding='pre', maxlen=max_seq_len)

In [15]:
# Pad every n-gram sequence to the length of the longest one.
padded_input_seqs = pad_seqs(input_sequences, max_seq_len)

In [17]:
# Report the shape of the padded array.
print(f"padded dataset has shape: {padded_input_seqs.shape}")

padded dataset has shape: (15462, 11)


9. Split Data into features and labels

In [18]:
def features_and_labels(seq, total_words):
  """Split padded sequences into model inputs and one-hot encoded targets.

  Args:
    seq: 2-D array of padded sequences, one sequence per row.
    total_words: number of classes used for the one-hot encoding.

  Returns:
    A tuple `(features, one_hot_labels)`: `features` is every column of `seq`
    except the last, and `one_hot_labels` is the last column one-hot encoded
    with `total_words` classes and dtype int8.
  """
  # The last token of each row is the word to predict from the tokens before it.
  next_word_ids = seq[:, -1]
  return seq[:, :-1], to_categorical(next_word_ids, num_classes=total_words, dtype='int8')

In [19]:
# Separate the padded sequences into input tokens and one-hot next-word targets.
features, labels = features_and_labels(padded_input_seqs, total_words)

10. Create Model

POSSIBLE IMPROVEMENTS:
*   OUTPUT DIM : an embedding output dimension other than 100 may work better
*   LSTM UNITS : a number of LSTM units other than 100 may work better



In [30]:
def model(total_words, input_len):
  """Build and compile the next-word prediction network.

  The network is an Embedding layer (output dimension 100), a bidirectional
  LSTM with 100 units, and a softmax Dense layer with `total_words` outputs.

  Args:
    total_words: vocabulary size; used as the Embedding input dimension and
      as the number of output classes.
    input_len: length of each input sequence.

  Returns:
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
  """
  # The local is deliberately not called `model`, to avoid shadowing this
  # function's own name inside its body.
  net = Sequential([
      Embedding(total_words, 100, input_length=input_len),
      Bidirectional(LSTM(100)),
      Dense(total_words, activation='softmax'),
  ])

  net.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy'],
  )
  return net

In [31]:
# The input length is max_seq_len-1 because the last token of every padded
# sequence is split off as the label.
# NOTE(review): this rebinds the name `model` from the builder function to the
# built network, so this cell cannot be run a second time unless the cell that
# defines the function is run again first.
model = model(total_words, max_seq_len-1)

In [32]:
# Train for 50 epochs; `history` keeps the object returned by fit().
history = model.fit(features, labels, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


TRY TO PREDICT

In [34]:
seed_text = "I like"
next_words = 10
for _ in range(next_words):
    # Convert the text generated so far into a sequence of word indices
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad on the left to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    # Get the predicted probability of every word index being the next word
    predicted = model.predict(token_list, verbose=0)
    # Greedy choice: take the index with the highest probability
    predicted = np.argmax(predicted, axis=-1).item()
    # Index 0 is the padding value and has no entry in index_word; stop
    # generating instead of raising KeyError if the model ever predicts it.
    output_word = tokenizer.index_word.get(predicted)
    if output_word is None:
        break
    # Append to the current text
    seed_text += " " + output_word

print(seed_text)

I like to be fair more than my friend can five tend
