In [1]:
import pandas as pd

In [None]:
# Optional one-time setup: uncomment the three shell commands below to download
# the dataset archive, unpack it, and delete the zip file afterwards.
# NOTE(review): presumably the archive provides the DATA/ directory read by the
# loading cell - confirm after extraction.
# !wget https://storage.googleapis.com/babert-pretraining/IndoNLG_finals/downstream_task/downstream_task_datasets.zip
# !unzip downstream_task_datasets.zip
# !rm downstream_task_datasets.zip

In [None]:
# Read the three pre-split JSON files (train / test / validation) into DataFrames.
split_paths = {
    'train': 'DATA/MT_JAVNRF_INZNTV/train_preprocess.json',
    'test': 'DATA/MT_JAVNRF_INZNTV/test_preprocess.json',
    'valid': 'DATA/MT_JAVNRF_INZNTV/valid_preprocess.json',
}
train_data = pd.read_json(split_paths['train'])
test_data = pd.read_json(split_paths['test'])
val_data = pd.read_json(split_paths['valid'])

In [None]:
# Count the missing values in each column of the training split.
train_data.isnull().sum()

In [None]:
# Print pandas' built-in summary of the training DataFrame.
train_data.info()

In [None]:
# Remove the `id` column from every split.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it does not raise KeyError once `id` is gone.
train_data = train_data.drop(columns='id', errors='ignore')
test_data = test_data.drop(columns='id', errors='ignore')
val_data = val_data.drop(columns='id', errors='ignore')

In [None]:
# Display the training split (the `id` column is dropped in an earlier cell).
train_data

In [None]:
# Display the test split (the `id` column is dropped in an earlier cell).
test_data

In [None]:
# Display the validation split (the `id` column is dropped in an earlier cell).
val_data

Preprocess the data (only train_data is used)

In [None]:
import pandas as pd

# Build the three parallel text lists used by the seq2seq pipeline:
#   inputs    - source sentences (the `text` column)
#   outputs_i - decoder inputs: each target sentence prefixed with '<sos> '
#   outputs   - decoder targets: each target sentence suffixed with ' <eos>'
# The columns are read as whole lists instead of row-by-row with iterrows(),
# which produces the same values without a per-row Series being built.
inputs = train_data['text'].tolist()
labels = train_data['label'].tolist()
outputs_i = ['<sos> ' + label for label in labels]
outputs = [label + ' <eos>' for label in labels]

# Print statistics
print('Total inputs =', len(inputs))
print('Total intermediate outputs =', len(outputs_i))
print('Total outputs =', len(outputs))

# Show row 100 when it exists; on a shorter dataset fall back to the last row
# instead of raising IndexError, and print nothing when there are no rows.
if inputs:
    sample_idx = min(100, len(inputs) - 1)
    print('\nSample:')
    print(inputs[sample_idx])
    print(outputs_i[sample_idx])
    print(outputs[sample_idx])

Params

In [None]:
# Intended cap on the number of records to use.
# NOTE(review): no cell visible in this notebook reads NUM_SENTENCES - confirm
# whether the cap should actually be applied to the training data.
NUM_SENTENCES = 20000 # Use only the first 20,000 records.
# Passed as `num_words` to both Tokenizers and used as the upper bound on the
# number of rows in the encoder embedding matrix.
MAX_NUM_WORDS = 20000 # Use 20,000 words for tokenizing
# NOTE(review): not referenced by any visible cell; padding uses the longest
# observed sentence lengths instead - confirm whether this is still needed.
MAX_SENT_LEN = 50

# Width of each row of the encoder embedding matrix. GloVe vectors read from
# 'glove.6B.100d.txt' are copied into those rows, so the two sizes must agree.
EMBEDDING_SIZE = 100

# Number of units in the encoder and decoder LSTMs; also used as the output
# dimension of the decoder Embedding layer.
LSTM_NEURONS = 100

# Passed to model.fit as batch_size and epochs respectively.
BATCH_SIZE = 64
EPOCHS = 5

In [None]:
pip install tensorflow


In [None]:
#Tokenize
from keras.preprocessing.text import Tokenizer

# --- Source side -----------------------------------------------------------
# Fit a tokenizer on the source sentences and convert them to integer sequences.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(inputs)
inputs_seq = input_tokenizer.texts_to_sequences(inputs)

inputs_word2index = input_tokenizer.word_index
print('Total unique words in input:', len(inputs_word2index))

# Vocabulary size plus one extra slot (index 0 is not assigned to any word).
inputs_numwords = 1 + len(inputs_word2index)

inputs_maxlen = max(map(len, inputs_seq))
print('Length of longest sentence in input:', inputs_maxlen)

# --- Target side -----------------------------------------------------------
# filters='' turns off character filtering so the '<sos>' / '<eos>' markers
# are kept as tokens. One tokenizer is fitted on both target variants so that
# decoder inputs and decoder targets share a single vocabulary.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(outputs_i + outputs)

outputs_i_seq, outputs_seq = (
    output_tokenizer.texts_to_sequences(texts) for texts in (outputs_i, outputs)
)

outputs_word2index = output_tokenizer.word_index
print('Total unique words in output:', len(outputs_word2index))

outputs_numwords = 1 + len(outputs_word2index)

outputs_maxlen = max(map(len, outputs_seq))
print('Length of longest sentence in output:', outputs_maxlen)

In [None]:
#Padding
from keras.preprocessing.sequence import pad_sequences

# Encoder inputs: padded to the longest source sentence. No `padding` argument
# is given, so pad_sequences uses its default padding side.
encoder_input_sequences = pad_sequences(inputs_seq, maxlen=inputs_maxlen)
print('encoder_input_sequences shape:', encoder_input_sequences.shape)

# Decoder inputs and targets share the same settings: padded at the end
# ('post') up to the longest target sentence.
decoder_pad_options = {'maxlen': outputs_maxlen, 'padding': 'post'}

decoder_input_sequences = pad_sequences(outputs_i_seq, **decoder_pad_options)
print('decoder_inputs_sequences shape:', decoder_input_sequences.shape)

decoder_output_sequences = pad_sequences(outputs_seq, **decoder_pad_options)
print('decoder_output_sequences shape:', decoder_output_sequences.shape)

In [None]:
from numpy import asarray, zeros

# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
embeddings_dict = {}

# `with` guarantees the file is closed even if parsing a line raises.
with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            continue  # skip blank lines rather than failing on records[0]
        # First field is the word, the remaining fields are its vector.
        embeddings_dict[records[0]] = asarray(records[1:], dtype='float32')

# The matrix has one row per usable vocabulary index, capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(inputs_word2index)+1)

# Rows stay all-zero for words that have no GloVe vector.
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))

for word, index in inputs_word2index.items():
    if index >= num_words:
        # word_index lists every word seen during fitting, not only the
        # num_words most frequent ones; such indices have no row in the matrix
        # and would raise IndexError.
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from numpy import zeros

# Embedding layers: the encoder embedding is initialised from embedding_matrix,
# the decoder embedding is created without pre-trained weights.
encoder_embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=inputs_maxlen)
decoder_embedding_layer = Embedding(outputs_numwords, LSTM_NEURONS)

# One-hot training targets with shape (samples, target timesteps, target vocabulary).
decoder_outputs_onehot = zeros((len(inputs), outputs_maxlen, outputs_numwords), dtype='float32')
print('decoder_outputs_onehot shape:', decoder_outputs_onehot.shape)

# Fill the targets from the padded '<eos>'-terminated sequences. The source of
# the token ids must be decoder_output_sequences; iterating anything empty here
# would leave every target all-zero and give the loss nothing to learn from.
# Every timestep is marked, including padding positions (token id 0).
for i, target_sequence in enumerate(decoder_output_sequences):
    for t, word_index in enumerate(target_sequence):
        decoder_outputs_onehot[i, t, word_index] = 1

# Encoder: embed the source tokens and keep only the final LSTM states.
encoder_inputs = Input(shape=(inputs_maxlen,))
encoder_inputs_emb = encoder_embedding_layer(encoder_inputs)
encoder = LSTM(LSTM_NEURONS, return_state=True)
encoder_outputs, h, c = encoder(encoder_inputs_emb)
encoder_states = [h, c]

# Decoder: starts from the encoder's final states and returns an output for
# every timestep of the decoder input sequence.
decoder_inputs = Input(shape=(outputs_maxlen,))
decoder_inputs_emb = decoder_embedding_layer(decoder_inputs)
decoder = LSTM(LSTM_NEURONS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_inputs_emb, initial_state=encoder_states)

# Apply Dense layer to each time step of the decoder output sequence
output_dense_layer = TimeDistributed(Dense(outputs_numwords, activation='softmax'))
# NOTE: this rebinds `outputs`, which earlier held the list of target strings.
# The name is kept because the model-construction cell reads `outputs`; the
# tokenization cell therefore cannot be re-run after this one without first
# re-running the preprocessing cell.
outputs = output_dense_layer(decoder_outputs)



In [None]:
from keras.models import Model

# Training model: maps [encoder token ids, decoder input token ids] to the
# per-timestep softmax output of the decoder.
model = Model([encoder_inputs, decoder_inputs], outputs)
# categorical_crossentropy matches the one-hot targets in decoder_outputs_onehot.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

In [None]:
from keras.utils import plot_model

# Write a diagram of the model graph to model_plot.png, including tensor shapes
# and layer names.
# NOTE(review): plot_model usually needs pydot and Graphviz installed - confirm
# they are available in this environment.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Train the model: the decoder is fed the '<sos>'-prefixed targets and is
# fitted against the one-hot '<eos>'-suffixed targets. 1% of the samples is
# held out for validation; the returned History object is kept in `trn`.
fit_inputs = [encoder_input_sequences, decoder_input_sequences]
trn = model.fit(
    x=fit_inputs,
    y=decoder_outputs_onehot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.01,
)