# Generating Indonesian Lyrics Using Deep Learning

Companion notebook for the article https://medium.com/@haryoaw/generating-indonesian-lyric-using-deep-learning-first-part-2c7634237475

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn import preprocessing
from keras.layers import Dense
from keras.layers import Embedding
from keras import Sequential
import keras
import pickle
from IPython.display import HTML


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Text Cleaning

In [3]:
def clean_text_lyric(lyric, max_length):
    """Turn one scraped lyric page fragment into a flat, space-separated token string.

    The raw text is cut at every '</span>'; each piece is treated as one lyric
    line. Pieces containing 'http' are dropped. Every surviving line is stripped
    of its '<div ...>' / '<span ...>' tags, has everything except letters,
    digits, '<', '/', '>' and whitespace replaced by a space, is wrapped in
    ' <div> ... </div> ', and finally loses its digits and \\r, \\t, \\n
    characters.

    Args:
        lyric: raw HTML string of a song.
        max_length: window size; the result is prefixed with
            ``max_length - 1`` copies of ' <start> '.

    Returns:
        The cleaned lyric string, including the leading ' <start> ' padding.
    """
    # Whatever follows the final '</span>' is not a lyric line, so drop it.
    fragments = lyric.split('</span>')[:-1]

    cleaned_lines = []
    for fragment in fragments:
        if 'http' in fragment:
            continue

        # NOTE: '.*' is greedy, so each pattern removes everything from the tag
        # opener up to the LAST '>' found in the fragment.
        text = re.sub(r'<div .*>', '', fragment)
        text = re.sub(r'<span .*>', '', text).strip()
        if not text:
            continue

        text = re.sub(r'[^a-zA-Z0-9<\/>\s]', ' ', text)
        text = " <div> " + text + " </div> "
        # A line that starts with <br> gets the break moved in front of its <div>.
        text = re.sub(r'<div>\s+<br>', '<br> <div> ', text)
        text = re.sub(r'(\d|\r|\t|\n)', '', text)
        cleaned_lines.append(text)

    return ' <start> ' * (max_length-1) + ''.join(cleaned_lines)

In [5]:
# Window size: (max_length - 1) context words are used to predict the next word.
max_length = 10

# Path is relative to the notebook's working directory.
lyric_csv_path = 'st_12_generator/lirik_lagu_scraper/kapanlagi.csv'
scrape_lyric_raw = pd.read_csv(lyric_csv_path)

# Keep the two columns of interest and only rows whose raw HTML contains
# 'span', since clean_text_lyric cuts lyric lines at '</span>'.
scrape_lyric = scrape_lyric_raw[['song', 'band']].dropna()
scrape_lyric = scrape_lyric[scrape_lyric.song.str.contains('span')]

# .assign builds a new frame instead of writing a column onto a filtered
# slice, which avoids pandas' chained-assignment (SettingWithCopy) hazard.
scrape_lyric = scrape_lyric.assign(
    song=scrape_lyric.song.apply(lambda x: clean_text_lyric(x, max_length))
)

# Drop rows whose cleaned text still contains the literal 'Belum Ada Lirik'.
scrape_lyric = scrape_lyric[~(scrape_lyric.song.str.contains('Belum Ada Lirik'))]
scrape_lyric = scrape_lyric.reset_index(drop=True)

# Preprocessing Data

In [7]:
def generate_train_dataset(song_series, window):
    """Slide a fixed-size window over every song to build next-word samples.

    Each song string is split on whitespace. Every run of ``window``
    consecutive tokens yields one sample: the first ``window - 1`` tokens,
    joined by single spaces, form the input and the last token is the label.
    Songs with fewer than ``window`` tokens contribute nothing.

    Args:
        song_series: iterable of song strings.
        window: number of tokens per sample (context plus label).

    Returns:
        A pair of numpy arrays ``(inputs, labels)`` of equal length.
    """
    contexts, targets = [], []
    context_len = window - 1

    for song in song_series:
        tokens = song.split()
        n_windows = len(tokens) - window + 1
        for start in range(n_windows):
            target_pos = start + context_len
            contexts.append(' '.join(tokens[start:target_pos]))
            targets.append(tokens[target_pos])

    return np.array(contexts), np.array(targets)

In [8]:
# Characters the tokenizer strips. '<', '>' and '/' are not in the list, so
# markup tokens such as <div>, </div>, <br> and <start> are kept intact.
tokenizer_filters = '!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n'

context_texts, label_texts = generate_train_dataset(scrape_lyric.song, window=max_length)

# The vocabulary is learned from the context strings.
pre = keras.preprocessing.text.Tokenizer(filters=tokenizer_filters)
pre.fit_on_texts(context_texts.flatten())
vocab_size = len(pre.word_index)

# Encode contexts and labels as lists of word indices.
X = pre.texts_to_sequences(context_texts)
y = pre.texts_to_sequences(label_texts)

# The string arrays are no longer needed once encoded; release them.
del context_texts, label_texts

# Deep Learning Architecture

In [16]:
# Next-word model: a (max_length - 1)-token context goes in, a probability
# distribution over vocab_size + 1 classes comes out.
model = Sequential()

# The Keras tokenizer numbers words from 1 to vocab_size and keeps 0 reserved,
# so the largest index that can reach this layer is vocab_size. The embedding
# table therefore needs vocab_size + 1 rows (input_dim = max index + 1), the
# same class count the softmax layer below uses.
e = Embedding(vocab_size + 1, 100, input_length=max_length-1, trainable=True)
model.add(e)
model.add(keras.layers.BatchNormalization())

# CuDNNLSTM is the GPU-only LSTM implementation; this cell needs a CUDA GPU.
model.add(keras.layers.Bidirectional(keras.layers.CuDNNLSTM(100)))
model.add(keras.layers.BatchNormalization())

model.add(Dense(100, activation='relu'))
model.add(keras.layers.BatchNormalization())

# One output unit per class index 0..vocab_size.
model.add(Dense(vocab_size+1, activation='softmax'))
# compile the model; labels are fed one-hot encoded, hence categorical_crossentropy
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Train Model

In [14]:
def batch_data(start,end):
    """Materialise training rows ``[start, end)`` as arrays ready for ``model.fit``.

    Reads the module-level ``X``, ``y`` and ``vocab_size``. The result is
    published through the module-level ``X_input`` and ``y_input`` and is also
    returned, so callers can use either.

    Args:
        start: index of the first row to include.
        end: index one past the last row to include.

    Returns:
        Tuple ``(X_input, y_input)``: the encoded contexts and the labels
        one-hot encoded over ``vocab_size + 1`` classes.
    """
    global X_input, y_input
    # Slice first, then convert: converting the whole dataset to an array on
    # every call would make one pass over the data cost O(n^2) overall.
    X_input = np.array(X[start:end])
    y_input = np.array(y[start:end])
    y_input = keras.utils.to_categorical(y_input, vocab_size+1)
    return X_input, y_input


In [17]:
max_epoch = 1 # Change this max_epoch
instance_max = len(X)
# Rows handed to each model.fit call; the optimiser's own mini-batch size is
# the batch_size=64 argument passed to fit below.
batch_size = 10000

for i in range(max_epoch):
    print("epoch %d" % (i))
    # Walk the dataset in chunks so only one chunk is one-hot encoded at a time.
    for start_idx_counter in range(0, instance_max, batch_size):
        end_idx = min(start_idx_counter + batch_size, instance_max)
        # batch_data publishes the current chunk as the globals X_input / y_input.
        batch_data(start_idx_counter, end_idx)
        model.fit(X_input, y_input, epochs=1, batch_size=64)



epoch 0
Epoch 1/1
Epoch 1/1
 1984/10000 [====>.........................] - ETA: 10s - loss: 6.7436

KeyboardInterrupt: 

In [None]:
# Persist the network and the tokenizer together: the word indices the model
# was trained on are only meaningful with this exact tokenizer.
model.save('lyric_gen_model.h5')
# The context manager closes the file even if pickling raises.
with open("tokenizer_pre_lyric.p", "wb") as tokenizer_file:
    pickle.dump(pre, tokenizer_file)

# Generate the Lyric

In [4]:
# Restore the trained network and its matching tokenizer from disk.
model = keras.models.load_model('lyric_gen_model.h5')
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# tokenizer file that you produced yourself.
with open("tokenizer_pre_lyric.p", "rb") as tokenizer_file:
    pre = pickle.load(tokenizer_file)

In [22]:
def preprocess_predict(X):
    """Encode a list of texts as word-index sequences using the global tokenizer ``pre``."""
    return pre.texts_to_sequences(X)

def is_word_in_dict(words, word_idx):
    """Check that every whitespace-separated word of ``words`` is a key of ``word_idx``.

    Args:
        words: text to validate.
        word_idx: mapping (or any container supporting ``in``) of known words.

    Returns:
        True when all words are known (including when ``words`` is empty),
        False at the first unknown word, which is also reported on stdout.
    """
    for word in words.split():
        if word not in word_idx:
            # Report only the offending word; printing the whole vocabulary
            # would flood the notebook output.
            print("word not in vocabulary: %s" % word)
            return False
    return True

def generate_song_by_model(model, idx_word, X_sent, total_length, word_idx, total_classes, word_generated=80):
    """Extend a seed sentence by sampling words from the model one at a time.

    At every step the last ``total_length - 1`` words of the text generated so
    far are encoded, the model predicts a distribution over classes, and one
    class is drawn at random from that distribution.

    Args:
        model: object whose ``predict`` returns, for one encoded context, a
            row of ``total_classes`` probabilities.
        idx_word: mapping from class index to word.
        X_sent: seed text; every whitespace-separated word must be in
            ``word_idx``.
        total_length: window size used to build the training samples; the
            model is fed ``total_length - 1`` words.
        word_idx: mapping from word to class index, used to validate the seed.
        total_classes: number of classes in the model output.
        word_generated: number of words to append to the seed.

    Returns:
        The seed followed by the sampled words, or ``None`` when the seed
        contains a word that is missing from ``word_idx``.
    """
    if not is_word_in_dict(X_sent, word_idx):
        # Same result as before for an unusable seed, now stated explicitly.
        return None

    context_size = total_length - 1
    # Classes with no word attached (e.g. index 0, which the Keras tokenizer
    # reserves) must never be sampled: looking them up in idx_word would fail.
    word_mask = np.array(
        [class_id in idx_word for class_id in range(total_classes)], dtype=bool
    )

    string_returned = X_sent
    for _ in range(word_generated):
        # Always feed the most recent context_size words, whatever the seed length.
        tokens = string_returned.split()
        if context_size > 0:
            tokens = tokens[-context_size:]
        X = np.array(preprocess_predict([' '.join(tokens)]))
        y_pred = model.predict(X)

        # Work in float64 and renormalise: float32 softmax rows can miss the
        # sum-to-one tolerance that np.random.choice enforces.
        probabilities = np.asarray(y_pred[0], dtype=np.float64) * word_mask
        total = probabilities.sum()
        if not total > 0:
            # Nothing sensible to sample from (all mass on word-less classes,
            # or NaNs); return what has been generated so far.
            break
        probabilities = probabilities / total

        chosen_class = np.random.choice(total_classes, p=probabilities)
        string_returned = string_returned + " " + idx_word[chosen_class]

    return string_returned

In [25]:
# Must match the window size and vocabulary the loaded model was trained with.
max_length = 10
vocab_size = len(pre.word_index)

# Seed the generator with a full context window of <start> markers.
seed = "<start> " * (max_length-1)

generated_lyric = generate_song_by_model(
    model=model,
    idx_word=pre.index_word,
    X_sent=seed,
    total_length=max_length,
    word_idx=pre.word_index,
    total_classes=vocab_size+1,
    word_generated=50,
)

# The generated text carries <div>/<br> markup, so render it as HTML.
HTML(generated_lyric)