In [1]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string, os 
import tensorflow as tf

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional 
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential
from keras.regularizers import L1, L2, L1L2

In [2]:
# Colab only: attach Google Drive to the runtime file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the lyrics CSV under the mounted Google Drive.
PATH_TO_FILE = "/content/drive/MyDrive/DTS/Try Out/Try out/1/lyrics-data.csv"
# Row cap handed to pd.read_csv; None reads the whole file.
LIMIT_ROWS = None
# Fraction of the samples that model.fit holds out for validation.
VAL_SPLIT = 0.1
# Placeholder token the Tokenizer uses for out-of-vocabulary words.
OOV_TOK = "<<OOV>>"
# Length-related hyperparameter consumed by the preprocessing cells.
MAX_SEQ_LEN = 120
# Upper bound on the number of training epochs.
EPOCH_NUMBER = 150

In [4]:
# Read the lyrics CSV into a DataFrame (LIMIT_ROWS = None means no row cap),
# then show (rows, columns) as the cell output.
lyrics = pd.read_csv(filepath_or_buffer = PATH_TO_FILE, nrows = LIMIT_ROWS)
lyrics.shape

(379931, 5)

In [5]:
# preview the top five rows
lyrics.head(n = 5)

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [6]:
# Vocabulary cap for the tokenizer (tune as needed). This is a separate knob
# from MAX_SEQ_LEN, which describes a sequence length and was previously
# passed as num_words by mistake.
VOCAB_SIZE = 5_000

# Build the corpus under its own name so the `lyrics` DataFrame stays intact
# and this cell can be re-run. Missing lyrics are dropped first; otherwise
# astype(str) would turn NaN into the literal word "nan".
corpus = lyrics['Lyric'].dropna().astype(str).str.lower()

# tokenization: only ids below VOCAB_SIZE are emitted, rarer words map to OOV_TOK
tokenizer = Tokenizer(num_words = VOCAB_SIZE, oov_token = OOV_TOK)
tokenizer.fit_on_texts(corpus)

# word_index always holds the full vocabulary regardless of num_words, so the
# number of distinct ids that can appear in `sequences` is capped by VOCAB_SIZE
total_words = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
sequences = tokenizer.texts_to_sequences(corpus)

In [None]:
# slice each lyric into n-gram prefixes: [w1, w2], [w1, w2, w3], ...
input_sequences = []

for sequence in sequences:
  # cap every lyric at MAX_SEQ_LEN tokens so a single long song cannot set
  # the padded width (and memory footprint) of the whole training matrix
  capped = sequence[:MAX_SEQ_LEN]
  input_sequences.extend(capped[:end] for end in range(2, len(capped) + 1))

# fail with a readable message instead of max() complaining about an empty list
if not input_sequences:
  raise ValueError("no n-gram sequences produced: every lyric has fewer than two tokens")

max_seq_len = max(map(len, input_sequences))

In [None]:
# pad every n-gram at the front up to max_seq_len, keeping the most recent
# tokens right-aligned, and store the result as a NumPy array
padded = pad_sequences(input_sequences, maxlen = max_seq_len, padding = 'pre')
input_sequences = np.array(padded)

In [None]:
# every column but the last is model input; the last column is the target id
X = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# expand the integer targets into one-hot rows with total_words columns
y = tf.keras.utils.to_categorical(labels, num_classes = total_words)

In [None]:
# quick look at the predictor shape, class count and padded sequence length
(X.shape, total_words, max_seq_len)

In [None]:
def create_model(optimizer, loss, metrics, units = 128, drop_rate = .5, reg_factor = 0.01):
  """Build and compile the next-word prediction network.

  Layers, in order: Embedding, two stacked bidirectional LSTMs, three
  regularized Dense + Dropout pairs, and a softmax Dense output. The function
  reads the notebook-level `total_words` (embedding input size and output
  width) and `max_seq_len` (input length is `max_seq_len - 1`).

  Args:
    optimizer: optimizer forwarded to `model.compile`.
    loss: loss forwarded to `model.compile`.
    metrics: metrics forwarded to `model.compile`.
    units: base width; the embedding and LSTM layers use `units // 4`,
      the hidden Dense layers use `units // 2`.
    drop_rate: rate of the Dropout layer after each hidden Dense layer.
    reg_factor: coefficient of the L1 / L2 / L1L2 kernel penalties
      (previously hard-coded to 0.01).

  Returns:
    The compiled `Sequential` model.
  """
  embed_units = units // 4
  lstm_units  = units // 4
  dense_units = units // 2

  model = Sequential()
  model.add(Embedding(total_words, embed_units, input_length = max_seq_len - 1))
  model.add(Bidirectional(LSTM(lstm_units, return_sequences = True)))
  model.add(Bidirectional(LSTM(lstm_units)))
  # Dense has no `regularizer` argument; weight penalties are attached through
  # `kernel_regularizer` (the old keyword made layer construction raise).
  model.add(Dense(dense_units, activation = "relu", kernel_regularizer = L1(reg_factor)))
  model.add(Dropout(drop_rate))
  model.add(Dense(dense_units, activation = "relu", kernel_regularizer = L2(reg_factor)))
  model.add(Dropout(drop_rate))
  model.add(Dense(dense_units, activation = "relu", kernel_regularizer = L1L2(reg_factor, reg_factor)))
  model.add(Dropout(drop_rate))
  model.add(Dense(total_words, activation = "softmax"))
  model.compile(optimizer = optimizer, loss = loss, metrics = metrics)

  return model

In [None]:
# training setup: Adam optimizer, categorical cross-entropy, accuracy metric
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = ["accuracy"]

# build the network with a base width of 128 and a dropout rate of 0.2
model = create_model(
    optimizer = optimizer,
    loss = loss,
    metrics = metrics,
    units = 128,
    drop_rate = .2,
)
model.summary()

In [None]:
# define callbacks for training
# EarlyStopping has to wait longer than ReduceLROnPlateau: with the earlier
# patience of 3 vs 5, training was normally halted before the learning rate
# was ever lowered. Now the rate is cut after 2 stagnant epochs and training
# stops only if validation accuracy still has not improved after 5.
callbacks = [
    EarlyStopping(monitor   = 'val_accuracy', 
                  min_delta = 1e-5, 
                  patience  = 5, 
                  mode      = 'auto',
                  # keep the best epoch's weights rather than the last epoch's
                  restore_best_weights = True,
                  verbose   = 1),
    ReduceLROnPlateau(monitor   = 'val_accuracy',
                      factor    = 0.1,
                      patience  = 2,
                      min_delta = 1e-4,
                      cooldown  = 1,
                      verbose   = 1)
]

In [None]:
# fit on the n-gram predictors/labels; the returned History feeds the plots
fit_options = dict(
    epochs = EPOCH_NUMBER,
    validation_split = VAL_SPLIT,
    callbacks = callbacks,
)
history = model.fit(X, y, **fit_options)

In [None]:
# Plot Utility
def plot_graphs(history, string):
  """Draw the training curve and its validation counterpart for one metric.

  Args:
    history: object whose `.history` mapping holds per-epoch values under
      the keys `string` and `'val_' + string`.
    string: metric name, e.g. 'accuracy' or 'loss'; also used as the y label.
  """
  series_names = [string, 'val_'+string]

  # training series first, validation second, matching the legend order
  for series_name in series_names:
    plt.plot(history.history[series_name])

  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(series_names)
  plt.show()

# Plot the accuracy history, then the loss history
for metric_name in ('accuracy', 'loss'):
  plot_graphs(history, metric_name)