In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

CSV data path

In [6]:
data = '/Applications/ML projects/Song Lyrics/Dataset - 2/archive/csv'

Read lyrics

In [7]:
def readLyrics(data):
  lyrics = []
  for CSV_FILE in tqdm(os.listdir(data)):
    CSV_PATH = os.path.join(data, CSV_FILE)
    df = pd.read_csv(CSV_PATH)
    df = df.dropna()
    dfLyrics = df['Lyric'].values
    for lyric in dfLyrics:
      lyrics.append(lyric)

  return lyrics


In [8]:
lyrics = readLyrics(data)

100%|██████████| 21/21 [00:00<00:00, 64.29it/s]


Clean lyrics

In [8]:
def cleanLyrics(lyrics):
  for i in tqdm(range(len(lyrics))):
    lyric = lyrics[i]
    lyric = lyric.lower()
    lyric = lyric.replace('[^A-Za-z]', '')
    lyric = lyric.replace('\s+', ' ')
    lyric = 'start  ' + " ".join([word for word in lyric.split() if len(word) > 1]) + '  end'
    lyrics[i] = lyric

In [9]:
lyrics[0]

'Tudo o que eu quero nessa vida,\nToda vida, é\nÉ amar você\nAmar você\n\nO seu amor é como uma chama acesa\nQueima de prazer\nDe prazer\n\nEu já falei com Deus que não vou te deixar\nVou te levar pra onde for\nQualquer lugar\nJá fiz de tudo pra não te perder\n\nArerê,\nUm lobby, um hobby, um love com você\nArerê,\nUm lobby, um hobby, um love com você\n\nCai, cai, cai, cai, cai pra cá\nHey, hey, hey\nTu-do,tu-do, vai rolar'

In [7]:
cleanLyrics(lyrics)

100%|██████████| 3422/3422 [00:00<00:00, 13927.16it/s]


In [8]:
lyrics[0]

"start  one one one one one talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times refrain my love he makes me feel like nobody else nobody else but my love he doesn't love me so tell myself tell myself pre one don't pick up the phone you know he's only callin' 'cause he's drunk and alone two don't let him in you'll have to kick him out again three don't be his friend you know you're gonna wake up in his bed in the morning and if you're under him you ain't gettin' over him got new rules count 'em got new rules count 'em gotta tell them to myself got new rules count 'em gotta tell them to myself keep pushin' forwards but he keeps pullin' me backwards nowhere to turn no way nowhere to turn no now i'm standin' back from it finally see the pattern never learn never learn refrain but my love he doesn't love me so tell myself tell myself do do do pre one don't pick up the phone you kno

In [9]:
len(lyrics)

3422

In [10]:
from keras.preprocessing.text import Tokenizer

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

44888

In [12]:
max_length = max(len(lyric.split()) for lyric in lyrics)
max_length

5287

Train and Test Split

In [13]:
split = int(len(lyrics) * 0.80)
train = lyrics[:split]
test = lyrics[split:]

Data Generator

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [15]:
def dataGenerator(lyrics, tokenizer, max_length, vocab_size, batch_size):
  X, y = list(), list()
  n = 0
  while 1:
    for lyric in lyrics:
      n += 1
      seq = tokenizer.texts_to_sequences([lyric])[0]
      for i in range(len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X.append(in_seq)
        y.append(out_seq)
      if n == batch_size:
        X, y = np.array(X), np.array(y)
        yield X, y
        X, y = list(), list()
        n = 0

Build Model

In [16]:
from keras import Model
from keras.layers import Input, Dense, Dropout, LSTM, Embedding

In [17]:
input = Input(shape=(max_length, ))
l1 = Embedding(vocab_size, 256, mask_zero=0)(input)
l2 = Dropout(0.4)(l1)
l3 = LSTM(256)(l2)
l4 = Dense(256, activation='relu')(l3)
output = Dense(vocab_size, activation='softmax')(l4)

model = Model(inputs=input, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')

2023-03-26 12:57:35.687259: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Training Process

In [18]:
EPOCHS = 30
BATCH_SIZE = 32
STEPS = len(train) // BATCH_SIZE

In [19]:
for i in range(EPOCHS):
  data_gen = dataGenerator(train, tokenizer, max_length, vocab_size, BATCH_SIZE)
  model.fit(data_gen, epochs=1, steps_per_epoch=STEPS, verbose=1)

In [None]:
models_path = ''
model.save(os.path.join(models_path, 'best_model.h5'))

Lyric Generation

In [None]:
def idxToWord(index, tokenizer):
  if index in tokenizer.word_index:
    return tokenizer.word_index[index]
  else:
    return None

In [None]:
def predictLyric(model, lyric, tokenizer, max_length):
  in_lyric = lyric
  split = len(in_lyric)

  for i in range(max_length):
    sequence = tokenizer.texts_to_sequences([in_lyric])[0]
    sequence = pad_sequences([sequence], max_length)
    indexes = model.predict([sequence], verbose=0)
    index = np.argmax(indexes)
    word = idxToWord(index, tokenizer)
    if word == None:
      break
    in_lyric += ' ' + word
    if word == 'end':
      out_lyric = in_lyric[split:]
      return out_lyric

  return ''