In [1]:
# Colab only: mount Google Drive at /content/drive/ so the dataset and model
# paths used in the cells below resolve.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [3]:
# Directory on the mounted Drive whose CSV files are read by read_csv().
data = '/content/drive/MyDrive/ML datasets/Dataset - 2/archive/reduced_csv'

Reading the Data

In [4]:
def read_csv(data):
  """Collect the 'Lyric' column from every CSV file in a directory.

  Args:
    data: Path of the directory that holds the lyric CSV files.

  Returns:
    A flat list with one entry per retained row, ordered by sorted file
    name and then by row order within each file.
  """
  # os.listdir() returns entries in arbitrary order and may include entries
  # that are not CSV files, so filter and sort to keep the corpus reproducible
  # and to avoid handing unrelated files to pd.read_csv.
  csv_names = sorted(
      name for name in os.listdir(data) if name.lower().endswith('.csv'))

  lyrics = []
  for csv_name in tqdm(csv_names):
    frame = pd.read_csv(os.path.join(data, csv_name))
    # Rows with a missing value in any column are dropped (unchanged behavior).
    lyrics.extend(frame.dropna()['Lyric'].tolist())
  return lyrics


In [5]:
# Load every lyric found under the dataset directory into one flat list.
lyrics = read_csv(data)

100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


In [6]:
# Number of lyrics that were loaded.
len(lyrics)

63

Data Preprocessing

In [7]:
def clean_lyrics(lyrics):
  """Normalise every lyric in ``lyrics`` in place.

  Each entry is lower-cased, every character that is not a letter, an
  apostrophe or whitespace is replaced by a space, and the text is rebuilt
  from its words of more than one character joined by single spaces.

  Args:
    lyrics: Mutable list of lyric strings; the list is modified in place.

  Returns:
    None.
  """
  import re  # local import so this cell does not depend on the imports cell

  # str.replace() matches its first argument literally, so regex-looking
  # patterns passed to it never match; a compiled regex does the filtering.
  # Whitespace is kept so words are not fused together, and apostrophes are
  # kept so contractions such as "i'm" survive.
  unwanted_chars = re.compile(r"[^a-z'\s]")

  for i in tqdm(range(len(lyrics))):
    lyric = lyrics[i].lower()
    # Substitute a space (not '') so words joined by punctuation stay apart.
    lyric = unwanted_chars.sub(' ', lyric)
    # split() without arguments already collapses runs of whitespace.
    lyrics[i] = ' '.join(word for word in lyric.split() if len(word) > 1)

In [8]:
# Peek at the first lyric before clean_lyrics() is applied.
lyrics[0]

"post malone hahahahaha tank god ayy ayy   post malone i've been fuckin' hoes and poppin' pillies man i feel just like a rockstar star ayy ayy all my brothers got that gas and they always be smokin' like a rasta 'sta fuckin' with me call up on a uzi and show up man them the shottas 'tas when my homies pull up on your block they make that thing go grrratatata ta pow pow pow ayy ayy   post malone switch my whip came back in black i'm startin' sayin' rest in peace to bon scott scott ayy close that door we blowin' smoke she ask me light a fire like i'm morrison 'son ayy act a fool on stage prolly leave my fuckin' show in a cop car car ayy shit was legendary threw a tv out the window of the montage cocaine on the table liquor pourin' don't give a damn dude your girlfriend is a groupie she just tryna get in sayin' i'm with the band ayy ayy now she actin' outta pocket tryna grab up on my pants hundred bitches in my trailer say they ain't got a man and they all brought a friend yeah ayy ayy ay

In [9]:
# Clean the lyrics in place (the list itself is modified; nothing is returned).
clean_lyrics(lyrics)

100%|██████████| 63/63 [00:00<00:00, 8800.41it/s]


In [10]:
# Same lyric again, now after clean_lyrics() has run.
lyrics[0]

"post malone hahahahaha tank god ayy ayy post malone i've been fuckin' hoes and poppin' pillies man feel just like rockstar star ayy ayy all my brothers got that gas and they always be smokin' like rasta 'sta fuckin' with me call up on uzi and show up man them the shottas 'tas when my homies pull up on your block they make that thing go grrratatata ta pow pow pow ayy ayy post malone switch my whip came back in black i'm startin' sayin' rest in peace to bon scott scott ayy close that door we blowin' smoke she ask me light fire like i'm morrison 'son ayy act fool on stage prolly leave my fuckin' show in cop car car ayy shit was legendary threw tv out the window of the montage cocaine on the table liquor pourin' don't give damn dude your girlfriend is groupie she just tryna get in sayin' i'm with the band ayy ayy now she actin' outta pocket tryna grab up on my pants hundred bitches in my trailer say they ain't got man and they all brought friend yeah ayy ayy ayy post malone i've been fuck

In [11]:
from keras.preprocessing.text import Tokenizer

In [12]:
# Fit the word -> integer-id vocabulary on the cleaned lyrics.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)
# One more than the number of distinct words; presumably the extra slot is for
# id 0, which the padding / mask_zero=True embedding below rely on.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

2506

In [13]:
# Longest lyric measured in whitespace-separated words; used below as the
# padded sequence length.
# NOTE(review): this counts str.split() words, not tokenizer ids - confirm the
# two agree for this corpus.
max_len = max(len(lyric.split()) for lyric in lyrics)
max_len

902

In [14]:
# Convert every lyric into its sequence of integer token ids.
tokenized_lyrics = tokenizer.texts_to_sequences(lyrics)

In [15]:
# Expand each tokenised lyric into all of its prefixes of two or more tokens;
# every prefix later becomes one (context, next-word) training example.
input_lyrics = [
    token_ids[:stop]
    for token_ids in tqdm(tokenized_lyrics)
    for stop in range(2, len(token_ids) + 1)
]

100%|██████████| 63/63 [00:00<00:00, 650.13it/s]


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Pad every prefix at the front ('pre') to a common length of max_len so the
# prefixes can be stacked into a single 2-D array.
input_lyrics = np.array(pad_sequences(input_lyrics, maxlen=max_len, padding='pre'))

In [18]:
from keras.utils import to_categorical

In [19]:
# The last token of each padded sequence is the word to predict; everything
# before it is the model input.
X, labels = input_lyrics[:, :-1], input_lyrics[:, -1]
# One-hot encode the targets over the vocabulary for the softmax output layer.
y = to_categorical(labels, num_classes=vocab_size)

Model

In [20]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from keras.callbacks import EarlyStopping
from keras.models import load_model

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> dropout -> softmax
# over the whole vocabulary.
model = Sequential([
  Embedding(vocab_size, 64, input_length=max_len - 1, mask_zero=True),
  Bidirectional(LSTM(250)),
  Dropout(0.2),
  Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 901, 64)           160384    
                                                                 
 bidirectional_6 (Bidirectio  (None, 500)              630000    
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 500)               0         
                                                                 
 dense_6 (Dense)             (None, 2506)              1255506   
                                                                 
Total params: 2,045,890
Trainable params: 2,045,890
Non-trainable params: 0
_________________________________________________________________


Training Process

In [None]:
model.fit(X, y, epochs=2, verbose=1)
# Save the checkpoint into the Models folder, which is where the following
# cells load 'model_e1.h5' from. Writing it to the 'archive' folder instead
# makes a fresh top-to-bottom run fail at load_model().
checkpoint_dir = '/content/drive/MyDrive/ML datasets/Dataset - 2/Models'
os.makedirs(checkpoint_dir, exist_ok=True)
model.save(os.path.join(checkpoint_dir, 'model_e1.h5'))

Epoch 1/2
Epoch 2/2


In [22]:
# Folder on the mounted Drive that the model checkpoints are loaded from and saved to.
model_path = '/content/drive/MyDrive/ML datasets/Dataset - 2/Models'

In [23]:
# Load the 'model_e1.h5' checkpoint from model_path so training can continue from it.
model_e2 = load_model(os.path.join(model_path, 'model_e1.h5'))

In [None]:
# Train the reloaded model for 5 more epochs and store it as a new checkpoint.
model_e2.fit(X, y, epochs=5, verbose=1)
model_e2.save(os.path.join(model_path, 'model_e7.h5'))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Lyrics Generation

In [24]:
# Load the 'model_e7.h5' checkpoint; generate_lyrics() below reads this
# notebook-level `model` when predicting.
model = load_model(os.path.join(model_path, 'model_e7.h5'))

In [55]:
def generate_lyrics(input_lyrics, tokenizer, max_len, next_words=100, predictor=None):
  """Extend a seed text by greedily predicting one word at a time.

  Args:
    input_lyrics: Seed text to continue.
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
      ``word_index``.
    max_len: Sequence length used when building the training data; the model
      input is padded to ``max_len - 1`` tokens.
    next_words: Maximum number of words to append. Defaults to 100, the value
      that used to be fixed inside the function.
    predictor: Object exposing ``predict``. Defaults to the notebook-level
      ``model`` so existing three-argument calls keep working.

  Returns:
    The seed text followed by the generated words, separated by spaces.
  """
  if predictor is None:
    predictor = model  # notebook-level model loaded in an earlier cell

  # Invert the vocabulary once instead of scanning word_index on every step.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  for _ in range(next_words):
    tokenized_input = tokenizer.texts_to_sequences([input_lyrics])[0]
    padded_input = pad_sequences([tokenized_input], maxlen=max_len - 1, padding='pre')
    probabilities = predictor.predict(padded_input, verbose=0)
    # The batch holds a single row, so take the scalar class id rather than
    # comparing against a length-1 array.
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    output_word = index_to_word.get(predicted)
    if output_word is None:
      # No vocabulary word has this id, so the text would not change and every
      # remaining step would repeat the same prediction; stop here instead of
      # appending empty words.
      break
    input_lyrics += ' ' + output_word
  return input_lyrics

In [57]:
# Show the first cleaned lyric again; the seed phrase used below is taken from it.
lyrics[0]

"post malone hahahahaha tank god ayy ayy post malone i've been fuckin' hoes and poppin' pillies man feel just like rockstar star ayy ayy all my brothers got that gas and they always be smokin' like rasta 'sta fuckin' with me call up on uzi and show up man them the shottas 'tas when my homies pull up on your block they make that thing go grrratatata ta pow pow pow ayy ayy post malone switch my whip came back in black i'm startin' sayin' rest in peace to bon scott scott ayy close that door we blowin' smoke she ask me light fire like i'm morrison 'son ayy act fool on stage prolly leave my fuckin' show in cop car car ayy shit was legendary threw tv out the window of the montage cocaine on the table liquor pourin' don't give damn dude your girlfriend is groupie she just tryna get in sayin' i'm with the band ayy ayy now she actin' outta pocket tryna grab up on my pants hundred bitches in my trailer say they ain't got man and they all brought friend yeah ayy ayy ayy post malone i've been fuck

In [58]:
# Continue a short seed phrase with words predicted by the trained model.
print(generate_lyrics("i'm morrison 'son ayy act fool", tokenizer, max_len))

i'm morrison 'son ayy act fool all the time to get the clothe of the top just just wanna go in my belly though like an one keep one one was sucker on the top top top top one and happens and i'm trust at the whip but my bitch is happens like you don't answer it don't listen to see me but you call me call me up in my trunk that ain't be like you got my bed no still still want no listen to me no relief can't even what you want that don't drive away don't drive away don't want no romancin'
