In [1]:
import pandas as pd
from os import listdir
from os.path import join
from tqdm import tqdm
import numpy as np

In [2]:
csv_path = '/Applications/ML projects/Song Lyrics/Dataset - 2/archive/csv'
csv_files = []

In [3]:
for CSV_NAME in tqdm(listdir(csv_path)):
    CSV_PATH = join(csv_path, CSV_NAME)
    csv_files.append(CSV_PATH)

dataframe = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True).dropna()
dataframe[:5]

100%|██████████| 21/21 [00:00<00:00, 112347.43it/s]


Unnamed: 0.1,Unnamed: 0,Artist,Title,Album,Year,Date,Lyric
0,0.0,Dua Lipa,New Rules,Dua Lipa,2017.0,2017-06-02,one one one one one talkin' in my sleep at n...
1,1.0,Dua Lipa,Don’t Start Now,Future Nostalgia,2019.0,2019-11-01,if you don't wanna see me did a full 80 craz...
2,2.0,Dua Lipa,IDGAF,Dua Lipa,2017.0,2017-06-02,you call me all friendly tellin' me how much y...
3,3.0,Dua Lipa,Blow Your Mind (Mwah),Dua Lipa,2016.0,2016-08-26,i know it's hot i know we've got something tha...
4,4.0,Dua Lipa,Be the One,Dua Lipa,2015.0,2015-10-30,i see the moon i see the moon i see the moon o...


In [4]:
lyrics = dataframe['Lyric'].values

In [5]:
def tokenization(lyrics, max_words):
    hashmap1 = {}
    for lyric in lyrics:
        for word in lyric:
            if word not in hashmap1:
                hashmap1[word] = 1
            else:
                hashmap1[word] += 1

    max_frequency = max(hashmap1.values()) + 1
    ranks = []
    hashmap2 = {}
    for key in hashmap1.keys():
        rank = max_frequency - hashmap1[key]
        hashmap2[key] = rank
        ranks.append(rank)
    ranks = sorted(ranks)
    threshold_rank = ranks[max_words]

    hashmap2Keys = hashmap2.keys()
    for key in hashmap2Keys:
        if hashmap2[key] >= threshold_rank:
            ranks.remove(hashmap2[key])
            hashmap2[key] = -1
    ranks = sorted(ranks)

    tokenizer = {}
    for key in hashmap2.keys():
        if hashmap2[key] != -1:
            rank = ranks.index(hashmap2[key]) + 1
            tokenizer[key] = rank
        
    tokenized_lyrics = []
    for lyric in lyrics:
        temp = []
        for word in lyric:
            if word in tokenizer:
                temp.append(tokenizer[word])
        tokenized_lyrics.append(temp)

    return tokenized_lyrics, tokenizer

In [11]:
def normalization(tokenized_lyrics, tokenizer):
    max_rank = float(max(tokenizer.values()))
    for i in range(len(tokenized_lyrics)):
        for j in range(len(tokenized_lyrics[i])):
            tokenized_lyrics[i][j] = tokenized_lyrics[i][j] / max_rank

In [12]:
def preprocess(lyrics, max_words, strip):
    split_lyrics = []

    for lyric in lyrics:
        split_lyrics.append(lyric.split(' '))

    tokenized_lyrics, tokenizer = tokenization(split_lyrics, max_words)
    normalization(tokenized_lyrics, tokenizer)
    
    normalized_lyrics = sorted(tokenized_lyrics, key=len)
    length = len(normalized_lyrics)
    lower_strip = int(strip * length)
    higher_strip = length - lower_strip
    LYRICS = normalized_lyrics[lower_strip: higher_strip]

    return LYRICS, tokenizer

In [13]:
LYRICS, TOKENIZER = preprocess(lyrics, 5000, 0.2)

In [15]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def prepare_data(LYRICS, input_length, output_length):
    X = []
    y = []

    for lyric in LYRICS:
        X.append(lyric[:input_length])
        y.append(lyric[input_length: input_length + output_length])

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    return x_train, x_test, y_train, y_test

In [16]:
x_train, x_test, y_train, y_test = prepare_data(LYRICS, 150, 50)

In [17]:
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

In [39]:
def RNN_MODEL(rnn_units, time_steps, features, output_steps):
    model = Sequential()
    model.add(LSTM(rnn_units, input_shape=(time_steps, features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(rnn_units))
    model.add(Dense(output_steps, activation='relu'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [40]:
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [41]:
time_steps = x_train.shape[1]
features = x_train.shape[2]
output_steps = y_train.shape[1]
rnn_units = 256

In [42]:
model = RNN_MODEL(rnn_units, time_steps, features, output_steps)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 150, 256)          264192    
                                                                 
 dropout_2 (Dropout)         (None, 150, 256)          0         
                                                                 
 lstm_4 (LSTM)               (None, 256)               525312    
                                                                 
 dense_1 (Dense)             (None, 50)                12850     
                                                                 
Total params: 802,354
Trainable params: 802,354
Non-trainable params: 0
_________________________________________________________________


In [43]:
EPOCHS = 20
BATCH_SIZE = 32