In [1]:
import pandas as pd
from os import listdir
from os.path import join
from tqdm import tqdm
import numpy as np
import tensorflow as tf

Data Read

In [2]:
csv_path = '/Applications/ML projects/Song Lyrics/Dataset - 2/archive/csv'
csv_files = []

In [3]:
for CSV_NAME in tqdm(listdir(csv_path)):
    CSV_PATH = join(csv_path, CSV_NAME)
    csv_files.append(CSV_PATH)

dataframe = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True).dropna()
dataframe[:5]

100%|██████████| 21/21 [00:00<00:00, 54981.51it/s]


Unnamed: 0.1,Unnamed: 0,Artist,Title,Album,Year,Date,Lyric
0,0.0,Dua Lipa,New Rules,Dua Lipa,2017.0,2017-06-02,one one one one one talkin' in my sleep at n...
1,1.0,Dua Lipa,Don’t Start Now,Future Nostalgia,2019.0,2019-11-01,if you don't wanna see me did a full 80 craz...
2,2.0,Dua Lipa,IDGAF,Dua Lipa,2017.0,2017-06-02,you call me all friendly tellin' me how much y...
3,3.0,Dua Lipa,Blow Your Mind (Mwah),Dua Lipa,2016.0,2016-08-26,i know it's hot i know we've got something tha...
4,4.0,Dua Lipa,Be the One,Dua Lipa,2015.0,2015-10-30,i see the moon i see the moon i see the moon o...


In [4]:
lyrics = dataframe['Lyric'].values

In [6]:
all_lyrics = []
for lyric in lyrics:
    all_lyrics.append(lyric)

In [7]:
len(all_lyrics)

3207

In [8]:
max_length = max(len(lyric.split()) for lyric in all_lyrics)
max_length

5768

Data Preprocess

In [5]:
def tokenization(lyrics, max_words):
    hashmap1 = {}
    for lyric in lyrics:
        for word in lyric:
            if word not in hashmap1:
                hashmap1[word] = 1
            else:
                hashmap1[word] += 1

    max_frequency = max(hashmap1.values()) + 1
    ranks = []
    hashmap2 = {}
    for key in hashmap1.keys():
        rank = max_frequency - hashmap1[key]
        hashmap2[key] = rank
        ranks.append(rank)
    ranks = sorted(ranks)
    threshold_rank = ranks[max_words]

    hashmap2Keys = hashmap2.keys()
    for key in hashmap2Keys:
        if hashmap2[key] >= threshold_rank:
            ranks.remove(hashmap2[key])
            hashmap2[key] = -1
    ranks = sorted(ranks)

    tokenizer = {}
    for key in hashmap2.keys():
        if hashmap2[key] != -1:
            rank = ranks.index(hashmap2[key]) + 1
            tokenizer[key] = rank
        
    tokenized_lyrics = []
    for lyric in lyrics:
        temp = []
        for word in lyric:
            if word in tokenizer:
                temp.append(tokenizer[word])
        tokenized_lyrics.append(temp)

    return tokenized_lyrics, tokenizer

In [6]:
from math import pow

def normalization(lyrics, hashmap):
    ranks = hashmap.values()
    n = len(ranks)

    rank_mean = sum(ranks) / n
    rank_sub2 = []
    for rank in ranks:
        rank_sub2.append(pow(rank - rank_mean, 2))
    rank_sub2sum = sum(rank_sub2)
    rank_variance = rank_sub2sum / n
    rank_sd = pow(rank_variance, 0.5)

    normalized_lyrics = []
    for lyric in lyrics:
        temp = []
        for word in lyric:
            rank = word
            n1 = (rank - rank_mean) / rank_sd
            n2 = -1 / (n1 + 1)
            temp.append(n2)
        normalized_lyrics.append(temp)

    return normalized_lyrics, rank_mean, rank_sd, 

In [7]:
def preprocess(lyrics, max_words, strip):
    split_lyrics = []

    for lyric in lyrics:
        split_lyrics.append(lyric.split(' '))

    tokenized_lyrics, tokenizer = tokenization(split_lyrics, max_words)
    normalized_lyrics, rank_mean, rank_sd = normalization(tokenized_lyrics, tokenizer)
    
    normalized_lyrics = sorted(normalized_lyrics, key=len)
    length = len(normalized_lyrics)
    lower_strip = int(strip * length)
    higher_strip = length - lower_strip
    LYRICS = normalized_lyrics[lower_strip: higher_strip]

    return LYRICS, rank_mean, rank_sd, tokenizer


In [8]:
LYRICS, MEAN, STANDARD_DEVIATION, TOKENIZER = preprocess(lyrics, 5000, 0.2)

Data Split

In [36]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def prepare_data(LYRICS, input_length, output_length):
    X = []
    y = []

    for lyric in LYRICS:
        X.append(lyric[:input_length])
        y.append(lyric[input_length: input_length + output_length])

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    return x_train, x_test, y_train, y_test

In [37]:
x_train, x_test, y_train, y_test = prepare_data(LYRICS, 150, 50)

Model

In [38]:
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

In [63]:
def RNN_MODEL(rnn_units, time_steps, features, output_steps):
    model = Sequential()
    model.add(LSTM(rnn_units, input_shape=(time_steps, features)))
    model.add(Dense(output_steps, activation='tanh'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [64]:
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [65]:
time_steps = x_train.shape[1]
features = x_train.shape[2]
output_steps = y_train.shape[1]
rnn_units = 256

In [66]:
model = RNN_MODEL(rnn_units, time_steps, features, output_steps)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 256)               264192    
                                                                 
 dense_5 (Dense)             (None, 50)                12850     
                                                                 
Total params: 277,042
Trainable params: 277,042
Non-trainable params: 0
_________________________________________________________________


Training

In [67]:
EPOCHS = 20
BATCH_SIZE = 32

In [68]:
model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks_list)

Epoch 1/20
Epoch 1: loss improved from inf to 378.10962, saving model to weights-improvement-01-378.1096.hdf5
Epoch 2/20
Epoch 2: loss improved from 378.10962 to 321.94189, saving model to weights-improvement-02-321.9419.hdf5
Epoch 3/20
Epoch 3: loss improved from 321.94189 to 317.43234, saving model to weights-improvement-03-317.4323.hdf5
Epoch 4/20
Epoch 4: loss did not improve from 317.43234
Epoch 5/20
Epoch 5: loss did not improve from 317.43234
Epoch 6/20
Epoch 6: loss did not improve from 317.43234
Epoch 7/20
Epoch 7: loss did not improve from 317.43234
Epoch 8/20
Epoch 8: loss did not improve from 317.43234
Epoch 9/20
Epoch 9: loss did not improve from 317.43234
Epoch 10/20
Epoch 10: loss did not improve from 317.43234
Epoch 11/20
Epoch 11: loss did not improve from 317.43234
Epoch 12/20
Epoch 12: loss did not improve from 317.43234
Epoch 13/20
Epoch 13: loss did not improve from 317.43234
Epoch 14/20
Epoch 14: loss did not improve from 317.43234
Epoch 15/20
Epoch 15: loss did n

<keras.callbacks.History at 0x148253550>

In [69]:
model.predict(x_test[0])



array([[0.07945235, 0.00763285, 0.07255211, ..., 0.10205323, 0.06520914,
        0.04087023],
       [0.05987512, 0.00835881, 0.05648387, ..., 0.07536648, 0.05511349,
        0.03340356],
       [0.08481648, 0.00743614, 0.07697365, ..., 0.1093156 , 0.0679851 ,
        0.04301887],
       ...,
       [0.05987512, 0.00835881, 0.05648387, ..., 0.07536648, 0.05511349,
        0.03340356],
       [0.06125525, 0.00830721, 0.05761297, ..., 0.07725696, 0.05582319,
        0.03391042],
       [0.05987512, 0.00835881, 0.05648387, ..., 0.07536648, 0.05511349,
        0.03340356]], dtype=float32)

In [70]:
y_test[0]

array([ 1.37312847,  1.33498754,  1.36773859,  1.33626744,  1.3375498 ,
        1.3375498 ,  1.34012191, -2.13456708,  1.72916934,  1.34270392,
        1.35973264,  1.98815926,  1.35973264,  1.98815926,  1.37042823,
        1.37042823,  1.3375498 ,  1.3375498 ,  1.33626744,  1.36505949,
        1.35051003,  1.33498754,  1.77768936,  1.36908209,  1.33626744,
        1.43534788,  1.56246353,  1.35973264,  1.47953391,  1.36908209,
        1.33626744,  1.33498754,  2.63454172,  1.43534788,  1.37177702,
        1.59829813,  1.42945953,  1.45939454,  1.37042823,  1.33498754,
        1.36505949,  1.35051003,  1.33626744,  1.40922537,  1.34141167,
        1.46399327,  1.39094277,  1.36639773,  1.39791812,  1.42653343])