In [1]:
import pandas as pd
from tqdm import tqdm
import numpy

In [2]:
languages = ['en']

In [3]:
lyrics_data = pd.read_csv('archive/lyrics-data.csv')
lyrics_data = lyrics_data.dropna()
lyrics_data = lyrics_data[lyrics_data.language.isin(languages)]
lyrics_data

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [4]:
lyrics = lyrics_data['Lyric'].values
language = lyrics_data['language'].values

In [5]:
def tokenization(lyrics):
    di = {}
    count = 1
    for lyric in tqdm(lyrics):
      for word in lyric:
        if word not in di:
          di[word] = count
          count += 1

    returnArray = []
    for lyric in tqdm(lyrics):
      temp = []
      for word in lyric:
        temp.append(di[word])
      returnArray.append(temp)

    return returnArray, count

def preprocess(lyrics):
    returnArray = []

    for lyric in tqdm(lyrics):
        returnArray.append(lyric.split(' '))

    return tokenization(returnArray)

In [6]:
tok_lyrics, input_words = preprocess(lyrics)

100%|██████████| 191812/191812 [00:05<00:00, 36237.48it/s]
100%|██████████| 191812/191812 [00:06<00:00, 30406.54it/s]
100%|██████████| 191812/191812 [00:10<00:00, 18555.61it/s]


In [7]:
max_review_length = 0

for tok_lyric in tqdm(tok_lyrics):
    if len(tok_lyric) > max_review_length:
        max_review_length = len(tok_lyric)

100%|██████████| 191812/191812 [00:00<00:00, 2462128.86it/s]


In [8]:
def padList(lyrics, max_review_length):
    for i in tqdm(range(len(lyrics))):
        diff = max_review_length - len(lyrics[i])
        lyrics[i].extend([0] * diff)

In [9]:
import copy

padTok_lyrics = copy.deepcopy(tok_lyrics)
padList(padTok_lyrics, max_review_length)
padTok_lyrics = numpy.array(padTok_lyrics).astype('int32')

100%|██████████| 191812/191812 [00:03<00:00, 52806.11it/s]


In [10]:
n_features = 1
padTok_lyrics = padTok_lyrics.reshape((padTok_lyrics.shape[0], padTok_lyrics.shape[1], n_features))

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_language = le.fit_transform(language)

In [12]:
shuffle = numpy.arange(padTok_lyrics.shape[0])
numpy.random.shuffle(shuffle)
padTok_lyrics = padTok_lyrics[shuffle]
encoded_language = encoded_language[shuffle]

In [13]:
def preprocessLyrics(lyrics, n_features = 1):
    sourceLyrics = []
    targetLyrics = []

    for lyric in tqdm(lyrics):
        length = len(lyric)
        split = length // 2
        sourceLyrics.append(lyric[:split])
        targetLyrics.append(lyric[split:])

    source_maxLength = 0
    target_maxLength = 0

    for lyric in tqdm(sourceLyrics):
        if len(lyric) > source_maxLength:
            source_maxLength = len(lyric)

    for lyric in tqdm(targetLyrics):
        if len(lyric) > target_maxLength:
            target_maxLength = len(lyric)   

    padList(sourceLyrics, source_maxLength)
    padList(targetLyrics, target_maxLength)

    sourceLyrics = numpy.array(sourceLyrics).astype('int32')
    targetLyrics = numpy.array(targetLyrics).astype('int32')

    sourceLyrics = sourceLyrics.reshape((sourceLyrics.shape[0], sourceLyrics.shape[1], n_features))
    targetLyrics = targetLyrics.reshape((targetLyrics.shape[0], targetLyrics.shape[1], n_features))

    return sourceLyrics, source_maxLength, targetLyrics, target_maxLength

In [14]:
sourceLyrics, source_maxLength, targetLyrics, target_maxLength = preprocessLyrics(tok_lyrics)

100%|██████████| 191812/191812 [00:04<00:00, 45218.05it/s]
100%|██████████| 191812/191812 [00:00<00:00, 2862187.02it/s]
100%|██████████| 191812/191812 [00:00<00:00, 3086523.71it/s]
100%|██████████| 191812/191812 [00:01<00:00, 100875.48it/s]
100%|██████████| 191812/191812 [00:02<00:00, 92735.93it/s]


In [15]:
from sklearn.model_selection import train_test_split

split = len(sourceLyrics) // 2
trly, tely, trla, tela = train_test_split(padTok_lyrics, encoded_language, test_size=0.50)
strlys, stelys = sourceLyrics[:split], sourceLyrics[split:]
ttrlys, ttelys = targetLyrics[:split], targetLyrics[split:]

In [16]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [17]:
def lanClassModel(input_words, max_review_length, embedding_vector_length=32):
  model = Sequential()
  model.add(Embedding(input_words + 1, embedding_vector_length, input_length=max_review_length))
  model.add(LSTM(100, activation='relu', input_shape=(None, max_review_length, embedding_vector_length)))
  model.add(Dense(1, activation='relu'))
  model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
  print(model.summary())

  return model

In [18]:
lanModel = lanClassModel(input_words, max_review_length)

2023-02-14 11:00:44.784774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2986, 32)          93017952  
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 93,071,253
Trainable params: 93,071,253
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
lanModel.fit(trly, trla, epochs=1, batch_size=32, verbose=1)

 145/3172 [>.............................] - ETA: 2:09:21 - loss: 0.0741 - accuracy: 0.9450

KeyboardInterrupt: 

In [80]:
lanModel.save('lanModel.h5')

In [31]:
from sklearn.metrics import accuracy_score

prediction = lanModel.predict(tely)
prediction = numpy.round_(prediction)
lanAccuracy = accuracy_score(tela, prediction)
lanAccuracy



0.9449136257477064

In [59]:
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Input
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy


In [63]:
def lyClassModel(n_step_in, n_step_out, n_features = 1):
    model = Sequential()
    model.add(Input(shape=(n_step_in, 1)))
    model.add(LSTM(64, return_sequences=False))
    model.add(RepeatVector(n_step_out))
    model.add(LSTM(64, return_sequences=True, dropout=0.2))
    model.add(TimeDistributed(Dense(1)))
    model.compile(optimizer=Adam(1e-3), loss='mse', metrics=['accuracy'])
    print(model.summary())

    return model

In [64]:
lyModel_en = lyClassModel(source_maxLength, target_maxLength, n_features)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_21 (LSTM)              (None, 64)                16896     
                                                                 
 repeat_vector_10 (RepeatVec  (None, 1493, 64)         0         
 tor)                                                            
                                                                 
 lstm_22 (LSTM)              (None, 1493, 64)          33024     
                                                                 
 time_distributed_9 (TimeDis  (None, 1493, 1)          65        
 tributed)                                                       
                                                                 
Total params: 49,985
Trainable params: 49,985
Non-trainable params: 0
_________________________________________________________________
None


In [65]:
lyModel_en.fit(strlys, ttrlys, epochs=10, verbose=1)

Epoch 1/10
 224/2998 [=>............................] - ETA: 39:44 - loss: 7589249024.0000 - accuracy: 0.0389

KeyboardInterrupt: 

In [66]:
prediction = lyModel_en.predict(stelys[:2])



In [67]:
prediction[1]

array([[17.554838],
       [22.73543 ],
       [23.555368],
       ...,
       [23.699305],
       [23.699305],
       [23.699305]], dtype=float32)

In [68]:
strlys[1]

array([[113],
       [114],
       [115],
       ...,
       [  0],
       [  0],
       [  0]], dtype=int32)