In [16]:
import keras
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
import regex as re
import numpy as np

from keras.utils import np_utils


In [3]:
df = pd.read_csv('../raw_data/pink_floyd_lyrics.csv')

In [4]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts([df.lyrics.iloc[1].replace('\n',' \n ')])
tokenizer.word_index
    

{'side': 1,
 'that': 2,
 'cat’s': 3,
 'something': 4,
 'i': 5,
 'can’t': 6,
 'explain': 7,
 'cat': 8,
 'you’re': 9,
 'a': 10,
 'the': 11,
 'be': 12,
 'lucifer': 13,
 'always': 14,
 'by': 15,
 'your': 16,
 'around': 17,
 'sam': 18,
 'siam': 19,
 'sitting': 20,
 'jennifer': 21,
 'gentle': 22,
 'witch': 23,
 'left': 24,
 'he’s': 25,
 'right': 26,
 'oh': 27,
 'no': 28,
 'go': 29,
 'to': 30,
 'sea': 31,
 'hip': 32,
 'ship’s': 33,
 'somewhere': 34,
 'anywhere': 35,
 'at': 36,
 'night': 37,
 'prowling': 38,
 'sifting': 39,
 'sand': 40,
 'hiding': 41,
 'on': 42,
 'ground': 43,
 'he’ll': 44,
 'found': 45,
 'when': 46}

In [5]:
text = df.lyrics.iloc[1].split('\n')
text = [re.sub(r'\d+', '', i) for i in text]
corpus = list(set(text))

In [6]:
corpus

['Lucifer Sam, siam cat',
 '',
 'You’re the left side, he’s the right side',
 'Always sitting by your side',
 'Always by your side',
 'Jennifer Gentle, you’re a witch',
 'Oh, no!',
 'Lucifer, go to sea',
 'That cat’s something I can’t explain',
 'Hiding around on the ground',
 'Be a hip cat, be a ship’s cat',
 'Somewhere, anywhere',
 'At night prowling, sifting sand',
 'He’ll be found when you’re around']

In [7]:
lines=[]

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print('--'*25)
    print(token_list)
    print('--'*25)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        print(n_gram_sequence)

--------------------------------------------------
[13, 18, 19, 8]
--------------------------------------------------
[13, 18]
[13, 18, 19]
[13, 18, 19, 8]
--------------------------------------------------
[]
--------------------------------------------------
--------------------------------------------------
[9, 11, 24, 1, 25, 11, 26, 1]
--------------------------------------------------
[9, 11]
[9, 11, 24]
[9, 11, 24, 1]
[9, 11, 24, 1, 25]
[9, 11, 24, 1, 25, 11]
[9, 11, 24, 1, 25, 11, 26]
[9, 11, 24, 1, 25, 11, 26, 1]
--------------------------------------------------
[14, 20, 15, 16, 1]
--------------------------------------------------
[14, 20]
[14, 20, 15]
[14, 20, 15, 16]
[14, 20, 15, 16, 1]
--------------------------------------------------
[14, 15, 16, 1]
--------------------------------------------------
[14, 15]
[14, 15, 16]
[14, 15, 16, 1]
--------------------------------------------------
[21, 22, 9, 10, 23]
--------------------------------------------------
[21, 22]
[21, 

In [8]:
def ngram(token_list):
  ng = []
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    ng.append(n_gram_sequence)
  return ng

In [9]:
df.iloc[[0]].lyrics.iloc[0]

'"Moon in both [houses]..."...Scorpio, [Arabian Skies], Libra..."...Pluto was not discovered until 1930..."\nLime and limpid green, a second scene\nA fight between the blue you once knew\nFloating down, the sound resounds\nAround the icy waters underground\nJupiter and Saturn, Oberon, Miranda and Titania\nNeptune, Titan, stars can frighten\n\nBlinding signs flap\nFlicker, flicker, flicker, blam\nPow, pow\nStairway scare Dan Dare who’s there?\n\nLime and limpid green, the sound surrounds\nThe icy waters under\nLime and limpid green, the sound surrounds\nThe icy waters underground'

In [11]:
def seqform(data):
  tokenise = Tokenizer()
  input_sequences = []
  corpus = []
  k=0
  for i in range(0,len(df)):
      text = df.iloc[[i]].lyrics.iloc[0]
      if type(text)==float:
          pass
      else:
          text = text.lower().split("\n")
          text = [re.sub(r'\d+', '', i) for i in text]
          text = list(set(text))
          if text==' ':
              pass
          else:
              corpus.extend(text)
              k+=1
  tokenise.fit_on_texts(corpus)
  for line in corpus:
      token_list = tokenise.texts_to_sequences([line])[0]
      input_sequences.extend(ngram(token_list))
 
  
  max_sequence_len = max([len(x) for x in input_sequences])
  input_sequences = np.array(pad_sequences(input_sequences,
                       maxlen = max_sequence_len, padding='pre'))
  
  predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
  fin_data = pd.DataFrame(np.hstack((predictors, label.reshape(-1,1))),columns=np.hstack((np.arange(1,predictors.shape[1]+1),np.array(['label']))))
  total_words = len(tokenise.word_index) + 1
  print('{} number of lyrics inputted'.format(k))

  return fin_data,tokenise,max_sequence_len,total_words,predictors,label

In [12]:
fdf,tokenise,max_sequence_len,total_words,predictors,label = seqform(df)
print(fdf.shape,max_sequence_len,total_words)

125 number of lyrics inputted
(13839, 88) 88 2983


In [13]:
fdf .to_csv('fin_df.csv',index=False)

In [14]:
dataX = [fdf.iloc[i,0:87].tolist() for i in range(0,fdf.shape[0])]
dataY = [fdf.iloc[i,87] for i in range(0,fdf.shape[0])]
print(len(dataX))
print(len(dataY))

13839
13839


In [17]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (13839, 87, 1))

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [18]:
model = Sequential()
model.add(Embedding(total_words, 150, input_length=max_sequence_len-1))
# Add an LSTM Layer
model.add(Bidirectional(LSTM(150, return_sequences=True)))  
# A dropout layer for regularisation
model.add(Dropout(0.2))
# Add another LSTM Layer
model.add(LSTM(100)) 
model.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model.add(Dense(y.shape[1], activation='softmax'))
#model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  #(# Pick a loss function and an optimizer)
print(model.summary())

2023-02-02 16:05:43.698388: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 87, 150)           447450    
                                                                 
 bidirectional (Bidirectiona  (None, 87, 300)          361200    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 87, 300)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1491)              150591    
                                                                 
 dense_1 (Dense)             (None, 2983)              4450636   
                                                        