In [1]:
import keras
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
import regex as re
import numpy as np

from keras.utils import np_utils
import pickle


In [6]:
df = pd.read_csv('../raw_data/pink_floyd_lyrics.csv')

In [7]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts([df.lyrics.iloc[1].replace('\n',' \n ')])
tokenizer.word_index
    

{'side': 1,
 'that': 2,
 'cat’s': 3,
 'something': 4,
 'i': 5,
 'can’t': 6,
 'explain': 7,
 'cat': 8,
 'you’re': 9,
 'a': 10,
 'the': 11,
 'be': 12,
 'lucifer': 13,
 'always': 14,
 'by': 15,
 'your': 16,
 'around': 17,
 'sam': 18,
 'siam': 19,
 'sitting': 20,
 'jennifer': 21,
 'gentle': 22,
 'witch': 23,
 'left': 24,
 'he’s': 25,
 'right': 26,
 'oh': 27,
 'no': 28,
 'go': 29,
 'to': 30,
 'sea': 31,
 'hip': 32,
 'ship’s': 33,
 'somewhere': 34,
 'anywhere': 35,
 'at': 36,
 'night': 37,
 'prowling': 38,
 'sifting': 39,
 'sand': 40,
 'hiding': 41,
 'on': 42,
 'ground': 43,
 'he’ll': 44,
 'found': 45,
 'when': 46}

In [8]:
text = df.lyrics.iloc[1].split('\n')
text = [re.sub(r'\d+', '', i) for i in text]
corpus = list(set(text))

In [9]:
corpus

['',
 'Lucifer, go to sea',
 'Hiding around on the ground',
 'Somewhere, anywhere',
 'Be a hip cat, be a ship’s cat',
 'At night prowling, sifting sand',
 'He’ll be found when you’re around',
 'You’re the left side, he’s the right side',
 'Lucifer Sam, siam cat',
 'Always sitting by your side',
 'Oh, no!',
 'Jennifer Gentle, you’re a witch',
 'That cat’s something I can’t explain',
 'Always by your side']

In [10]:
lines=[]

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print('--'*25)
    print(token_list)
    print('--'*25)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        print(n_gram_sequence)

--------------------------------------------------
[]
--------------------------------------------------
--------------------------------------------------
[13, 29, 30, 31]
--------------------------------------------------
[13, 29]
[13, 29, 30]
[13, 29, 30, 31]
--------------------------------------------------
[41, 17, 42, 11, 43]
--------------------------------------------------
[41, 17]
[41, 17, 42]
[41, 17, 42, 11]
[41, 17, 42, 11, 43]
--------------------------------------------------
[34, 35]
--------------------------------------------------
[34, 35]
--------------------------------------------------
[12, 10, 32, 8, 12, 10, 33, 8]
--------------------------------------------------
[12, 10]
[12, 10, 32]
[12, 10, 32, 8]
[12, 10, 32, 8, 12]
[12, 10, 32, 8, 12, 10]
[12, 10, 32, 8, 12, 10, 33]
[12, 10, 32, 8, 12, 10, 33, 8]
--------------------------------------------------
[36, 37, 38, 39, 40]
--------------------------------------------------
[36, 37]
[36, 37, 38]
[36, 37, 38, 39

In [11]:
def ngram(token_list):
  ng = []
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    ng.append(n_gram_sequence)
  return ng

In [12]:
df.iloc[[0]].lyrics.iloc[0]

'"Moon in both [houses]..."...Scorpio, [Arabian Skies], Libra..."...Pluto was not discovered until 1930..."\nLime and limpid green, a second scene\nA fight between the blue you once knew\nFloating down, the sound resounds\nAround the icy waters underground\nJupiter and Saturn, Oberon, Miranda and Titania\nNeptune, Titan, stars can frighten\n\nBlinding signs flap\nFlicker, flicker, flicker, blam\nPow, pow\nStairway scare Dan Dare who’s there?\n\nLime and limpid green, the sound surrounds\nThe icy waters under\nLime and limpid green, the sound surrounds\nThe icy waters underground'

In [13]:
def seqform(data):
  tokenise = Tokenizer()
  input_sequences = []
  corpus = []
  k=0
  for i in range(0,len(df)):
      text = df.iloc[[i]].lyrics.iloc[0]
      if type(text)==float:
          pass
      else:
          text = text.lower().split("\n")
          text = [re.sub(r'\d+', '', i) for i in text]
          text = list(set(text))
          if text==' ':
              pass
          else:
              corpus.extend(text)
              k+=1
  tokenise.fit_on_texts(corpus)
  for line in corpus:
      token_list = tokenise.texts_to_sequences([line])[0]
      input_sequences.extend(ngram(token_list))
 
  
  max_sequence_len = max([len(x) for x in input_sequences])
  input_sequences = np.array(pad_sequences(input_sequences,
                       maxlen = max_sequence_len, padding='pre'))
  
  predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
  fin_data = pd.DataFrame(np.hstack((predictors, label.reshape(-1,1))),columns=np.hstack((np.arange(1,predictors.shape[1]+1),np.array(['label']))))
  total_words = len(tokenise.word_index) + 1
  print('{} number of lyrics inputted'.format(k))

  return fin_data,tokenise,max_sequence_len,total_words,predictors,label

In [14]:
fdf,tokenise,max_sequence_len,total_words,predictors,label = seqform(df)
print(fdf.shape,max_sequence_len,total_words)

125 number of lyrics inputted
(13839, 88) 88 2983


In [15]:
fdf .to_csv('fin_df.csv',index=False)

In [16]:
dataX = [fdf.iloc[i,0:87].tolist() for i in range(0,fdf.shape[0])]
dataY = [fdf.iloc[i,87] for i in range(0,fdf.shape[0])]
print(len(dataX))
print(len(dataY))

13839
13839


In [17]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (13839, 87, 1))

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [18]:
model = Sequential()
model.add(Embedding(total_words, 150, input_length=max_sequence_len-1))
# Add an LSTM Layer
model.add(Bidirectional(LSTM(150, return_sequences=True)))  
# A dropout layer for regularisation
model.add(Dropout(0.2))
# Add another LSTM Layer
model.add(LSTM(100)) 
model.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model.add(Dense(y.shape[1], activation='softmax'))
#model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  #(# Pick a loss function and an optimizer)
print(model.summary())

2023-02-04 11:48:08.074155: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 87, 150)           447450    
                                                                 
 bidirectional (Bidirectiona  (None, 87, 300)          361200    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 87, 300)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1491)              150591    
                                                                 
 dense_1 (Dense)             (None, 2983)              4450636   
                                                        

In [19]:
# define the checkpoint
#filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

In [20]:
#model.fit(X,y, epochs= 60,callbacks=callbacks_list)

In [21]:
#filename = "weights-improvement-60-0.7534.hdf5"
#model.load_weights(filename)
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics='accuracy')
#model.fit(X, y, epochs=20, callbacks=callbacks_list)

In [22]:
#model.save_weights('my_model_weights.h5')
#model.save('my_model.h5')

In [3]:
def make_lyrics(seed_text, next_words):
    pred_index=[]
    for i in range(next_words):
        token_list = tokenise.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        #print(token_list.shape)
        token_list = np.reshape(token_list, (1, max_sequence_len-1, 1))
        predicted = model.predict(token_list, verbose=0)
        predicted_index =  np.argmax(predicted)
        pred_index.append(predicted_index)
        


        #predicted_index=1
        output_word = ""
        for word, index in tokenise.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    #print(seed_text)
    return seed_text

In [24]:
reverse_word_map = dict(map(reversed, tokenise.word_index.items()))
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
pattern_val = [i for i in pattern if i>0]
print("Seed:")
print(' '.join([reverse_word_map.get(value) for value in pattern_val]))
seed_text = [reverse_word_map.get(value)+' ' for value in pattern_val]

Seed:
remember how she


In [4]:
make_lyrics('I',40)

NameError: name 'tokenise' is not defined

In [26]:
#import pickle

# saving
#with open('tokenizer.pickle', 'wb') as handle:
    #pickle.dump(tokenise, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
model = keras.models.load_model('../my_model.h5')

2023-02-06 13:47:08.873548: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [19]:
max_sequence_len = 88

In [20]:
def make_lyrics(seed_text, next_words):
    pred_index=[]
    for i in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        #print(token_list.shape)
        token_list = np.reshape(token_list, (1, max_sequence_len-1, 1))
        predicted = model.predict(token_list, verbose=0)
        predicted_index =  np.argmax(predicted)
        pred_index.append(predicted_index)
        


        #predicted_index=1
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    #print(seed_text)
    return seed_text

In [21]:
make_lyrics('I',40)

'I don’t know i was really drunk at the time is gone the song is over thought i’d something more had it doon by the haim ‘ma place well i slapped me and i slapped it doon in the side and'