In [31]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from keras.utils import to_categorical

# Read data set
We have a data set containing song lyrics. We display the first 50 chars of such a dataset.

In [32]:
data = open('Zayn_Lyrics.txt').read()

In [33]:
data[:50]

"Now I'm on the edge\nI can't find my way\nIt's insid"

# Generate corpus
We work with lower case data. Each line of the song lyric is an entry for the corpus. We save it into a set to avoid duplicates.

In [34]:
# Splitting the string into sentences, while converting whole data into lowercase.
corpus = data.lower().split("\n")
# To make sure no sentence appears twice in our corpus, we use set. Otherwise, it will make the model biased.
corpus = list(set(corpus))
corpus

['',
 'climb on board',
 'taste your sweet profanity',
 'my dreams, new seeds with enemies',
 "i'm just wishing it's ambition that got you",
 "i'm sad about shit that never happened",
 'so take it off',
 "don't know how many times",
 "don't try, light it up",
 "'cause i have no time to help you find",
 'you get off on me',
 'seeing the pain inside in this house of fear',
 "i'm seeing in the blacks and grays",
 'i wanna shed light',
 'she is the life of the party',
 'i know sometimes i hide it',
 'a lack of sanity, losing touch with reality',
 "you're caught between a dream",
 'change the scene',
 'pillow talk',
 "bright lights, but she's fading",
 "but it's gotta be the right time",
 'i go out my way to treat you',
 'in the place that feels the tears',
 "but i can't this time 'cause it's gonna defeat me",
 "baby, i'm a pleaser",
 "think you're doing a good job",
 "we're blazing on that new found haze",
 'not like this, not like this',
 "and i ain't wrong, i don't breathe the pollution"

We then separete each work and we tokenize the corpus using the Tokenizer library from keras. We also save a variable containing the total number of words.

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
tokenizer.word_index

{'i': 1,
 'the': 2,
 'you': 3,
 'it': 4,
 'to': 5,
 'and': 6,
 'a': 7,
 "don't": 8,
 'in': 9,
 'my': 10,
 'all': 11,
 'me': 12,
 "i'm": 13,
 "it's": 14,
 'this': 15,
 'that': 16,
 'take': 17,
 'she': 18,
 "can't": 19,
 'on': 20,
 'off': 21,
 'like': 22,
 'just': 23,
 'so': 24,
 'of': 25,
 'right': 26,
 'but': 27,
 'for': 28,
 'what': 29,
 "won't": 30,
 'your': 31,
 "you're": 32,
 'know': 33,
 'time': 34,
 "'cause": 35,
 'wanna': 36,
 'be': 37,
 'say': 38,
 'when': 39,
 'no': 40,
 'is': 41,
 'way': 42,
 'see': 43,
 'love': 44,
 'up': 45,
 'got': 46,
 'about': 47,
 'things': 48,
 'baby': 49,
 'her': 50,
 'with': 51,
 "she's": 52,
 'out': 53,
 'good': 54,
 'now': 55,
 "i'll": 56,
 'zone': 57,
 'give': 58,
 'never': 59,
 'try': 60,
 'light': 61,
 'seeing': 62,
 'go': 63,
 'place': 64,
 'not': 65,
 'do': 66,
 'need': 67,
 'want': 68,
 'had': 69,
 'at': 70,
 'war': 71,
 "you've": 72,
 'paradise': 73,
 'hold': 74,
 'shit': 75,
 'get': 76,
 'life': 77,
 "we're": 78,
 "ain't": 79,
 'mind': 80,


We then create what is going to be our input sequences. We take the first line of the corpus _"climb on board"_ and we generate two new lines from it: _"climb on"_ and _"climb on board"_. These two line are represented using their token. We repeate this procedure for all input lines.

In [None]:
# create input sequences using list of tokens
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
print(tokenizer.word_index["climb"])
print(tokenizer.word_index["on"])
print(tokenizer.word_index["board"])
input_sequences

205
20
206


[[205, 20],
 [205, 20, 206],
 [136, 31],
 [136, 31, 207],
 [136, 31, 207, 208],
 [10, 209],
 [10, 209, 97],
 [10, 209, 97, 210],
 [10, 209, 97, 210, 51],
 [10, 209, 97, 210, 51, 211],
 [13, 23],
 [13, 23, 212],
 [13, 23, 212, 14],
 [13, 23, 212, 14, 213],
 [13, 23, 212, 14, 213, 16],
 [13, 23, 212, 14, 213, 16, 46],
 [13, 23, 212, 14, 213, 16, 46, 3],
 [13, 214],
 [13, 214, 47],
 [13, 214, 47, 75],
 [13, 214, 47, 75, 16],
 [13, 214, 47, 75, 16, 59],
 [13, 214, 47, 75, 16, 59, 215],
 [24, 17],
 [24, 17, 4],
 [24, 17, 4, 21],
 [8, 33],
 [8, 33, 137],
 [8, 33, 137, 138],
 [8, 33, 137, 138, 216],
 [8, 60],
 [8, 60, 61],
 [8, 60, 61, 4],
 [8, 60, 61, 4, 45],
 [35, 1],
 [35, 1, 98],
 [35, 1, 98, 40],
 [35, 1, 98, 40, 34],
 [35, 1, 98, 40, 34, 5],
 [35, 1, 98, 40, 34, 5, 139],
 [35, 1, 98, 40, 34, 5, 139, 3],
 [35, 1, 98, 40, 34, 5, 139, 3, 99],
 [3, 76],
 [3, 76, 21],
 [3, 76, 21, 20],
 [3, 76, 21, 20, 12],
 [62, 2],
 [62, 2, 100],
 [62, 2, 100, 101],
 [62, 2, 100, 101, 9],
 [62, 2, 100, 101

In order to have arrays of same length we add some padding to the left to all the rows.

In [None]:
# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding='pre'))
input_sequences

array([[  0,   0,   0, ...,   0, 205,  20],
       [  0,   0,   0, ..., 205,  20, 206],
       [  0,   0,   0, ...,   0, 136,  31],
       ...,
       [  0,   0,   0, ...,  55,  13,  20],
       [  0,   0,   0, ...,  13,  20,   7],
       [  0,   0,   0, ...,  20,   7, 511]], dtype=int32)

We separate what is going to be our inputs from our target values. For this purpose we consider, for instance, "climb" to be the input and "on" to be the target value, "climb on" to be the input next and "board" to be the target value. We use predictors as input and label as our target value.

In [None]:
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
print(predictors[:10])
print(label[:10])

[[  0   0   0   0   0   0   0   0   0   0   0   0 205]
 [  0   0   0   0   0   0   0   0   0   0   0 205  20]
 [  0   0   0   0   0   0   0   0   0   0   0   0 136]
 [  0   0   0   0   0   0   0   0   0   0   0 136  31]
 [  0   0   0   0   0   0   0   0   0   0 136  31 207]
 [  0   0   0   0   0   0   0   0   0   0   0   0  10]
 [  0   0   0   0   0   0   0   0   0   0   0  10 209]
 [  0   0   0   0   0   0   0   0   0   0  10 209  97]
 [  0   0   0   0   0   0   0   0   0  10 209  97 210]
 [  0   0   0   0   0   0   0   0  10 209  97 210  51]]
[ 20 206  31 207 208 209  97 210  51 211]


We train the model using word embeddings, lstm and a normal NN at the end. 

In [None]:
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# Add an LSTM Layer
model.add(Bidirectional(LSTM(150, return_sequences=True)))  
# A dropout layer for regularisation
model.add(Dropout(0.2))
# Add another LSTM Layer
model.add(LSTM(100)) 
model.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  #(# Pick a loss function and an optimizer)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 13, 50)            25600     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 13, 300)           241200    
_________________________________________________________________
dropout_3 (Dropout)          (None, 13, 300)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               25856     
_________________________________________________________________
dense_7 (Dense)              (None, 512)               131584    
Total params: 584,640
Trainable params: 584,640
Non-trainable params: 0
_________________________________________________________________
None

In [None]:
history = model.fit(predictors, label, epochs= 100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

# Predict some song lyrics using the trained model.

In [None]:
def make_lyrics(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    print(seed_text)

In [None]:
make_lyrics("i know", 10)