In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import tensorflow as tf
import sys 
from keras.models import Sequential
from keras.layers import LSTM, Activation, Flatten, Dropout, Dense, Embedding, TimeDistributed, CuDNNLSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from tensorflow.python.client import device_lib 

Using TensorFlow backend.


In [2]:
# Keep only the Country songs and only the columns needed for lyric modelling.
# Steps, in order: sort by genre, filter to 'Country', drop the metadata
# columns, then discard any row that still has a missing value.
dframe = (
    pd.read_csv('lyrics.csv')
    .sort_values('genre')
    .loc[lambda frame: frame['genre'] == 'Country']
    .drop(['year', 'artist', 'genre'], axis=1)
    .dropna()
)
dframe

Unnamed: 0,index,song,lyrics
362236,362236,amen,I heard from a friend of a friend of a friend ...
188804,188804,third-rock-from-her-thumb-parody-of-third-rock...,"Don't tell her what it's worth, third rock fro..."
188803,188803,it-s-a-great-day-to-be-a-guy,"""You know I'm really gonna miss you, sugar bri..."
188802,188802,funny-man,"I was the class clown,\nI kept them laughing o..."
188800,188800,cledus-went-down-to-florida,"Cledus went down to Florida, he was lookin for..."
90561,90561,let-it-be-me,I bless the day I found you\nI want to stay ar...
188805,188805,what-the-did-you-say,You see I'm always talkin'\nWhen I should be d...
90563,90563,san-antonio-rose,Deep within my heart lies a melody\nA song of ...
90569,90569,corrine-corrina,"Corrine Corrina, where you been so long?\nCorr..."
90571,90571,danny-boy,"Oh, Danny boy, the pipes, the pipes are callin..."


In [3]:
# Selecting 100 random songs. Positions are drawn with replacement, so the
# same song may appear more than once in the sample.
# The (exclusive) upper bound is the frame's real length rather than a
# hard-coded row count: every row is eligible and iloc cannot go out of range
# if the CSV or the filtering above changes.
a = np.random.randint(0, len(dframe), 100)
small_dframe = dframe.iloc[a]

In [4]:
# Save Lyrics in .txt file (one song per write, each followed by a newline)
textFileName = 'lyricsText.txt'
with open(textFileName, 'w', encoding="utf-8") as filehandle:
    for listitem in small_dframe['lyrics']:
        filehandle.write('%s\n' % listitem)

# Load the dataset and convert it to lowercase :
# (read inside a `with` block so the file handle is closed deterministically)
with open(textFileName, encoding='UTF-8') as filehandle:
    raw_text = filehandle.read().lower()

# Mapping chars to ints :
# chars     : sorted list of the distinct characters in the corpus
# int_chars : integer id -> character
# chars_int : character  -> integer id
chars = sorted(set(raw_text))
int_chars = dict(enumerate(chars))
chars_int = {c: i for i, c in enumerate(chars)}

number_chars = len(raw_text)
number_vocab = len(chars)
print('Total Characters : ' , number_chars) # number of all the characters in lyricsText.txt
print('Total Vocab : ', number_vocab) # number of unique characters

Total Characters :  87978
Total Vocab :  61


In [5]:
# # Save Lyrics in .txt file
# with open('lyricsText.txt', 'w',encoding="utf-8") as filehandle:  
#     for listitem in dframe['lyrics']:
#         filehandle.write('%s\n' % listitem)

# # Load the dataset and convert it to lowercase :
# textFileName = 'lyricsText.txt'
# raw_text = open(textFileName, encoding = 'UTF-8').read()
# raw_text = raw_text.lower()

# # Mapping chars to ints :
# chars = sorted(list(set(raw_text)))
# int_chars = dict((i, c) for i, c in enumerate(chars))
# chars_int = dict((i, c) for c, i in enumerate(chars))

# number_chars = len(raw_text)
# number_vocab = len(chars)
# print('Total Characters : ' , number_chars) # number of all the characters in lyricsText.txt
# print('Total Vocab : ', number_vocab) # number of unique characters

In [6]:
# Build the supervised dataset with a sliding window over the corpus.
# Each sample is seq_len consecutive character ids; its target is the id of
# the character immediately following that window.
seq_len = 100

# Encode the whole text once; every window is then a plain slice of this list.
encoded_text = [chars_int[ch] for ch in raw_text]
window_starts = range(number_chars - seq_len)

# Samples: windows [0:100], [1:101], ... (each slice is its own list)
data_X = [encoded_text[begin:begin + seq_len] for begin in window_starts]
# Targets: the character at position 100, 101, ... right after each window
data_y = [encoded_text[begin + seq_len] for begin in window_starts]

number_patterns = len(data_X)
print( 'Total Patterns : ', number_patterns)

Total Patterns :  87878


In [7]:
# Sanity check: integer id of the character that follows the first input window.
print(data_y[0])

41


In [8]:
# Reshape X to be suitable to go into LSTM RNN :
# layout is (samples, time steps, features) = (number_patterns, seq_len, 1)
X = np.reshape(data_X, (number_patterns, seq_len, 1))

# Normalizing input data :
# ids run from 0 to number_vocab - 1, so this scales them into [0, 1)
X = X / float(number_vocab)

# One hot encode the output targets :
# num_classes is pinned to the vocabulary size; left to be inferred it would be
# max(data_y) + 1, which is smaller than the vocabulary whenever the
# highest-id character never occurs as a target.
y = np_utils.to_categorical(data_y, num_classes=number_vocab)

# Display the input tensor's shape as the cell output.
X.shape

(87878, 100, 1)


In [10]:
# Stacked-LSTM next-character classifier.
LSTM_layer_num = 4  # number of LSTM layers
layer_size = [256, 256, 256, 256]  # number of nodes in each layer

time_steps, n_features = X.shape[1], X.shape[2]

model1 = Sequential()
# First recurrent layer declares the input shape. (CuDNNLSTM, imported above,
# was the commented-out alternative for this and the hidden layers.)
model1.add(LSTM(layer_size[0],
                input_shape=(time_steps, n_features),
                return_sequences=True))

# Remaining hidden layers; every one returns the full sequence so that the
# Flatten layer below receives all time steps.
for hidden_units in layer_size[1:LSTM_layer_num]:
    model1.add(LSTM(hidden_units, return_sequences=True))

# Flatten data from last hidden layer to go into output layer
model1.add(Flatten())

# Output layer: one unit per target class, followed by a softmax activation
model1.add(Dense(y.shape[1]))
model1.add(Activation('softmax'))

model1.compile(loss='categorical_crossentropy', optimizer='adam')
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 256)          525312    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 256)          525312    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 256)          525312    
_________________________________________________________________
flatten_1 (Flatten)          (None, 25600)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 61)                1561661   
_________________________________________________________________
activation_1 (Activation)    (None, 61)                0         
Total para

In [11]:
# With the model defined, set up the training callbacks.
# Configure the checkpoint :
# weights are written only when the monitored value ('loss') reaches a new
# minimum; the file name embeds the epoch number and that epoch's loss.
checkpoint_name = 'Weights-LSTM-improvement-{epoch:03d}-{loss:.5f}-bigger.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [12]:
# Fit the model :
# every keyword argument handed to model1.fit is collected here in one place.
model_params = dict(
    epochs=10,
    batch_size=128,
    callbacks=callbacks_list,
    verbose=1,
    validation_split=0.2,   # hold out the last 20% of the samples for validation
    validation_data=None,
    shuffle=True,
    initial_epoch=0,
    steps_per_epoch=None,
    validation_steps=None,
)


In [13]:
# Train the network. The keys of model_params are exactly fit's keyword
# arguments (epochs, batch_size, callbacks, verbose, validation_split,
# validation_data, shuffle, initial_epoch, steps_per_epoch, validation_steps),
# so the dict is unpacked directly instead of being passed key by key.
model1.fit(X, y, **model_params)

Train on 70302 samples, validate on 17576 samples
Epoch 1/10

Epoch 00001: loss improved from inf to 3.00748, saving model to Weights-LSTM-improvement-001-3.00748-bigger.hdf5
Epoch 2/10

Epoch 00002: loss improved from 3.00748 to 2.99324, saving model to Weights-LSTM-improvement-002-2.99324-bigger.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.99324 to 2.99129, saving model to Weights-LSTM-improvement-003-2.99129-bigger.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.99129 to 2.98986, saving model to Weights-LSTM-improvement-004-2.98986-bigger.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.98986 to 2.98937, saving model to Weights-LSTM-improvement-005-2.98937-bigger.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.98937 to 2.98936, saving model to Weights-LSTM-improvement-006-2.98936-bigger.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.98936 to 2.88437, saving model to Weights-LSTM-improvement-007-2.88437-bigger.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.88437

<keras.callbacks.History at 0xb37082630>

In [14]:
# # Load weights file :
# weights_file = './models/Weights-LSTM-improvement-004-2.49538-bigger.hdf5' # weights file path
# model1.load_weights(weights_file)
# model1.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [23]:
# Pick a random training window to act as the generation seed :
# randint's upper bound is exclusive, so len(data_X) makes every window
# (including the last one) eligible.
start = np.random.randint(0, len(data_X))
# Work on a copy: `pattern` must not alias data_X[start], otherwise growing it
# would leave a 101-element row inside the training data.
pattern = list(data_X[start])
print('Seed : ')
print("\"",''.join([int_chars[value] for value in pattern]), "\"\n")
# How many characters you want to generate
generated_characters = 300
# Generate Characters :
for i in range(generated_characters):
    # Same preprocessing as the training inputs: (1, seq_len, 1), scaled by vocab size
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(number_vocab)
    prediction = model1.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character
    index = int(np.argmax(prediction))
    result = int_chars[index]
    sys.stdout.write(result)
    # Slide the window: drop the oldest id, add the predicted one (builds a new list)
    pattern = pattern[1:] + [index]
print('\nDone')

Seed : 
" u'll never stray more than just two lips away
if my time could be spent near you i'd be more than co "

 to te toe to te to te tee to le to te te to te te te te te te te te te te te te te te te te te te te te te te te te te te to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to 
Done
