<span style="font-size: 2em; font-weight:bold">AI 70's Country</span>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import scipy
import numpy as np
import matplotlib
import pandas as pd
import statsmodels
import sklearn
import tensorflow
#import keras

from tensorflow.python.keras.models import Sequential, load_model
from tensorflow.python.keras.layers import Dense,LSTM,Dropout
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import History, EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.constraints import maxnorm
import string


import json
import time

As always, data prep is the hardest part of the project.  Because I am going to use a validation set in training my model, and because Keras uses the last n% of the data as the validation set, I want to shuffle the lyrics so that my validation set contains a better representation of all the data - not just the last song.  I also want to get the most originality that I can out of the model, so I will eliminate duplicate lyrics (some songs have refrains that repeat multiple times).  

In [2]:
# Load the lyric lines, drop exact duplicate rows (repeated refrains), and
# shuffle so the tail of the data is not just the last song in the file.
lyricsCSV = pd.read_csv('lyricsTrain_35.csv', encoding='ISO-8859-1')
lyricsCSV = lyricsCSV.sort_values(lyricsCSV.columns[0])
lyricsCSV = lyricsCSV.drop_duplicates(keep='first')
lyricsCSV = lyricsCSV.sample(frac=1)  # frac=1 keeps every row, in random order

# Round-trip through a tab-separated text file to obtain the corpus as one
# string. The header row is written as well, so the text starts with the
# column name.
lyricsCSV.to_csv('lyrics.txt', sep='\t', index=False)
# The context manager closes the handle even if the read raises.
with open('lyrics.txt', 'r') as lyrics_file:
    lyrics = lyrics_file.read()

Remove line breaks (collapsing every run of whitespace to a single space) and print the result

In [3]:
# Collapse every run of whitespace (line breaks, tabs, repeated spaces) into a
# single space, then echo the flattened corpus.
lyrics = ' '.join(lyrics.split())
print(lyrics)

lyrics I've been walkin' these streets so long When the sun's comin' up I got cakes on the griddle If I said you had a beautiful body When I was lost you took me home I beg your pardon That's all I'm taking with me With flaming locks of auburn hair "When Tommy turned around they said, ""hey look, old yeller's leaving""" Just like your daddy is A long time forgotten are dreams that just fell by the way It sure is cold today Tommy opened up the door, and saw his Becky crying Bad so I had one more for dessert Just when I'm about to make it work without you Kiss an angel good morning Even with someone they love Between Hank Williams' pain songs and Like your imitation love for me And did I hear you say he was a-meeting you here today Sleepin' in our king size bed "Said, ""Live a good life and play the fiddle with pride" Oh how real those roses seem to me My daddy taught me young how to hunt and how to whittle I'd smoked my brain the night before Would your flowing love come quench me """Lo

One final thing, when I look at the above lyrics, I seem to see a LOT of quotation marks.  So, I am going to just replace those with a space.  

In [4]:
# Swap each double-quote character for a single space. Adjacent quotes
# therefore become adjacent spaces in the corpus.
lyrics = lyrics.translate(str.maketrans('"', ' '))

Now, we can build sequences of characters that will be used to predict a final character

In [5]:
# Window size: each training sample is `length` input characters followed by
# the single character to predict. With this much text a relatively large
# window is affordable.
length = 100
# Slide the window forward one character at a time; every slice holds
# length + 1 characters.
sequences = [lyrics[end - length:end + 1] for end in range(length, len(lyrics))]

Create and save a .txt file of our sequences with line endings

In [6]:
# Persist the sequences, one per line. The context manager closes the file
# even if the write fails part-way.
data = '\n'.join(sequences)
with open('char_sequences.txt', 'w') as file:
    file.write(data)

Create a dictionary of character:number mappings 

In [7]:
# Reload the saved sequences; `with` guarantees the handle is closed.
with open('char_sequences.txt', 'r') as file:
    raw_text = file.read()

lines = raw_text.split('\n')

# One integer id per distinct character, assigned in sorted order.
# raw_text still contains the '\n' separators between sequences, so whenever
# more than one sequence was saved, '\n' receives an id as well.
chars = sorted(set(raw_text))
mapping = {c: i for i, c in enumerate(chars)}

# Save the mapping as json
json_map = json.dumps(mapping)
with open('mapping.json', 'w') as map_file:
    map_file.write(json_map)

Use the dictionary to create sequences of numbers only (numbers that describe the characters)

In [8]:
# Translate every text line into the list of integer ids of its characters.
sequences = [[mapping[ch] for ch in text_line] for text_line in lines]
    


Create input sets (100 characters each) and output sets (1 character each), then one-hot encode both so we can use them to train the model.

In [9]:
# Number of distinct character ids; also the width of every one-hot vector.
vocab_size = len(mapping)
encoded = np.array(sequences)
# All but the last id of each row are the inputs; the last id is the target.
X, y = encoded[:, :-1], encoded[:, -1]
# to_categorical accepts the whole 2-D id array and appends a one-hot axis, so
# no per-row Python list is built and `sequences` is not rebound to a
# different kind of object.
X = to_categorical(X, num_classes=vocab_size)  # one-hot code input
y = to_categorical(y, num_classes=vocab_size)  # one-hot code output

Fit the model with tuning parameters determined by trial and error.

In [10]:

def countryTrain(paramUnits,paramEpochs,paramValSplit,paramShuffle,paramBatchSize,paramDropout):
    """Train a single-layer LSTM character model; save its checkpoint and history.

    Reads the module-level arrays ``X`` and ``y`` and the integer
    ``vocab_size``.

    Parameters
    ----------
    paramUnits : number of units in the LSTM layer.
    paramEpochs : upper bound on epochs; early stopping may end training sooner.
    paramValSplit : passed to ``fit`` as ``validation_split``.
    paramShuffle : passed to ``fit`` as ``shuffle``.
    paramBatchSize : passed to ``fit`` as ``batch_size``.
    paramDropout : rate of the Dropout layer; also embedded in the names of
        the checkpoint and history files.

    Returns
    -------
    pandas.DataFrame
        The per-epoch training history, identical to what is written to CSV.
        The function previously returned nothing, so callers that ignore the
        result are unaffected.
    """
    # Define callbacks
    # NOTE(review): both callbacks watch training metrics ('acc' / 'loss')
    # rather than the validation metrics from validation_split — confirm
    # this is intended.
    es = EarlyStopping(monitor='acc', min_delta=.01, patience=2, mode='max', verbose=1)
    mc = ModelCheckpoint('model.drop_{}.best'.format(paramDropout), monitor='loss',
                         mode='min', save_best_only=True)  # Keep best model

    # define and fit model
    model = Sequential()
    model.add(LSTM(paramUnits, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dropout(paramDropout))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    modelLyrics = model.fit(X, y, epochs=paramEpochs, validation_split=paramValSplit,
                            shuffle=paramShuffle, batch_size=paramBatchSize, verbose=1,
                            callbacks=[es, mc])

    # Keep the per-epoch metrics on disk and hand them back to the caller.
    history = pd.DataFrame(modelLyrics.history)
    history.to_csv('modelLyricsHistory_drop_{}.csv'.format(paramDropout), index=False)
    return history



In [13]:
units = 100  # From the mentioned article in data science
epochs = 100  # Upper bound; no early-stopping callback is used in this cell
validationSplit = 0.2 # My data set is small so I want to use as much as possible to train vs. validate
shuffle = True
batchSize = 64 # Doubled the default batch size to speed up training
dropOut = .5  # http://papers.nips.cc/paper/4878-understanding-dropout.pdf

# Define callbacks
mc = ModelCheckpoint('model.100Epoch_6.6.best', monitor='acc', mode='max', save_best_only=True) # Keep best model

# define and fit model: three stacked LSTM layers. The first two return the
# full sequence so the next LSTM receives one vector per time step. Only the
# first layer needs input_shape.
model = Sequential()
model.add(LSTM(units, return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
model.add(LSTM(units, return_sequences=True))
model.add(LSTM(units))
model.add(Dropout(dropOut))
model.add(Dense(vocab_size, activation='softmax', kernel_constraint=maxnorm(3)))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The checkpoint callback is now actually passed to fit; with callbacks=[] the
# file 'model.100Epoch_6.6.best' that a later cell loads was never written.
modelLyrics = model.fit(X, y, epochs=epochs, validation_split=validationSplit,
                        shuffle=shuffle, batch_size=batchSize, verbose=1, callbacks=[mc])


#history = pd.DataFrame(modelLyrics.history)
#history.to_csv('modelLyricsHistory.100Epoch_6.6.2.csv',index=False)

Train on 23347 samples, validate on 5837 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

KeyboardInterrupt: 

In [11]:
#Define parameters  -
units = 512  # From the mentioned article in data science
epochs = 100  # Just a large number since I am using early stopping
validationSplit = 0.1 # My data set is small so I want to use as much as possible to train vs. validate
shuffle = True
batchSize = 64 # Doubled the default batch size to speed up training
dropOut = .5  # http://papers.nips.cc/paper/4878-understanding-dropout.pdf


# Train one model per dropout rate 0.2, 0.3, 0.4, 0.5.
# i / 10.0 yields exactly those values; i * 0.1 produced 0.30000000000000004
# for i == 3, which ended up in the checkpoint and history file names.
for i in range(2,6):
    d = i / 10.0
    countryTrain(units,epochs,validationSplit,shuffle,batchSize,d)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 20118 samples, validate on 2236 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: early stopping
Train on 20118 samples, validate on 2236 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 2

Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 00035: early stopping


In [12]:
# store history

Create a function that encodes a kickoff text string and then plugs it into our trained model

In [16]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_lyric, n_chars):
    """Extend ``seed_lyric`` by ``n_chars`` characters predicted one at a time.

    Parameters
    ----------
    model : trained model whose ``predict`` returns one probability per
        character id for a one-hot encoded window.
    mapping : dict of character -> integer id used when the model was trained.
    seq_length : number of characters the model takes as input.
    seed_lyric : starting text; every character in the final window must be a
        key of ``mapping``.
    n_chars : how many characters to generate.

    Returns
    -------
    str
        The seed text followed by the generated characters.
    """
    # Invert the mapping once instead of scanning it for every character.
    index_to_char = {index: char for char, index in mapping.items()}
    lyrics = seed_lyric
    for __ in range(n_chars):
        # Only the trailing seq_length characters survive 'pre' truncation,
        # so encode just those rather than the whole growing string.
        window = lyrics[-seq_length:]
        # encode the characters as integers
        encoded = [mapping[char] for char in window]
        # pad/truncate to the fixed input length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character: take the id with the highest probability
        # (predict + argmax replaces predict_classes, which newer TensorFlow
        # releases no longer provide)
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # reverse map integer to character; an unknown id contributes nothing
        out_char = index_to_char.get(yhat, '')
        # append to input (the original appended the loop variable `char`
        # rather than `out_char`)
        lyrics += out_char
    return lyrics

Is there a song other than Stairway to Heaven that I could have used for the kickoff sequence?

In [21]:
# Seed text for generation: exactly 100 characters, the same as `length`.
startLyrics = "I've had a largemouth bass bust my line A couple beautiful girls tell me, Goodbye Trucks break down,"

In [19]:
# Alternative seed text (the opening of Stairway to Heaven), also exactly
# 100 characters. Whichever seed cell ran last determines `startLyrics`.
startLyrics = "There's a lady who's sure All that glitters is gold And she's buying a stairway to heaven When she g"

In [27]:
# Generate 1000 characters from the checkpointed model and save them.
model = load_model('model.100Epoch_6.6.best')
lyricsFinal = generate_seq(model,mapping,length,startLyrics,1000)

# One 'w' open replaces the original pair of opens ('w' to truncate, then 'a+'
# to append); the first handle was never closed. The resulting file content
# is the same.
with open('lyrics.Coolio.Epoch100.txt','w') as out_file:
    out_file.write(lyricsFinal)
    out_file.write('.\n\n\n')

Run the model and print the lyrics

In [29]:
def run_models(modelNum):
    """Generate lyrics from one dropout checkpoint and append them to lyrics.LedZep.txt.

    ``modelNum`` is the text placed after ``0.`` in the checkpoint name, e.g.
    ``'3'`` loads ``'model.drop_0.3.best'``. Uses the module-level ``mapping``,
    ``length`` and ``startLyrics``.
    """
    model = load_model('model.drop_0.{}.best'.format(modelNum))
    lyricsFinal = generate_seq(model,mapping,length,startLyrics,1000)
    # The context manager closes the file even if one of the writes raises.
    with open('lyrics.LedZep.txt','a+') as out_file:
        out_file.write('Drop {}\n\n'.format(modelNum))
        out_file.write(lyricsFinal)
        out_file.write('.\n\n\n')
    
    

In [30]:
# Start the output file fresh with a title line.
# '\n\n' (newline escapes) replaces the original '/n/n', which wrote the
# literal characters "/n/n" into the file.
with open('lyrics.LedZep.txt','w') as out_file:
    out_file.write('Stairway to Heaven\n\n')

# The training loop that calls countryTrain only covers dropout rates
# 0.2-0.5, so checkpoints for 0.0 and 0.1 are not produced by this notebook;
# iterate over the same range.
for i in range(2,6):
    run_models(str(i))

# NOTE(review): countryTrain saves its history as CSV
# ('modelLyricsHistory_drop_<rate>.csv'); nothing visible in this notebook
# writes 'modelLyricsHistory.json' — confirm where this file comes from.
df = pd.read_json('modelLyricsHistory.json')

df