In [155]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, GRU
from keras.callbacks import TensorBoard, ModelCheckpoint
import pandas as pd
import numpy as np

In [3]:
#Data retrieved from https://www.kaggle.com/mousehead/songlyrics
song_df = pd.read_csv("data/songlyrics/songdata.csv")
song_df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
song_df.at[0, "text"]

"Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the way that she smiles when she sees me  \nHow lucky can one fellow be?  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?  \n  \nAnd when we go for a walk in the park  \nAnd she holds me and squeezes my hand  \nWe'll go on walking for hours and talking  \nAbout all the things that we plan  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?\n\n"

In [7]:
#Use the previous 64 characters to predict the 65th
SEQ_LEN = 64

In [8]:
#Changed my mind - went with character-wise prediction
#Kept this in case I want to try to predict words later
import re
punct = list(".,?!")

def clean_lyrics(lyrics):
    for p in punct:
        lyrics = lyrics.replace(p, " {} ".format(p))
    for p in ["(",")"]:
        lyrics = lyrics.replace(p, "")
    lyrics = re.sub("\s+", " ", lyrics)
    return np.array(["START"] + lyrics.lower().split() + ["END"])

In [100]:
def clean_lyrics(lyrics):
    lyrics = lyrics.replace("\n", ".").lower() #Newlines generally indicate pauses
    lyrics = re.sub(r"\(.*\)", "", lyrics) #Get rid of lines inside parentheses (chorus)
    lyrics = re.sub(r"\[.*\]", "", lyrics) #Get rid of lines inside brackets [chorus]
    lyrics = re.sub(r"[\(\)\[\]]", "", lyrics) #Some parentheses were unbalanced...
    lyrics = re.sub(r"(\s+\.)+", ". ", lyrics) #Some brackets were unbalanced...
    lyrics = re.sub(r"([\?\.\!\;\,])\.+", r"\1", lyrics)  #Drop periods appearing after other punctuation
    lyrics = re.sub(r"\s+", " ", lyrics)  #Replace 1 or more whitespace characters with a single space
    return " " * (SEQ_LEN - 1) + lyrics + "E" #Pad the beginning with whitespace so we can predict from the first character

In [191]:
#Check out random songs to see if we should add anything to clean_lyrics
random_index = np.random.choice(len(song_df))
clean_lyrics(song_df.at[random_index, "text"])

"                                                               some would say there is trouble in my home. even when i am alone. some would say there is always someone home. even though i might be gone. speak to me my friend, i know you're there. speak to me my friend speak to me speak to me! i can always feel it's there, creeping one step behind me. cold hands in the night, i know that it's watching me. the poltergeist living in my home. could be a friend or foe. the poltergeist living in my home. could be a friend for life, or a foe until the night i die. speak to me my friend, let me hear your tongue. speak to me my friend speak to me speak to me! speak to me my friend, i know you're there. speak to me my friend speak to me speak to me! my guests can never tell when it's creeping up to touch them. they will never ever know our game. until they feel it and scream. the poltergeist living in my home. could be a friend or foe. the poltergeist living in my home. could be a friend for li

In [101]:
#Vectorize clean_lyrics over the entire song text column
song_df["clean"] = song_df.text.apply(clean_lyrics)

In [105]:
song_df.head()

Unnamed: 0,artist,song,link,text,clean
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd...",...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl...",...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...,...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,...


In [106]:
data = song_df.clean.values
data[0]

"                                                               look at her face, it's a wonderful face. and it means something special to me. look at the way that she smiles when she sees me. how lucky can one fellow be? she's just my kind of girl, she makes me feel fine. who could ever believe that she could be mine? she's just my kind of girl, without her i'm blue. and if she ever leaves me what could i do, what could i do? and when we go for a walk in the park. and she holds me and squeezes my hand. we'll go on walking for hours and talking. about all the things that we plan. she's just my kind of girl, she makes me feel fine. who could ever believe that she could be mine? she's just my kind of girl, without her i'm blue. and if she ever leaves me what could i do, what could i do?E"

In [107]:
from itertools import chain

#Chain takes a bunch of iterables and connects them together
#The * unpacks an iterable so you can use it as positional arguments
#For example:  print(*[1,2,3]) is the same as calling print(1,2,3)
word_set = set(chain(*data))
len(word_set) #46 characters to predict

46

In [108]:
print(sorted(word_set))

[' ', '!', '"', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'E', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [109]:
len(list(chain(*data))) #56 million characters in data

56300816

In [110]:
N = len(data) #Number of songs
K = len(word_set) #Number of unique characters

In [111]:
#Mappings back and forth between character and integer index
letter2idx = dict((c, i) for i, c in enumerate(word_set))
idx2letter = dict((i, c) for i, c in enumerate(word_set))

In [112]:
def create_batch(data, n=128):
    #Create a batch of n samples, each row in X representing SEQ_LEN letters from a song
    #with each row in y representing the one-hot encoding of the next letter (or the STOP character "S")
    #p_start determines the probability of starting at the beginning of the song vice a random point
    X = np.zeros((n, SEQ_LEN, K))
    y = np.zeros((n, K))
    
    for i in range(n):
        #random.choice(N) would make sequences ending in "E" SEQ_LEN times as likely
        #I still wanted them to be more common than uniform probability; here they are about 6x as likely
        song_idx = np.random.choice(N - int(SEQ_LEN * .9))
        song_len = len(data[song_idx])
        
        #We don't want to run out of song!  Clip the random choice to be within valid range
        start_idx = min(np.random.choice(song_len), song_len - SEQ_LEN - 1)
        
        #Iterate over letters in the song and one-hot encode them into the array
        for j, letter in enumerate(data[song_idx][start_idx:start_idx + SEQ_LEN]):
            letter_idx = letter2idx[letter]
            X[i, j, letter_idx] = 1
        
        #One-hot encode the next letter
        next_letter_idx = letter2idx[data[song_idx][start_idx + SEQ_LEN]]
        y[i, next_letter_idx] = 1
    
    return X, y

In [113]:
X, y = create_batch(data)

In [114]:
blah = iter(range(len(X)))

In [151]:
#Test to see if create_batch worked properly
i = next(blah)
"".join([idx2letter[idx] for idx in X[i].argmax(axis = 1)]), idx2letter[y[i].argmax()]

('pain is a deadly reality. a sociopath with empty eyes. and no so', 'u')

In [153]:
#Check what proportion of the next letters are the end of the song
np.mean(np.array([idx2letter[idx] for idx in y.argmax(axis = 1)]) == "E")

0.0703125

In [156]:
X.shape, y.shape

((128, 64, 46), (128, 46))

In [160]:
model = Sequential()
#return_sequences = True is required if plugging into another recurrent layer
model.add(GRU(128, dropout = .2, recurrent_dropout = .2, input_shape = (SEQ_LEN, K), return_sequences = True))
model.add(GRU(128, dropout = .2, recurrent_dropout = .2))
model.add(Dense(256, activation = "relu"))
model.add(Dropout(.5))
model.add(Dense(K, activation = "softmax"))

In [161]:
#Save model weights at the end of each epoch
chk_callback = ModelCheckpoint("tmp/weights.rnn_char.hdf5", save_best_only = True)
#Save logs to check out TensorBoard
tb_callback = TensorBoard()

In [162]:
model.compile("adam", "categorical_crossentropy", ["accuracy"])

In [163]:
model.train_on_batch(X, y)

[3.8310561, 0.0078125]

In [165]:
X, y = create_batch(data, n = 100000)

In [166]:
model.fit(X, y, batch_size = 128, epochs = 20, callbacks = [chk_callback, tb_callback])

Epoch 1/20
Epoch 2/20




Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2d60761a748>

In [210]:
def make_song(model, start = " " * SEQ_LEN, temperature = 0):
    start = list((" " * SEQ_LEN + start)[-SEQ_LEN:].lower())
    
    X_digits = [letter2idx[letter] for letter in start]
    X = []
    
    for digit in X_digits:
        row = np.zeros(K)
        row[digit] = 1
        X.append(row)

    
    while start[-1] != "E":
        pred = model.predict(np.array(X[-SEQ_LEN:]).reshape((1, SEQ_LEN, K)))[0]
        prediction = np.random.choice(K, 1, p = pred)[0]
        row = np.zeros(K)
        row[prediction] = 1
        X.append(row)
        start.append(idx2letter[prediction])
    
    return "".join(start).strip()[:-1]

In [228]:
make_song(model, "she sings and she dances and her smile lights up the sky")

"she sings and she dances and her smile lights up the sky a. let down my lie. where now come speach in alreat chysise, i can't let again."

In [None]:
for i in range(2000):
    stats = model.train_on_batch(*create_batch(data))
    if i % 50 == 0:
        print("Iteration {}, {}".format(i, stats))

Iteration 0, [1.6264844, 0.5]
Iteration 50, [1.7018552, 0.578125]
Iteration 100, [1.7121595, 0.5078125]
Iteration 150, [1.6743716, 0.453125]
Iteration 200, [1.4224437, 0.4765625]
Iteration 250, [1.5754218, 0.53125]
Iteration 300, [1.8307935, 0.4453125]
Iteration 350, [1.5503139, 0.5234375]
Iteration 400, [1.6237633, 0.4765625]
Iteration 450, [1.6109223, 0.5078125]
Iteration 500, [1.7225275, 0.515625]
Iteration 550, [1.6731038, 0.421875]
Iteration 600, [1.6072071, 0.515625]
Iteration 650, [1.5234128, 0.484375]
Iteration 700, [1.4317018, 0.5546875]
Iteration 750, [1.8100257, 0.5234375]
Iteration 800, [1.6361152, 0.5]
Iteration 850, [1.3789246, 0.5859375]
Iteration 900, [1.646139, 0.515625]
Iteration 950, [1.6457453, 0.453125]
Iteration 1000, [1.7499382, 0.4140625]
Iteration 1050, [1.9065955, 0.421875]
Iteration 1100, [1.7424059, 0.4296875]
Iteration 1150, [1.6513617, 0.5234375]
Iteration 1200, [1.6009815, 0.515625]
Iteration 1250, [1.7388172, 0.453125]
Iteration 1300, [1.5574999, 0.51562