In [None]:
import matplotlib.pyplot as plt

In [3]:
import collections
from pprint import pprint
import json
import re
import os
import pickle
import numpy as np
import tensorflow as tf


"""
Isaac Duarte, Yinuo Zhang, Anthony Girard

Attempt at predicting the release date of songs from their lyrics utilizing TensorFlow

NOTE: This did not provide usable results unfortunately, however it does cover a number
      of topics including RNNs, Word Embeddings, Logistic Regression, Tokenization.
      
SEE: song2year.ipynb for the actual model used to create our results

Creating the model begins as follows. Extracting and tokenizing lyrics and years
from our data file. Creating a dictionary from the lyrics and encoding the lyrics
using the indices of the words in the dictionary. 

Next a doc2vec model is produced using an RNN with batches produced via a skip gram 
model and gradient descent as the optimizer. This will ultimately produce vectors
for songs by concatenating the sum of the vectors of the lyrics to the song,
so hopefully similar songs are closer together, and thus songs of a similar year
are more likely to be paired together.

Afterwards a logistic (linear) regression model is used to fit the vectors to years.
Such that a song's lyrics can be mapped to a vector and then to a year in order to predict
the year that it came out. The loss function is cross entropy.
"""

# Any single whitespace character separates raw chunks.
space = re.compile(r'\s')
# Split on a '.' that is not preceded by an uppercase letter, and on an
# apostrophe that has no word character on either side (a free-standing quote
# mark rather than a contraction such as "don't").
period = re.compile(r'(?<![A-Z])\.|(?<!\w)\'(?!\w)')
# Split on anything that is not a word character, apostrophe, '#', '@' or '.'.
punct = re.compile(r'[^\'#@\.\w]')

def tokenize(sent):
    """Split a string into tokens.

    The text is split on whitespace, then each chunk is split with the
    ``period`` pattern, then each resulting piece with the ``punct`` pattern.
    Empty strings produced by any of the splits are discarded.

    Args:
        sent: the text to tokenize (a ``str``).

    Returns:
        A list of non-empty token strings, in their original order.
    """
    tokens = []
    for chunk in space.split(sent):
        if not chunk:
            continue
        for piece in period.split(chunk):
            if not piece:
                continue
            # Compare by value: `is not ''` tests object identity and only
            # works when the interpreter happens to intern the empty string.
            tokens.extend(tok for tok in punct.split(piece) if tok != '')
    return tokens

def get_lyrics_years(songs):
    """Pull the lower-cased lyrics and the year out of every song record.

    Args:
        songs: mapping of song id -> record, where each record has a
            "lyrics" string and a "year" entry.

    Returns:
        A ``(lyrics, years)`` tuple of two parallel lists, ordered like the
        keys of ``songs``.
    """
    records = [songs[song_id] for song_id in songs.keys()]
    lyrics = [record["lyrics"].lower() for record in records]
    years = [record["year"] for record in records]
    return lyrics, years

def dataset(lyrics, vocab_size):
    """Build a vocabulary from tokenized lyrics and encode the lyrics with it.

    Index 0 is reserved for the "OOB" bucket, which stands for every word that
    is not among the ``vocab_size - 1`` most common ones.

    Args:
        lyrics: list of songs, each a list of token strings.
        vocab_size: total vocabulary size, including the "OOB" entry.

    Returns:
        A tuple ``(encoded_lyrics, count, word_to_index, index_to_word)``:
        encoded_lyrics: ``lyrics`` with each token replaced by its index.
        count: ``[["OOB", n_oob]]`` followed by ``(word, frequency)`` pairs
            in descending frequency order.
        word_to_index: dict mapping word -> index.
        index_to_word: dict mapping index -> word.
    """
    # Words that were uncommon get noted as Out of bounds
    count = [["OOB", 0]]
    word_counts = collections.Counter(word for song in lyrics for word in song)
    count.extend(word_counts.most_common(vocab_size - 1))

    word_to_index = {}
    for word, _ in count:
        word_to_index[word] = len(word_to_index)

    encoded_lyrics = []
    for song in lyrics:
        encoded = [word_to_index.get(word, 0) for word in song]
        # Every 0 in the encoding is an out-of-vocabulary hit.
        count[0][1] += encoded.count(0)
        encoded_lyrics.append(encoded)

    # Built once, after the loop: the mapping does not depend on the songs,
    # and building it here keeps it defined when `lyrics` is empty.
    index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
    return encoded_lyrics, count, word_to_index, index_to_word

def generate_batch(lyrics, batch_size, window_size):
    """Sample (context, next-word) training examples from randomly chosen songs.

    For a chosen song, every run of ``window_size`` consecutive word indices is
    a context and the word index right after it is the label. The index of the
    song within ``lyrics`` is appended to each context row. Songs are drawn
    with ``np.random.choice`` until ``batch_size`` examples are collected;
    songs that yield two or fewer examples are skipped.

    Args:
        lyrics: list of encoded songs (each a sequence of word indices).
        batch_size: number of examples to return.
        window_size: number of context words per example.

    Returns:
        ``(batch, labels)`` where ``batch`` is an array of shape
        ``(batch_size, window_size + 1)`` and ``labels`` has shape
        ``(batch_size, 1)``.

    Raises:
        ValueError: if ``batch_size > 0`` and no song is long enough to
            contribute (previously this case looped forever, or raised inside
            ``np.random.choice`` when ``lyrics`` was empty).
    """
    if batch_size > 0 and not any(len(song) - window_size > 2 for song in lyrics):
        raise ValueError(
            "no song has more than window_size + 2 = {} tokens; "
            "cannot build a batch".format(window_size + 2))

    batch = []
    labels = []

    while len(batch) < batch_size:
        # select random song; index the 1-element result explicitly because
        # int() on a non-scalar array is deprecated in recent NumPy releases
        r_song_index = int(np.random.choice(len(lyrics), size=1)[0])
        r_song = lyrics[r_song_index]

        n_examples = len(r_song) - window_size
        if n_examples <= 2:
            continue

        # Only materialise the examples that still fit in the batch.
        n_take = min(n_examples, batch_size - len(batch))
        for start in range(n_take):
            stop = start + window_size
            batch.append(list(r_song[start:stop]) + [r_song_index])
            labels.append(r_song[stop])

    batch = np.array(batch)
    labels = np.transpose(np.array([labels]))
    return batch, labels

In [4]:
# Number of training examples fed to the doc2vec model per step
batch_size = 500

# Number of unique words to consider in our model
vocabulary_size = 100000
# Number of doc2vec training steps
generations = 65000
# Step size of the doc2vec gradient-descent optimizer
learning_rate = 0.001

# vector size
embedding_size = 200
song_embedding_size = 200
concatenated_size = embedding_size + song_embedding_size

# intervals (in training steps) to save checkpoints and print out progress
save_interval = 500
print_loss_interval = 300

# negative examples to sample
num_sampled = 250

# Number of context words per training example
window_size = 8

# Directory that receives the vocabulary pickle and model checkpoints
data_folder = "model_out"

# The model pins its embedding ops to '/gpu:0'. Soft placement lets TensorFlow
# fall back to another device when that one is unavailable or has no kernel
# for an op, instead of failing at run time.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

In [5]:
songs_filename = "songs/songs.json"
# The file is only read, so open it read-only (the previous "r+" mode also
# demanded write permission) and let the context manager close the handle.
with open(songs_filename, "r") as songs_file:
    songs_dict = json.load(songs_file)


lyrics, years = get_lyrics_years(songs_dict)

print("[Tokenizing lyrics]")
tokenized_lyrics = [tokenize(song_lyrics) for song_lyrics in lyrics]
print("[Done]")

# encoded_lyrics is the original list of lyrics but with tokens
# replaced with their corresponding dictionary index
print("[Encoding Lyrics]")
encoded_lyrics, count, word_to_index, index_to_word = dataset(
    tokenized_lyrics, vocabulary_size)
print("[Done]")

# The raw and tokenized text is no longer needed once encoded; free the memory.
del lyrics
del tokenized_lyrics

print("Number of songs:", len(encoded_lyrics))

[Tokenizing lyrics]
[Done]
[Encoding Lyrics]
[Done]
Number of songs: 17742


In [12]:
print("[Creating Model]")

with tf.name_scope('inputs'):
    # Each input row holds window_size word indices followed by one song index;
    # each target row holds the index of the word to predict.
    x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

with tf.name_scope('weights'):
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, concatenated_size],
                           stddev=1.0 / np.sqrt(concatenated_size)))
with tf.name_scope('biases'):
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

with tf.device('/gpu:0'):
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        song_embeddings = tf.Variable(tf.random_uniform([len(encoded_lyrics), song_embedding_size], -1.0, 1.0))

        # Sum the embeddings of the window's words. A single lookup followed
        # by a reduction replaces window_size separate lookups, and does not
        # bake batch_size into the graph (the placeholders accept any batch).
        word_indices = x_inputs[:, :window_size]
        embed = tf.reduce_sum(
            tf.nn.embedding_lookup(embeddings, word_indices), axis=1)

        # The last column carries the song index; keep it as a [batch, 1] slice.
        song_indices = x_inputs[:, window_size:window_size + 1]
        song_embed = tf.nn.embedding_lookup(song_embeddings, song_indices)

        # concatenate embeddings; squeeze only the length-1 axis so that a
        # batch of one example keeps its batch dimension
        final_embed = tf.concat(
            axis=1, values=[embed, tf.squeeze(song_embed, axis=1)])

with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights, biases=nce_biases,
            inputs=final_embed, labels=y_target,
            num_sampled=num_sampled, 
            num_classes=vocabulary_size))

# SGD optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(loss)

# Create model saving operation (only the two embedding matrices are saved)
saver = tf.train.Saver({"embeddings":embeddings, "song_embeddings": song_embeddings})

# Initialize global variables
init = tf.global_variables_initializer()
print("[Done]")

[Creating Model]


In [13]:
sess.run(init)
print('[Starting Training]')

# Loss values and the steps at which they were recorded
loss_vec = []
loss_x_vec = []

# Make sure the output directory exists before training starts, so that the
# first checkpoint does not fail after hundreds of steps.
os.makedirs(data_folder, exist_ok=True)

# The vocabulary does not change during training, so it is saved once here
# rather than at every save interval.
with open(os.path.join(data_folder, 'songs_vocab.pkl'), 'wb') as f:
    pickle.dump(word_to_index, f)

# Checkpoints go to the configured data folder.
model_checkpoint_path = os.path.join(data_folder, 'doc2vec_song_embeddings.ckpt')

for i in range(generations):
    batch_inputs, batch_labels = generate_batch(encoded_lyrics, batch_size,
                                               window_size)

    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}
    sess.run(optimizer, feed_dict=feed_dict)

    # Return the loss
    if (i + 1) % print_loss_interval == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i + 1)
        print('Loss at step {} : {}'.format(i + 1, loss_val))
        
    # Save embeddings
    if (i + 1) % save_interval == 0:
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))
print("[Training doc2vec model Complete]")

[Starting Training]
Loss at step 300 : 894.7554321289062
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 600 : 769.3016967773438
Loss at step 900 : 838.2728271484375
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 1200 : 746.5286254882812
Loss at step 1500 : 822.53271484375
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 1800 : 698.6054077148438
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 2100 : 648.1721801757812
Loss at step 2400 : 656.0173950195312
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 2700 : 614.5010986328125
Loss at step 3000 : 543.542724609375
Model saved in file: C:\Us

Loss at step 23100 : 34.6316032409668
Loss at step 23400 : 108.36941528320312
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 23700 : 104.04122161865234
Loss at step 24000 : 53.295555114746094
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 24300 : 27.25023651123047
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 24600 : 43.92141342163086
Loss at step 24900 : 98.36727905273438
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 25200 : 22.132280349731445
Loss at step 25500 : 54.50703048706055
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 25800 : 26.21962547302246
Model saved in file: C:\Use

Loss at step 45900 : 17.900617599487305
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 46200 : 19.28911781311035
Loss at step 46500 : 20.37373161315918
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 46800 : 24.469654083251953
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 47100 : 30.199838638305664
Loss at step 47400 : 13.891159057617188
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 47700 : 23.10991859436035
Loss at step 48000 : 26.634353637695312
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\ProphetWutang\model_out\doc2vec_song_embeddings.ckpt
Loss at step 48300 : 21.95051383972168
Model saved in file: C:\Users\isaac\Desktop\Dev_Stuff\Projects\P

In [14]:
# Lookup graphs that have already been built, keyed by input shape and by the
# names of the embedding variables they read. Without this cache every call
# would add a fresh set of ops to the default graph.
_lookup_graph_cache = {}

def lookup_embedding(elements, num_lookup, max_words):
    """Return the concatenated word-sum and song embedding for each input row.

    Each row of ``elements`` holds ``max_words`` word indices followed by one
    song index. The word embeddings of a row are summed and concatenated with
    the row's song embedding, using the current values of ``embeddings`` and
    ``song_embeddings`` in ``sess``.

    Args:
        elements: integer data holding ``num_lookup * (max_words + 1)`` values.
        num_lookup: number of rows.
        max_words: number of word indices per row.

    Returns:
        A float array of shape
        ``(num_lookup, embedding_size + song_embedding_size)``.
    """
    cache_key = (num_lookup, max_words, embeddings.name, song_embeddings.name)
    if cache_key not in _lookup_graph_cache:
        documents = tf.placeholder(tf.int32, shape=[num_lookup, max_words + 1])

        # lookup word embeddings and sum them over the word axis
        word_embed = tf.reduce_sum(
            tf.nn.embedding_lookup(embeddings, documents[:, :max_words]), axis=1)

        # A 1-D index gives a [num_lookup, dim] result directly, so no squeeze
        # is needed and a single-row lookup keeps its row dimension.
        doc_indices = documents[:, max_words]
        doc_embed = tf.nn.embedding_lookup(song_embeddings, doc_indices, name="embedded_songs")

        # concatenate embeddings
        return_embed = tf.concat(axis=1, values=[word_embed, doc_embed], name="final_embedding")
        _lookup_graph_cache[cache_key] = (documents, return_embed)

    documents, return_embed = _lookup_graph_cache[cache_key]
    feed = np.asarray(elements, dtype=np.int32).reshape(num_lookup, max_words + 1)
    return sess.run(return_embed, feed_dict={documents: feed})

# Smoke test: embed one batch of 50-word windows.
batch_inputs, _ = generate_batch(encoded_lyrics, batch_size, 50)
test_embeddings = lookup_embedding(batch_inputs, batch_size, 50)
print(test_embeddings)

[[-5.6397967   3.9534557   0.71651196 ... -0.74096954 -0.8555568
   0.06410065]
 [-5.424506    4.6742377   1.4655683  ... -0.74096954 -0.8555568
   0.06410065]
 [-4.279373    4.623521    0.33677125 ... -0.74096954 -0.8555568
   0.06410065]
 ...
 [-4.3994093   6.5587897  -3.5923862  ... -0.74096954 -0.8555568
   0.06410065]
 [-4.92293     6.477989   -3.197331   ... -0.74096954 -0.8555568
   0.06410065]
 [-3.491235    6.3400855  -2.6646218  ... -0.74096954 -0.8555568
   0.06410065]]


In [32]:
from sklearn.model_selection import train_test_split

# Start logistic model-------------------------
# Number of leading words of each song that are fed to the model
max_words = 50
logistic_batch_size = 500

years = np.array(years)

# encoded_lyrics stays a plain list of lists: the songs have different lengths,
# and np.array() on such a ragged list raises on recent NumPy versions.
# train_test_split accepts lists directly.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_lyrics, years, test_size=0.25, random_state=42)

def _pad_or_truncate(song, length):
    """Return the first `length` word indices of `song`, zero-padded if shorter."""
    return (list(song) + [0] * length)[:length]

X_train = np.array([_pad_or_truncate(song, max_words) for song in X_train])
X_test = np.array([_pad_or_truncate(song, max_words) for song in X_test])

with tf.name_scope('inputs'):
    X = tf.placeholder(tf.float32, shape=[logistic_batch_size, concatenated_size], name="X")
    y = tf.placeholder(tf.float32, shape=[logistic_batch_size, 1], name="y")
        
with tf.name_scope('weights'):
    betas = tf.Variable(tf.random_uniform([concatenated_size, 1], -1, 1))

# Actual Prediction
# NOTE(review): softmax over a single output column always yields 1.0, so
# y_pred does not depend on betas and only the L2 penalty below produces a
# gradient. The logged betas shrinking towards zero while the cost stays flat
# is consistent with that. The output layer and loss need to be redesigned
# before this model can learn anything.
y_pred = tf.nn.softmax(tf.matmul(X, betas))

with tf.name_scope('cost'):
    # Squared error plus an L2 penalty on the weights
    penalized_cost = tf.reduce_sum(tf.square(y - y_pred)) + 1.0 * tf.reduce_sum(tf.square(betas))

with tf.name_scope('optimizer'):
    log_optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.1)
    log_training = log_optimizer.minimize(penalized_cost)

saver = tf.train.Saver()

# Initialize only the variable created in this cell. A global initializer would
# also reset `embeddings` and `song_embeddings` to random values and throw away
# the doc2vec training done above.
init = tf.variables_initializer([betas])

sess.run(init)

In [None]:
# Start Logistic Regression
print('[Starting Logistic Doc2Vec Model Training]')

# Rows of X_train are in the shuffled order produced by train_test_split, while
# song_embeddings is indexed by the original song position. The split depends
# only on the number of samples and random_state, so repeating it on the row
# indices (same test_size / random_state as the split that produced X_train)
# recovers the original song id of every training row.
train_song_ids, _ = train_test_split(
    np.arange(len(years)), test_size=0.25, random_state=42)
assert len(train_song_ids) == X_train.shape[0], \
    "index split does not match X_train; keep test_size/random_state in sync"

os.makedirs(data_folder, exist_ok=True)
model_checkpoint_path = os.path.join(data_folder, 'doc2vec_log_reg_model.ckpt')

for i in range(10000):
    rand_index = np.random.choice(X_train.shape[0], size=logistic_batch_size)
    rand_x = X_train[rand_index]
    # Append song index at the end of lyrics data. The ids must stay in the
    # same order as the sampled rows (sorting them would pair each row with
    # another song's embedding).
    rand_x_doc_indices = train_song_ids[rand_index]
    rand_x = np.hstack((rand_x, np.transpose([rand_x_doc_indices])))
    rand_y = np.transpose([y_train[rand_index]])
    
    x_embeddings = lookup_embedding(rand_x, logistic_batch_size, max_words)
    
    feed_dict = {X: x_embeddings, y: rand_y}
    sess.run(log_training, feed_dict=feed_dict)

    # Report the weights and the cost every 50 generations
    if (i + 1) % 50 == 0:
        print("Step:", i + 1, "First 5 Betas", sess.run(betas, feed_dict=feed_dict)[0:5])
        print("Step:", i + 1, "Penalized Cost", sess.run(penalized_cost, feed_dict=feed_dict))
        
    # Checkpoint the model every 500 generations
    if (i + 1) % 500 == 0:
        save_path = saver.save(sess, model_checkpoint_path)

[Starting Logistic Doc2Vec Model Training]
Step: 50 First 5 Betas [[-2.2154118e-06]
 [-1.3136296e-05]
 [-1.2596138e-05]
 [-1.2523004e-05]
 [ 6.3387114e-07]]
Step: 50 Penalized Cost 2021646700.0
Step: 100 First 5 Betas [[-3.1619412e-11]
 [-1.8748747e-10]
 [-1.7977811e-10]
 [-1.7873433e-10]
 [ 9.0469125e-12]]
Step: 100 Penalized Cost 2021966300.0
Step: 150 First 5 Betas [[-4.5128740e-16]
 [-2.6759105e-15]
 [-2.5658792e-15]
 [-2.5509816e-15]
 [ 1.2912181e-16]]
Step: 150 Penalized Cost 2020895400.0
