In [24]:
import collections
import json
import random
import re
from pprint import pprint

import numpy as np
import tensorflow as tf

# Splits on every single whitespace character; runs of whitespace therefore
# produce empty strings, which tokenize() skips.
space = re.compile(r'\s')
# Splits on a period that is not directly preceded by an uppercase ASCII
# letter, or on an apostrophe that has no word character on either side.
period = re.compile(r'(?<![A-Z])\.|(?<!\w)\'(?!\w)')
# Splits on any character other than an apostrophe, '#', '@', '.', or a
# word character (\w).
punct = re.compile(r'[^\'#@\.\w]')

def tokenize(sent):
    """Split a string into a flat list of non-empty tokens.

    The text is split in three passes using the module-level patterns:
    first on whitespace (``space``), then each chunk on ``period``, then
    each resulting piece on ``punct``. Empty strings produced by any pass
    are discarded.

    Args:
        sent: The text to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    tokens = []
    for chunk in space.split(sent):
        if not chunk:
            continue
        for piece in period.split(chunk):
            if piece:
                tokens.extend(punct.split(piece))
    # Filter by truthiness rather than `is not ''`: identity comparison with
    # a string literal depends on interning and emits a SyntaxWarning on
    # Python 3.8+.
    return [token for token in tokens if token]

def get_lyrics_years(songs):
    """Pull the lowercased lyrics and the year out of every song record.

    Args:
        songs: Mapping of song id to a record with "lyrics" and "year" keys.

    Returns:
        A ``(lyrics, years)`` tuple of two parallel lists, ordered like the
        mapping's keys.
    """
    pairs = [
        (record["lyrics"].lower(), record["year"])
        for record in (songs[key] for key in list(songs.keys()))
    ]
    lyrics = [text for text, _ in pairs]
    years = [year for _, year in pairs]
    return lyrics, years

def dataset(lyrics, vocab_size):
    """Encode tokenized lyrics as integer indices over a capped vocabulary.

    The ``vocab_size - 1`` most frequent words receive indices 1, 2, ... in
    order of decreasing frequency. Index 0 is reserved for the "OOB"
    placeholder, which stands in for every word outside that set.

    Args:
        lyrics: Iterable of songs, each a sequence of word tokens. It is
            traversed twice, so it must be re-iterable (e.g. a list).
        vocab_size: Total vocabulary size, including the "OOB" entry.

    Returns:
        A tuple ``(encoded_lyrics, count, word_to_index, index_to_word)``:
        ``encoded_lyrics`` is a list of per-song index lists; ``count`` is
        ``["OOB", n]`` followed by ``(word, frequency)`` tuples, where ``n``
        is the number of tokens encoded as 0; the last two are the forward
        and reverse vocabulary maps.
    """
    # Uncommon words are counted under the out-of-vocabulary placeholder.
    count = [["OOB", 0]]
    count.extend(
        collections.Counter(
            word for lyric in lyrics for word in lyric
        ).most_common(vocab_size - 1)
    )

    word_to_index = {}
    for word, _ in count:
        word_to_index[word] = len(word_to_index)

    encoded_lyrics = []
    for song in lyrics:
        encoded = [word_to_index.get(word, 0) for word in song]
        count[0][1] += encoded.count(0)
        encoded_lyrics.append(encoded)

    # Build the reverse map once, after the loop, so it is also defined when
    # `lyrics` is empty.
    index_to_word = {index: word for word, index in word_to_index.items()}
    return encoded_lyrics, count, word_to_index, index_to_word

def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level ``data``.

    A window of ``2 * skip_window + 1`` consecutive entries slides over
    ``data``. For each window position the centre entry is emitted
    ``num_skips`` times, each paired with a different randomly chosen
    context position from the same window.

    Reads the globals ``data`` and ``data_index`` and advances
    ``data_index`` so that successive calls continue where the previous one
    stopped, wrapping around at the end of ``data``.

    NOTE(review): ``data`` is presumably a flat sequence of integer word
    indices with at least ``span`` entries; neither it nor ``data_index`` is
    defined in the cells shown here - confirm before running. The
    ``encoded_lyrics`` built elsewhere in this notebook is a list of
    per-song lists, so it is not a drop-in replacement for ``data``.

    Args:
        batch_size: Number of (centre, context) pairs; must be a multiple
            of ``num_skips``.
        num_skips: Number of context words sampled per centre word; at most
            ``2 * skip_window``.
        skip_window: Number of positions considered on each side of the
            centre word.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding centre entries and context entries.

    Raises:
        AssertionError: If the size constraints above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    # Seed the window from `data`, the same sequence every later read uses.
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    # Candidate context positions do not change between iterations.
    context_words = [w for w in range(span) if w != skip_window]
    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            # Reached the end: restart the window at the beginning.
            buffer.extend(data[0:span])
            data_index = span
        else:
            # maxlen=span makes append() drop the oldest entry automatically.
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [25]:
songs_filename = "songs/songs.json"
# Read-only access is sufficient here; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open(songs_filename, "r") as songs_file:
    songs_dict = json.load(songs_file)

lyrics, years = get_lyrics_years(songs_dict)

print("[Tokenizing lyrics]")
tokenized_lyrics = [tokenize(lyric) for lyric in lyrics]
print("[Done]")

# Number of unique words to consider in our model
vocabulary_size = 50000

# encoded_lyrics is the original list of lyrics but with tokens
# replaced with their corresponding dictionary index
print("[Encoding Lyrics]")
encoded_lyrics, count, word_to_index, index_to_word = dataset(
    tokenized_lyrics, vocabulary_size)
print("[Done]")

# The raw and tokenized text is no longer needed once encoded; drop the
# references so the kernel can reclaim the memory.
del tokenized_lyrics
del lyrics

print(len(encoded_lyrics))

[Tokenizing lyrics]
[Done]
[Encoding Lyrics]
[Done]
17742


In [41]:
# count[0] is the ["OOB", n] placeholder, so index 1000 holds the
# (word, frequency) pair of the 1000th most common token.
print(count[1000])

('level', 836)
