Data Collection
--------------

In [4]:
import urllib2
import re
import sys
import bs4
import logging
import os

In [2]:
main_url = "http://www.azlyrics.com/m/mfdoom.html" # just a link i found with a bunch of mf doom lyrics
lyrics_base = "http://www.azlyrics.com/lyrics/mfdoom/"
pattern = 'lyrics/mfdoom/(.*)\.html'
output_dir = "../lyrics/"

attempts = 4

logging.basicConfig(format='%(asctime)s %(message)s', stream=sys.stdout, level="INFO")

In [3]:
def fetch_with_retries(url, attempt):
    try:
        return urllib2.urlopen(url).read()
    except Exception as e:
        if attempt < attempts:
            return fetch_with_retries(url, attempt+1)
        else:
            logging.error("unable to fetch %s, error: %s" % (url, e))

In [4]:
html = fetch_with_retries(main_url, 0)
soup = bs4.BeautifulSoup(html)

for link in soup.find_all("a"):
    if link.has_key('href'):
        for m in re.findall(pattern, link['href']):
            url = "%s%s.html" % (lyrics_base, m)
            logging.info("collecting %s" % url)
            outfile = os.path.join(output_dir, m)
            out = open(outfile, "w")
            u_html = fetch_with_retries(url, 0)
            out.write(u_html)
            out.flush()
            out.close()

2017-06-19 11:27:07,855 unable to fetch http://www.azlyrics.com/m/mfdoom.html, error: ''


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Data Cleaning
------------

In [1]:
input_dir = "../lyrics/"
output_dir = "../lyrics_clean/"
pattern = r'MF Doom Lyrics\s+(.+?)\s+'

In [2]:
def filter_html(element):
    return not element.parent.name in ['style', 'script', '[document]', 'head', 'title']

def text_from_file(file):
    html = open(file, "r").read()
    soup = bs4.BeautifulSoup(html, "lxml")
    text_stuff = soup.findAll(text=True)
    parts =  "".join(filter(lambda t: filter_html(t), text_stuff)).replace("Usage of azlyrics.com content by any third-party lyrics provider is prohibited by our licensing agreement. Sorry about that.", "").split("\n")
    no_names = "|".join([x.strip() for x in parts if x and not (x.startswith("[") or x.endswith("]"))])
    no_bottom = re.sub(r'MxM banner.*', "", no_names)
    no_top =  re.sub(r'.*?MF Doom Lyrics', "", no_bottom)
    return no_top.replace("|", "\n")

In [5]:
for sub, dirs, files in os.walk(input_dir):
    for file in files:
        with open(os.path.join(output_dir, file), 'w') as out:
            text = text_from_file(os.path.join(input_dir, file))
            out.write("%s\n" % text.encode('utf-8'))
            out.flush()

Lyric Generation
---------------

In [6]:
input_dir = "../lyrics_clean/"

In [8]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [9]:
def process_file(dir, file):
    content = open(os.path.join(dir, file), "r").read()
    body = re.split('\n{2,}', content)
    return body[1].lower().split("\n")

In [10]:
def song_data(dir):
    """
        just takes all the data and puts it together
        doing fully functional programming in python is a pain
    """
    all_files = reduce(lambda x,y: x+y, [files for sub, dirs, files in os.walk(dir)], [])
    song_lyrics = [process_file(dir, file) for file in all_files]
    return song_lyrics

In [11]:
songs = song_data(input_dir)

In [13]:
string = " ".join(reduce(lambda x,y: x+y, songs))

In [15]:
chars = sorted(list(set(string)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [16]:
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(string) - maxlen, step):
    sentences.append(string[i: i + maxlen])
    next_chars.append(string[i + maxlen])

In [17]:
x = np.zeros((len(sentences), maxlen, len(string)), dtype=np.bool)
y = np.zeros((len(sentences), len(string)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [18]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [19]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [20]:
# train the model, output generated text after each iteration
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x, y,
              batch_size=128,
              epochs=1)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

()
--------------------------------------------------
('Iteration', 1)


ValueError: Error when checking input: expected lstm_1_input to have shape (None, 40, 76) but got array with shape (107097, 40, 321331)