In [2]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import sqlite3

In [3]:
conn = sqlite3.connect('funk_crawler/songs.db')
cursor = conn.cursor()

In [4]:
cursor.execute('SELECT text from songs')
songs = cursor.fetchall()
text = '\n'.join([song[0] for song in songs])

In [5]:
import unicodedata
import string

def shave_marks_latin(txt):
    norm_text = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_text:
        if unicodedata.combining(c) and latin_base:
            continue
        keepers.append(c)
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)

In [6]:
text = shave_marks_latin(text).lower()

In [7]:
import re
letter_space_re = re.compile(r'[^a-z\s0-9]')

In [8]:
text = letter_space_re.sub('', text)

In [9]:
# NORMALIZE!

In [10]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 38


In [11]:
char_indices

{'\n': 0,
 ' ': 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11,
 'a': 12,
 'b': 13,
 'c': 14,
 'd': 15,
 'e': 16,
 'f': 17,
 'g': 18,
 'h': 19,
 'i': 20,
 'j': 21,
 'k': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'q': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'x': 35,
 'y': 36,
 'z': 37}

In [12]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 1542756


In [13]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [14]:
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

Build model...


In [15]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [16]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [17]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

On CPU 4 hours per epoch, on GPU 10 min

In [18]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=5,
#           callbacks=[print_callback]
         )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x197d2289710>

In [26]:
model.save('model.h5')

In [25]:
model.fit(x, y,
          batch_size=128,
          epochs=2,
#           callbacks=[print_callback]
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x197d15a9588>

In [23]:
for diversity in [0.2, 0.5, 1.0, 1.2]:
    print('----- diversity:', diversity)

    generated = ''
    sentence = "senta novinha"
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

----- diversity: 0.2
----- Generating with seed: "senta novinha"
senta novinhae   e    aoa 

  after removing the cwd from sys.path.


o  u aaeu o oao     ouema  en  a a a  m  a e      n  ea  o   ae  a aob a a ooaa  o n  o o oa e n oa e e  n o  a ae  ae 6sa  oa  aw       e    aa aoaa oooe  mao aaea    oea   xe  oaota     aa e     oaea      e  ea   o  e e      a       eoee  ana t ea a   a aan  o    r    oo co aoe o  uos eaoo e   a   e   aoo   o          eso a    o  aaaaee a e  n  d  a e o oe  xa      aea    e as  a no
----- diversity: 0.5
----- Generating with seed: "senta novinha"
senta novinhao dae eo ieao ost igso  snuc emermrua xrn e nnen   mae  o i aiasen 
 ors e i   e ada   a o m
m coau e
aan tmoaurmecameue  u    norooeau e ao on e e ta t  de u n 0a taneoaeraaaf seaomoromueana ere seed uoam  ooenam e a aeoon x enr tmama e   aaao  oue  eeam ooaooarl as  aevaim aam amh ol  eovnea naia  aee e m moolma m s hde  oe8aemaasocto
saaeu ad   m eei  xsl  esao  aerdaanaeao eroe aeooaeaad m   s
----- diversity: 1.0
----- Generating with seed: "senta novinha"
senta novinhaoa
eruemlmaeaeeuaaaauei su r ssco rn erhtircaei 
e  dau