# Rap Lyrics Generator


### Import The Libraries For All Parts

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import operator
import sys
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.layers import Embedding
from numpy import array 
from keras.utils import to_categorical
from pickle import dump
from pickle import load

Using TensorFlow backend.
  return f(*args, **kwds)


### Pt 1: Scrape Lyrics + Clean

Manually Add Links For Each Song

In [None]:
# Example song pages to scrape; add one lyrics-page URL per song.
url_links = [
    "https://www.letssingit.com/travis-scott-lyrics-stargazing-58n9ght",
    "https://www.letssingit.com/travis-scott-lyrics-carousel-j64h43v",
]

In [None]:
# Working copy of the song URLs, independent of the hand-edited url_links list.
urls = list(url_links)

# Collects the lyric lines gathered from every page.
lyrics_list = []

def generate_lyrics_file(url, timeout=30):
    """Download a lyrics page and return its non-empty lyric lines, lowercased.

    Args:
        url: Address of a page whose lyrics sit inside ``<div id="lyrics">``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one lowercased string per line of the lyrics div; empty
        lines are dropped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``<div id="lyrics">`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of parsing an error page as if it were lyrics.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="html5lib")
    div = soup.find('div', {'id': 'lyrics'})
    if div is None:
        # soup.find() returns None when nothing matches; say so explicitly
        # rather than crashing with an AttributeError on None.
        raise ValueError('No <div id="lyrics"> element found at {}'.format(url))

    return [line.lower() for line in div.text.split("\n") if line]

lyrics_list = []

for url in urls:
    lyrics_list.extend(generate_lyrics_file(url))

# Drop repeated lines but keep them in first-seen order. Going through set()
# would leave the lines in arbitrary order, and the word windows built from
# this file later on would then run across unrelated lines.
lyrics_list = list(dict.fromkeys(lyrics_list))
with open('raw_lyrics.txt', 'w') as file_handler:
    file_handler.writelines("{}\n".format(item) for item in lyrics_list)

Additional data cleaning and preparation

In [None]:
import string

# Read the whole lyrics file into memory; the with-block guarantees the
# handle is closed even if reading fails.
filename = "raw_lyrics.txt"
with open(filename) as raw_file:
    raw_text = raw_file.read()

def clean_doc(doc):
    """Split raw text into lowercase, purely alphabetic word tokens.

    Hyphens (single or double) become spaces, the text is split on whitespace,
    ASCII punctuation is removed from each piece, and any piece that is not
    entirely alphabetic afterwards is discarded.
    """
    dehyphenated = doc.replace('--', ' ').replace('-', ' ')
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for piece in dehyphenated.split():
        bare = piece.translate(strip_punctuation)
        # Pieces that were only punctuation or contain digits are skipped.
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned

tokens = clean_doc(raw_text)
# len(tokens) is the number of words, so the label says "Tokens"; the
# sequence count is only known once the word windows have been built.
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

In [None]:
# Organize into overlapping windows of tokens, each joined into one
# space-separated line of `length` words.
length = 6 + 1
# The range stops at len(tokens) + 1 so that the final window, the one ending
# on the very last token, is included as well.
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]
print('Total Sequences: %d' % len(sequences))
lines = sequences

### Pt 2: Training The LSTM

In [None]:
# Fit a word -> integer mapping on the text lines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# One more than the number of known words; used as the embedding input size
# and as the number of output classes.
vocab_size = len(tokenizer.word_index) + 1

# Encode every line as integers and stack them into a 2-D array
sequences = array(tokenizer.texts_to_sequences(lines))

# Separate into X & y: every column but the last is input, the last column
# is the word to predict (one-hot encoded over the vocabulary).
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

In [None]:
# Keras Model: embedding -> two stacked LSTMs -> dense layer -> softmax over
# the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()


# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Write a checkpoint file whenever the training loss reaches a new minimum
filepath = "weights-improvement-{epoch:02d}--{loss:.4f}-bigger.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode="min")
callbacks_list = [checkpoint]

# Fit model
model.fit(X, y, batch_size=128, epochs=100, callbacks=callbacks_list)

# Save the fitted tokenizer; the with-block makes sure the pickle is flushed
# and the handle closed.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

### Pt 3: Generate Lyrics

In [None]:
filename = "" #Add most recent weights
# Load the tokenizer
# NOTE: unpickling executes code from the file; only load a tokenizer.pkl
# that you created yourself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from random import randint
# Load doc into memory
def load_doc(filename):
    """Return the full contents of a text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The with-block closes the file even when read() raises.
    with open(filename, 'r') as handle:
        return handle.read()
 
# Generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words that continue ``seed_text``.

    At each step the running text is encoded with ``tokenizer``, cut or padded
    to ``seq_length`` integers, and fed to ``model``; the highest-scoring word
    is appended to the running text and to the result.

    Args:
        model: Model whose ``predict`` returns one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: Number of integers the model takes as input.
        seed_text: Text used to start the generation.
        n_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Invert the word -> index mapping once instead of scanning it per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    for _ in range(n_words):
        # Encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # Keep only the most recent seq_length integers
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes() is gone from current Keras; taking the argmax of
        # predict() gives the same class index and works on old versions too.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # An index with no word attached yields an empty string.
        out_word = index_to_word.get(predicted_index, '')
        # Feed the new word back in for the next step
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
 

# Each line holds the model's input words plus one final target word
seq_length = len(lines[0].split()) - 1

# Rebuild the same layer stack that was used for training
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints on its own and returns None; print() would add "None".
model.summary()

# A freshly built network only has random initial weights, so restore the
# trained ones from the checkpoint file named in `filename`.
if filename:
    model.load_weights(filename)
else:
    print('Warning: no weights file set in `filename`; generating with untrained weights')

# Load the tokenizer
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# Select seed texts. randint() includes both endpoints, so the upper bound
# must be the last valid index, not len(lines).
last_index = len(lines) - 1
seed_text1 = lines[randint(0, last_index)]
seed_text2 = lines[randint(0, last_index)]
seed_text3 = lines[randint(0, last_index)]
seed_text4 = lines[randint(0, last_index)]
seed_texts = [seed_text1, seed_text2, seed_text3, seed_text4]

# Generate six new words for every seed
generated_texts = [
    generate_seq(model, tokenizer, seq_length, seed, 6) for seed in seed_texts
]
generated1, generated2, generated3, generated4 = generated_texts

# Print each seed followed by its generated continuation
for seed, generated in zip(seed_texts, generated_texts):
    print(seed + '\n')
    print(generated)

Useful tutorials:

https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/