# AutoPump

### This project builds a generative network that writes new song lyrics. The network trains on lyrics obtained through lyricsgenius -- a Python client for Genius, a website hosting plaintext lyrics for just about every popular song ever released.

#### We start by accessing the lyricsgenius API to download lyrics for a given artist (for demonstration we use Lil Pump).

In [None]:
#import the lyricsgenius python package (can be installed with pip install lyricsgenius)
import lyricsgenius
#import required packages
import pandas as pd
import numpy as np
import json
import glob
import numpy.random as random

import keras.models as km
import keras.layers as kl

In [None]:
# Download the artist's lyrics through the lyricsgenius API, unless a local
# copy already exists.
import os

# Client token (acquired from making a lyricsgenius account).
# The GENIUS_CLIENT_TOKEN environment variable takes precedence so the
# credential does not have to live in the notebook; the embedded value is only
# a backward-compatible fallback.
# NOTE(review): the embedded token is exposed to anyone who can read this
# file -- it should be rotated and removed.
client_token = (
    os.environ.get('GENIUS_CLIENT_TOKEN')
    or 's86ExsILIhrEXrTfQmePOYXJ6jPT9KACHQ22dXs960suWEpwa4HQwUn56AB5Gsx7'
)

# Create an instance of lyrics genius
genius = lyricsgenius.Genius(client_token)

# Pick an artist and save their lyrics, here we use Lil Pump, and we choose to sort by title
# NOTE -- THE LYRICS GENIUS API WILL SOMETIMES TIME OUT -- THIS IS AN ISSUE WITH THE API. JUST RERUN UNTIL IT WORKS.
artist_name = "Lil Pump"
artist_tag = artist_name.replace(" ", "")

# Skip the download when the lyrics file is already on disk.
# (Assumes save_lyrics() writes "Lyrics_<artist_tag>.json" into the working
# directory, which is the name the rest of the notebook looks for.)
if not glob.glob("Lyrics_" + artist_tag + ".json"):
    artist = genius.search_artist(artist_name, sort="title")
    # search_artist returns None when the artist cannot be found; fail with a
    # clear message instead of an AttributeError on the next line.
    if artist is None:
        raise RuntimeError("Genius search returned no artist for " + repr(artist_name))
    artist.save_lyrics()

In [None]:
# Load the downloaded lyrics JSON into a dataframe and export the
# title/lyrics columns to CSV for the preprocessing step.

# Get the song lyric file names
lyric_files = glob.glob("Lyrics_" + artist_tag + ".json")
if not lyric_files:
    raise FileNotFoundError(
        "No lyrics file found for " + artist_tag + "; run the download cell first."
    )

# Load the lyrics into a dataframe: each file's "songs" entry is turned into
# its own frame (one row per song) and the frames are stacked.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
song_frames = []
for lyric_file in lyric_files:
    predf = pd.read_json(lyric_file, orient='index', typ='series')
    song_frames.append(pd.DataFrame(predf.songs))
df = pd.concat(song_frames)

# Store title and lyrics in a dataframe
data = df[['title', 'lyrics']]

# Show a small random preview; the sample size is capped so that artists with
# fewer than three songs do not raise.
print(data.sample(min(3, len(data))))

data.to_csv('lyrics_titles_' + artist_tag + '.csv')

In [None]:
# Build the training corpus: read the saved lyrics, strip section markers,
# pad punctuation with spaces so it tokenizes separately, lower-case the text
# and split it into a list of word tokens (corpus1).
data = pd.read_csv('lyrics_titles_' + artist_tag + '.csv')

# Create a string of the entire set of lyrics; entries that are not strings
# are skipped.
corpus0 = "".join(text for text in data['lyrics'] if isinstance(text, str))

# Remove unwanted characters/strings from the corpus.
# NOTE(review): matching is case-sensitive and happens before lower-casing, so
# e.g. '[Verse 1]' is NOT removed by '[verse 1]' -- confirm whether intended.
unwanted_strings = (
    '[verse]', '[intro]', '[outro]', '[bridge]', '[chorus]',
    '[Intro]', '[Outro]', '[Bridge]', '[Chorus]',
    '[verse 1]', '[verse 2]', '[verse 3]', '[verse 4]',
    'Lyrics',
)
for unwanted in unwanted_strings:
    corpus0 = corpus0.replace(unwanted, '')

# Include spaces between punctuation and words, to reduce unique words.
# The replacements are applied strictly in this order: later entries act on
# the output of earlier ones (e.g. the curly apostrophe is normalised before
# the apostrophe/quote rules run).
spacing_rules = [
    (',', ' , '),
    ('(', ' ( '),  # opening parenthesis stays a parenthesis token
    (')', ' ) '),
    ('[', ' [ '),
    (']', ' ] '),
    ('.', ' . '),
    (';', ' ; '),
    (':', ' : '),
    ('!', ' ! '),
    ('?', ' ? '),
    ('*', ' * '),
    ("’", "'"),
    ("''", ' " '),
    ('"', ' " '),
    ("'", " ' "),
    ('\r\n', ' \r\n '),
    ('-', ' - '),
    ('\n', ' \n '),
    ('\u2005', ' '),
    ('\u205f', ' '),
    ('—', ' — '),
    ('¿', ' ¿ '),
    ('¡', ' ¡ '),
]
for old, new in spacing_rules:
    corpus0 = corpus0.replace(old, new)

# Convert the text to lower case so that lower and uppercase are not treated differently
corpus0 = corpus0.lower()

# Split the words by spaces and drop the empty strings produced by runs of
# consecutive spaces (single pass instead of repeated list.remove calls).
corpus1 = [word for word in corpus0.split(' ') if word]

print('The number of words in the corpus is ', len(corpus1))

In [None]:
# With preprocessing finished, derive the vocabulary: every distinct token in
# the corpus, in sorted order.
words = sorted(set(corpus1))
num_words = len(words)

# Two lookup tables let sentences be represented as sequences of integers:
# decoding maps an integer id to its word, encoding maps a word back to its id.
decoding = dict(enumerate(words))
encoding = {word: index for index, word in decoding.items()}

print('We have', num_words, 'unique words.')
corpus = corpus1

In [None]:
# Hyperparameter sweep: for every combination of sentence length, LSTM width
# and dropout rate, train a bidirectional-LSTM next-word model on the corpus
# and record its per-epoch validation accuracy.
s_length = [15]
n_LSTMs = [128]
drop_rates = [0.1]

# Full grid (swap in for the single-point lists above to run the whole sweep):
#s_length = [5,10,50]
#n_LSTMs=[8,32,128]
#drop_rates=[0.0,0.5,0.8]

n_epochs = 50

# val_accuracies[sentence_indx, lstm_indx, d_indx] is the validation-accuracy
# curve of one run; sized 3x3x3 so the full grid fits, unused slots stay zero.
val_accuracies = np.zeros((3, 3, 3, n_epochs))

for sentence_indx, sentence_length in enumerate(s_length):

    # The training tensors depend only on sentence_length, so they are built
    # once here rather than once per (n_LSTM, drop_rate) combination.

    # Initialize empty lists to store the data
    x_data = []
    y_data = []

    # Slide a window over the corpus: each run of sentence_length words is
    # encoded and saved in x_data, and the word that follows it is encoded
    # and saved in y_data.
    for i in range(0, len(corpus) - sentence_length):
        sentence = corpus[i: i + sentence_length]
        next_word = corpus[i + sentence_length]
        x_data.append([encoding[word] for word in sentence])
        y_data.append(encoding[next_word])

    # Determine the number of word sequences ("sentences") in the data
    num_sentences = len(x_data)
    print('We have', len(x_data), 'sentences.')

    # Create the variables to hold the one-hot data as it will be used.
    # (The builtin bool is used: the np.bool alias was removed in NumPy 1.24.)
    x = np.zeros((num_sentences, sentence_length, num_words), dtype=bool)
    y = np.zeros((num_sentences, num_words), dtype=bool)

    # Populate the sentences. x is indexed as:
    #     Index 1. sentence number
    #     Index 2. position of the word within the sentence
    #     Index 3. one-hot position of the word in the unique word list
    print('Encoding data.')
    for i, sentence in enumerate(x_data):
        for t, encoded_word in enumerate(sentence):
            x[i, t, encoded_word] = 1
        y[i, y_data[i]] = 1

    for lstm_indx, n_LSTM in enumerate(n_LSTMs):

        for d_indx, drop_rate in enumerate(drop_rates):

            print('Building network.')
            model = km.Sequential()
            # Bidirectional LSTM: the data is passed forwards and backwards
            # through the network, since words in a sentence depend on both
            # previous and future words.
            model.add(kl.Bidirectional(kl.LSTM(n_LSTM, return_sequences=False),
                                       input_shape=(sentence_length, num_words)))
            model.add(kl.Dropout(drop_rate))
            model.add(kl.Dense(num_words, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
            fit = model.fit(x, y, epochs=n_epochs, batch_size=128, validation_split=.1, verbose=2)

            # NOTE(review): 'val_acc' fallback assumes older Keras releases
            # report the metric under that key -- confirm for the installed version.
            history = fit.history
            val_accuracies[sentence_indx, lstm_indx, d_indx] = history.get(
                "val_accuracy", history.get("val_acc")
            )

In [None]:
# Run another 50 epochs of fitting on the model left over from the sweep cell,
# using the same data, batch size and 10% validation split.
fit = model.fit(
    x,
    y,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    verbose=2,
)

In [None]:
# Generate 150 words of lyrics with the trained model, starting from a random
# seed sentence and repeatedly feeding the model its own predictions.

# Randomly choose sentence_length words from the dictionary of words as our
# starting sentence. randint's upper bound is exclusive, so num_words (not
# num_words - 1) is needed for the last word to be selectable.
seed = [decoding[np.random.randint(0, num_words)] for _ in range(sentence_length)]

# One-hot encode the seed sentence.
# (The builtin bool is used: the np.bool alias was removed in NumPy 1.24.)
ax = np.zeros((1, sentence_length, num_words), dtype=bool)
for i, w in enumerate(seed):
    ax[0, i, encoding[w]] = 1

text = ''

# Run the seed sentence through the model.  Add the output to the
# generated text.  Take the output and append it to the seed sentence
# and remove the first word from the seed sentence.  Then repeat until
# you've generated as many words as you like.
for i in range(150):

    # Get the most probable next word (greedy decoding).
    pred = int(np.argmax(model.predict(ax, verbose=0)))

    # Break the output into lines of ten words.
    if i % 10 == 0:
        text += '\n'

    # Put in verse and chorus flags for style
    if i == 0:
        text += "\n[Verse]\n\n"

    if i == 75:
        text += "\n\n[Chorus]\n"

    # Add it to the generated text.
    text += decoding[pred].capitalize() + " "

    # One-hot encode the predicted word.
    next_word = np.zeros((1, 1, num_words), dtype=bool)
    next_word[0, 0, pred] = 1

    # Concatenate the next word to the seed sentence, but leave off
    # the first element so that the length stays the same.
    ax = np.concatenate((ax[:, 1:, :], next_word), axis=1)


# Print out the generated text.
print("Lyrics: \n")
print(text)

In [None]:
# Full hyperparameter grid (sentence lengths, LSTM widths, dropout rates);
# the plotting cell reads these lists to label and colour its curves.
s_length, n_LSTMs, drop_rates = (
    [5, 10, 50],
    [8, 32, 128],
    [0.0, 0.5, 0.8],
)

In [None]:
from pylab import *

In [None]:

# Plot the recorded validation-accuracy curves: one panel per
# (sentence length, LSTM width) pair, one curve per dropout rate.

# Curve colour for each dropout rate; a rate that is not listed falls back to
# gray instead of reusing a stale colour or failing on an unset variable.
dropout_colors = {0.8: "black", 0.5: "blue", 0.0: "green"}

# Number of recorded epochs per run, taken from the last axis of the array.
n_points = val_accuracies.shape[-1]

figure(figsize=(20, 10))
counter = 0
# Only the first two sentence lengths are drawn, matching the 2x3 subplot
# grid below (iterating over len(val_accuracies) would need more panels).
for i in range(2):
    for j in range(len(val_accuracies[i])):
        subplot(2, 3, counter + 1)
        # Horizontal red reference line at 0.3
        # NOTE(review): significance of the 0.3 level is not stated -- confirm.
        plot(arange(0, n_points), ones(n_points) * .3, color='red')
        xlabel("s_length=" + str(s_length[i]) + ", n_LSTM=" + str(n_LSTMs[j]))
        ylim(.1, .48)
        counter += 1
        for k in range(len(val_accuracies[i][j])):
            curve = val_accuracies[i][j][k]
            # Slots that are still all zeros were never filled by a run; skip them.
            if max(curve) != 0:
                plot(curve,
                     color=dropout_colors.get(drop_rates[k], "gray"),
                     label="dropout_rate=" + str(drop_rates[k]))

        legend(loc="upper left")