### Useful links:
* https://medium.com/analytics-vidhya/music-lyrics-analysis-using-natural-language-processing-7647922241c0
* https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

## Load, clean, tokenize, and create sequences of lyrics

In [2]:
# load lyrics into memory
def load_lyrics(filename):
    # open the lyrics as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load lyrics
in_filename = 'texts/Action Bronson_lyrics_of_50_songs_v2.txt'
lyrics = load_lyrics(in_filename)
print(lyrics[:500])

Heartbreak drowned sorrows in a large steak

Why you always all on my back?
Why you gotta do me like that?
Why you gotta act like a bitch when I'm with you?
Baby girl, I'm blue

Because you treat me like shit
I paid for the bed and never even slept in it
I paid for that crib I never stepped foot in
And now somebody else is eating all the pudding
Things change, now my dashboard wooden
All black Benz, like a young Doc Gooden
Dark shades, 'cause I'm stone crazy
Girl, we grown, stop playin' on my ph


In [3]:
import string

# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

# clean document
tokens = clean_doc(lyrics)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['heartbreak', 'drowned', 'sorrows', 'in', 'a', 'large', 'steak', 'why', 'you', 'always', 'all', 'on', 'my', 'back', 'why', 'you', 'gotta', 'do', 'me', 'like', 'that', 'why', 'you', 'gotta', 'act', 'like', 'a', 'bitch', 'when', 'im', 'with', 'you', 'baby', 'girl', 'im', 'blue', 'because', 'you', 'treat', 'me', 'like', 'shit', 'i', 'paid', 'for', 'the', 'bed', 'and', 'never', 'even', 'slept', 'in', 'it', 'i', 'paid', 'for', 'that', 'crib', 'i', 'never', 'stepped', 'foot', 'in', 'and', 'now', 'somebody', 'else', 'is', 'eating', 'all', 'the', 'pudding', 'things', 'change', 'now', 'my', 'dashboard', 'wooden', 'all', 'black', 'benz', 'like', 'a', 'young', 'doc', 'gooden', 'dark', 'shades', 'cause', 'im', 'stone', 'crazy', 'girl', 'we', 'grown', 'stop', 'playin', 'on', 'my', 'phone', 'baby', 'all', 'your', 'childish', 'attempts', 'to', 'make', 'me', 'angry', 'fall', 'short', 'which', 'only', 'fuels', 'the', 'rage', 'you', 'have', 'because', 'you', 'have', 'nothing', 'understandable', 'im', '

In [4]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 20979


In [5]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    
# save sequences to file
out_filename = 'texts/action_bronson_sequences.txt'
save_doc(sequences, out_filename)

## Train Language Model

In [6]:
# load
in_filename = 'texts/action_bronson_sequences.txt'
lyrics = load_lyrics(in_filename)
lines = lyrics.split('\n')

In [7]:
import tensorflow as tf

from numpy import array
from pickle import dump
from tf.preprocessing.text import Tokenizer
from tf.utils import to_categorical
from tf.models import Sequential
from tf.layers import Dense
from tf.layers import LSTM
from tf.layers import Embedding

# load
in_filename = 'texts/action_bronson_sequences.txt'
lyrics = load_lyrics(in_filename)
lines = lyrics.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

ModuleNotFoundError: No module named 'tensorflow'

In [8]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

ModuleNotFoundError: No module named 'keras'

In [1]:
import tensorflow.keras

ModuleNotFoundError: No module named 'tensorflow'