In [None]:
import keras
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential




In [None]:
# Load the Pink Floyd lyrics: one whitespace-stripped line of text per entry.
#
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's locale default (assumes the file is UTF-8 -- adjust if not).
# - Blank lines are skipped. An empty string would tokenize to an empty
#   sequence, which the padding step below turns into a row made only of
#   padding, i.e. a training example with no content.
with open("pink_floyd_lyrics.txt", "r", encoding="utf-8") as f:
    lyrics = [text for text in (line.strip() for line in f) if text]



In [None]:
# Tokenize the lyrics.
# First fit the tokenizer on every lyric line, then use the fitted tokenizer
# to convert each line of text into a sequence (one sequence per line).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=lyrics)

sequences = tokenizer.texts_to_sequences(texts=lyrics)



In [None]:
# Vocabulary size: the number of entries in the tokenizer's word index, plus one.
# NOTE(review): the extra slot presumably accounts for id 0, which is used as
# the padding value and never assigned to a word -- confirm against Keras docs.
vocab_size = 1 + len(tokenizer.word_index)

# Bring every sequence to the length of the longest one, padding at the front
# ("pre"), so that all rows end up with the same length.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding="pre", maxlen=max_length)



In [None]:
# Build (input, target) pairs and split them into training and testing sets.
#
# The model defined below ends in Dense(vocab_size, activation="softmax") and
# is trained with categorical_crossentropy, i.e. it predicts ONE word per
# input row and needs one-hot targets of shape (n_samples, vocab_size).
# So each lyric line becomes a next-word example:
#   input  = the line without its final word, re-padded to max_length
#   target = the line's final word, one-hot encoded over the vocabulary
#
# Two problems in the previous version are fixed here:
#   * targets were whole padded rows of integer ids (n_samples, max_length),
#     which does not match the model output or the loss;
#   * X_test had one more row than y_test (slices started at split_index and
#     split_index + 1 respectively).
train_frac = 0.8

# `sequences` is "pre"-padded, so the last column always holds the final
# token of the line. Dropping that column and padding again with one leading
# zero keeps the input width at max_length, which the Embedding layer expects.
inputs = pad_sequences(sequences[:, :-1], maxlen=max_length, padding="pre")
targets = keras.utils.to_categorical(sequences[:, -1], num_classes=vocab_size)

# Split inputs and targets at the same index so rows stay aligned and both
# halves have matching lengths. Lines are kept in file order (no shuffling).
split_index = int(train_frac * len(sequences))
X_train, y_train = inputs[:split_index], targets[:split_index]
X_test, y_test = inputs[split_index:], targets[split_index:]



In [None]:
# Create the model as one ordered stack of layers:
#   1. Embedding: vocab_size ids -> 100-dimensional vectors, inputs of length max_length
#   2. Bidirectional wrapper around an LSTM with 128 units
#   3. Dropout with rate 0.5
#   4. Dense softmax layer with vocab_size outputs
model = Sequential(
    [
        Embedding(vocab_size, 100, input_length=max_length),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(vocab_size, activation="softmax"),
    ]
)



In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the tracked metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split for 100 epochs; verbose=2 selects the
# one-line-per-epoch log format.
model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    verbose=2,
)

