In [9]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras_preprocessing.sequence import pad_sequences
import keras.utils as ku

# utility function for plotting graphs
def plot_graphs(history, metric):
  """Plot a training metric alongside its validation counterpart.

  Args:
    history: Object exposing a ``history`` mapping from metric name to a
      per-epoch sequence of values (presumably the object returned by
      ``model.fit`` -- confirm against the caller).
    metric: Name of the training metric; the validation series is looked
      up under ``'val_' + metric``.
  """
  val_metric = 'val_'+metric
  per_epoch = history.history
  plt.plot(per_epoch[metric])
  # Empty format string: let matplotlib choose the default line style.
  plt.plot(per_epoch[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

# Dataset locations (relative to the notebook's working directory)
songs_path = 'datasets/lyrics.csv'
poems_path = 'datasets/PoetryFoundationData.csv'

# Load the songs (tab-separated file)
songs_df = pd.read_csv(songs_path, sep="\t")
print("Length of songs dataset: ", len(songs_df))

# Load the poems (default comma-separated file)
poems_df = pd.read_csv(poems_path)
print("Length of poems dataset: ", len(poems_df))

# Stack the two text columns into one Series, drop missing entries,
# and keep a plain Python list of the texts for the later cells.
combined_df = pd.concat(
    [songs_df['lyrics'], poems_df['Poem']],
    axis=0,
    ignore_index=True,
).dropna()
combined_list = combined_df.tolist()




Length of songs dataset:  20404
Length of poems dataset:  13854


In [10]:
# utility function for cleaning the lyrics
def clean_lyrics(lyrics):
    """Normalise one lyric/poem string before tokenisation.

    Steps, in order:
      1. Replace line breaks with a single space. Both forms are handled:
         the literal two-character escape (a backslash followed by "n")
         and real newline / carriage-return characters.
      2. Remove bracketed annotations such as "[Chorus]".
      3. Strip every character in ``string.punctuation``.
      4. Lowercase the result.

    Args:
        lyrics: Raw text of a single song or poem.

    Returns:
        The cleaned, lowercased string. Runs of spaces left behind by the
        removals are not collapsed.
    """
    # Matching only the escaped form would leave real CR/LF characters in
    # the text, so both alternatives are listed explicitly.
    lyrics = re.sub(r'\\n|[\r\n]', ' ', lyrics)
    # Non-greedy so that each [..] tag is removed on its own.
    lyrics = re.sub(r'\[.*?\]', '', lyrics)
    # Remove punctuation
    lyrics = lyrics.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    return lyrics.lower()

# Clean every text in place, one entry at a time
for idx, raw_text in enumerate(combined_list):
    combined_list[idx] = clean_lyrics(raw_text)

# Tokenize the lyrics
def tokenize_lyrics(lyrics):
    """Fit a new ``Tokenizer`` on the given texts.

    Args:
        lyrics: Iterable of text strings to build the vocabulary from.

    Returns:
        A ``(tokenizer, total_words)`` tuple, where ``total_words`` is the
        size of the fitted ``word_index`` plus one (presumably to leave
        room for the reserved index 0 -- confirm against the Keras docs).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lyrics)
    vocab_size = 1 + len(fitted.word_index)
    return fitted, vocab_size

# Fit a single tokenizer over the whole cleaned corpus; total_words is the
# vocabulary size plus one, as returned by tokenize_lyrics.
tokenizer, total_words = tokenize_lyrics(combined_list)

def get_sequences(tokenizer, lyrics):
    """Expand each text into all of its token prefixes of length >= 2.

    Every text is encoded on its own via ``tokenizer.texts_to_sequences``;
    for an encoded text ``[t0, t1, ..., tn]`` the prefixes ``[t0, t1]``,
    ``[t0, t1, t2]``, ... up to the full list are emitted in that order.
    Texts that encode to fewer than two tokens contribute nothing.

    Args:
        tokenizer: Object providing ``texts_to_sequences(list_of_texts)``.
        lyrics: Iterable of text strings.

    Returns:
        A flat list of token-id lists (the n-gram prefixes).
    """
    prefixes = []
    for text in lyrics:
        encoded = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes
# NOTE(review): despite the name, this is the longest cleaned text measured
# in characters (len of each string), not in tokens.
max_tokenized_len = max(len(text) for text in combined_list)
print(combined_list[:5])

# Convert each text to its list of token ids
sequences = tokenizer.texts_to_sequences(combined_list)

# Longest encoded text, in tokens; every sequence is padded up to this length
max_sequence_len = max(map(len, sequences))
print("Max sequence length: ", max_sequence_len)
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_len,
    padding='post',
)

print("Padded sequences: ", padded_sequences[0])


[' thought id end up with sean but he wasnt a match wrote some songs about ricky now i listen and laugh even almost got married and for pete im so thankful wish i could say thank you to malcolm cause he was an angel   one taught me love one taught me patience and one taught me pain now im so amazing say ive loved and ive lost but thats not what i see so look what i got look what you taught me and for that i say   thank you next next thank you next next thank you next im so fuckin grateful for my ex thank you next next thank you next next thank you next next im so fuckin—   spend more time with my friends i aint worried bout nothin plus i met someone else we havin better discussions i know they say i move on too fast but this one gon last cause her name is ari and im so good with that so good with that   she taught me love love she taught me patience patience how she handles pain pain that shits amazing yeah shes amazing ive loved and ive lost yeah yeah but thats not what i see yeah yea

In [11]:
# Build the model
def create_model(max_sequence_length, total_words,
                 embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile an Embedding -> LSTM -> Dropout -> softmax model.

    Args:
        max_sequence_length: Length of the full token sequences; the model's
            input length is ``max_sequence_length - 1``.
        total_words: Vocabulary size, used both as the embedding input
            dimension and as the width of the softmax output layer.
        embedding_dim: Size of each embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 100).
        dropout_rate: Dropout rate applied after the LSTM (default 0.1).

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # One position shorter than the full sequence -- presumably the final
    # token is held out as the prediction target (confirm against the
    # training-data preparation).
    input_len = max_sequence_length - 1
    model = Sequential()
    # Add an embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    # Add an LSTM layer
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    # Add an output layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the network from the padded sequence length and vocabulary
# size computed in the previous cell, then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 115272, 10)        3040610   
                                                                 
 lstm_2 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 304061)            30710161  
                                                                 
Total params: 33,795,171
Trainable params: 33,795,171
Non-trainable params: 0
_________________________________________________________________


2024-04-17 04:50:29.674368: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-17 04:50:29.676733: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-17 04:50:29.677725: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [None]:
# NOTE(review): this is a bare attribute reference, not a call -- no training
# is started by this cell. TODO: call model.fit(...) with inputs and targets.
model.fit