In [None]:
from tensorflow import keras
import keras_tuner
from keras.layers import Input, Embedding, Bidirectional, Dense, Dropout, LSTM, GRU

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [None]:
# --- Embedding / vocabulary -------------------------------------------------
# Width of the pre-trained GloVe vectors; the glove-6b data set ships
# 50, 100, 200 and 300 dimensional files.
EMBED_DIM = 300
# None means "use all words". NOTE(review): not referenced by any cell visible
# in this notebook -- confirm whether it is still needed.
MIN_WORD_OCCURENCE = None

# --- Network shape ----------------------------------------------------------
# Units per recurrent layer (each layer is wrapped in Bidirectional).
LAYER_UNITS = 64
# Dropout rate applied after every recurrent layer in the model builders.
DENSE_DROPOUT = 0.2

# --- Hyper-parameter search -------------------------------------------------
# Candidate learning rates offered to the tuner. A wider sweep is kept for
# reference:
# LEARNING_RATES = list(np.append(np.logspace(-10, -1, num=10), np.logspace(-10, -1, num=10)*5))
LEARNING_RATES = [0.0007, 0.0009, 0.001, 0.002, 0.003, 0.004]
MAX_TRIALS = 10

# --- Training loop ----------------------------------------------------------
BATCH_SIZE = 64
MAX_EPOCHS = 200

# Stop a run once validation accuracy has not improved for 5 epochs and roll
# the model back to its best weights.
CALLBACK = [
    keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True,
        verbose=1,
    )
]

In [None]:
import string
import regex as re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tokenizers import BertWordPieceTokenizer


def preprocess(text):
    """
    Normalise a raw lyrics string before tokenisation.

    Steps, in order: drop every run of digits, drop carriage returns,
    lowercase, and delete all ASCII punctuation (``string.punctuation``).
    Line-feed characters are left in place; only ``\\r`` is removed.

    :param text: the raw string to clean
    :return: the cleaned string
    """
    # Digits first, then carriage returns (so "\r\n" collapses to "\n").
    without_digits = re.sub(r'\d+', '', text)
    without_cr = re.sub(r'\r', '', without_digits)

    # Deleting punctuation via a translation table maps each mark to nothing.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_cr.lower().translate(punctuation_table)


def encode_text_and_labels(df, max_num_words, pre_or_post='post', subword=False):
    """
    Encode the lyrics as padded integer sequences and the artists as one-hot labels.

    :param df: DataFrame with a 'text' column (documents) and an 'artist' column (labels)
    :param max_num_words: passed as ``num_words`` to the Keras ``Tokenizer``;
        not used when ``subword`` is True
    :param pre_or_post: passed as ``padding`` to ``pad_sequences``
    :param subword: when True, train a ``BertWordPieceTokenizer`` on the texts
        instead of using the word-level Keras ``Tokenizer``
    :return: tuple ``(padded_docs, onehot_encoded, vocab_size, max_length, t)``
        where ``t`` is the fitted tokenizer
    """
    # Materialise the column once so both tokenizers receive a plain list.
    texts = df['text'].tolist()

    # create a tokenizer
    if subword:
        t = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=False,
            strip_accents=False,
            lowercase=True
        )

        t.train_from_iterator(texts)
        vocab_size = t.get_vocab_size()
        # integer encode the documents
        encoded_docs = [encoding.ids for encoding in t.encode_batch(texts)]
    else:
        t = Tokenizer(num_words=max_num_words, oov_token='<unk>')
        t.fit_on_texts(texts)
        # +1 because the word index starts at 1.
        vocab_size = len(t.word_index) + 1
        # integer encode the documents
        encoded_docs = t.texts_to_sequences(texts)

    # Pad to the longest *encoded* document. Counting ``text.split(' ')``
    # pieces does not match the tokenizer's own splitting (newlines, repeated
    # spaces), which could truncate sequences or add needless padding.
    max_length = max((len(ids) for ids in encoded_docs), default=0)

    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding=pre_or_post)

    # integer encode the artist names, then expand to one-hot rows
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df['artist'])
    onehot_encoded = to_categorical(integer_encoded)
    return padded_docs, onehot_encoded, vocab_size, max_length, t


def load_and_preprocess_data(path, max_num_words=None, pre_or_post='post', subword=False):
    """
    Read the song CSV, clean it, and encode it for a recurrent network.

    :param path: path to a CSV file with 'artist', 'song' and 'text' columns
    :param max_num_words: forwarded to ``encode_text_and_labels``
    :param pre_or_post: forwarded to ``encode_text_and_labels``
    :param subword: forwarded to ``encode_text_and_labels``
    :return: the tuple produced by ``encode_text_and_labels``: padded documents,
        one-hot labels, vocabulary size, maximum sequence length, and the tokenizer
    """
    songs = pd.read_csv(path)

    # Keep only artists that have more than 100 rows in the file.
    prolific = songs.groupby('artist').filter(lambda group: len(group) > 100)

    # Replace the raw lyrics with their cleaned form.
    cleaned = prolific.assign(text=prolific['text'].apply(preprocess))

    # Treat a repeated value in the 'song' column as a cover: only the first
    # row for each value is kept.
    first_by_title = cleaned.loc[~cleaned['song'].duplicated()]

    return encode_text_and_labels(first_by_title, max_num_words, pre_or_post, subword)

In [None]:
# Kaggle input location of the song-lyrics CSV.
path = "/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv"

# Word-level encoding with default settings; `token` is the fitted tokenizer.
(
    padded_docs,
    artists_onehot_encoded,
    vocab_size,
    max_length,
    token,
) = load_and_preprocess_data(path)

In [None]:
# Parse the GloVe text file into {word: coefficient vector}. Each line is
# "<word> <c1> <c2> ... <cN>" separated by single spaces.
embedding_vector = {}
# The context manager closes the file even if a line fails to parse; the
# encoding is given explicitly instead of relying on the platform default.
with open(f'/kaggle/input/glove-6b/glove.6B.{EMBED_DIM}d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Drop the trailing newline so the last coefficient is a clean number.
        vector = line.rstrip('\n').split(' ')
        word = vector[0]
        coef = np.asarray(vector[1:], dtype='float32')
        embedding_vector[word] = coef
# print('Number of words found ',len(embedding_vector))

# Row i holds the GloVe vector of the word the tokenizer mapped to index i.
# Words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBED_DIM))
for word, i in tqdm(token.word_index.items()):
    coef = embedding_vector.get(word)
    if coef is not None:
        embedding_matrix[i] = coef

In [None]:
# Hold out 20% of the samples as the test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    artists_onehot_encoded,
    test_size=0.2,
    random_state=42,
    stratify=artists_onehot_encoded,
)

# Carve the validation set out of the remaining training data:
# 10% of the 80% left, i.e. 8% of the entire data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train,
)

In [None]:
def lstm_model(hp):
    """
    Build and compile the bidirectional-LSTM classifier for one tuner trial.

    Reads the notebook-level ``max_length``, ``vocab_size``, ``EMBED_DIM``,
    ``embedding_matrix``, ``LAYER_UNITS``, ``DENSE_DROPOUT``,
    ``LEARNING_RATES`` and ``artists_onehot_encoded``.

    :param hp: hyper-parameter object supplied by the tuner; only
        ``hp.Choice`` is used, to pick the learning rate
    :return: a compiled ``keras.Model``
    """
    token_ids = Input(shape=(max_length,))

    # Frozen embedding initialised from the GloVe-derived matrix.
    glove_embedding = Embedding(
        vocab_size,
        EMBED_DIM,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    )
    features = glove_embedding(token_ids)

    # Three stacked bidirectional LSTM layers, each followed by dropout. The
    # first two emit the full sequence for the next layer; the last emits
    # only its final output.
    for emit_sequence in (True, True, False):
        features = Bidirectional(LSTM(LAYER_UNITS, return_sequences=emit_sequence))(features)
        features = Dropout(DENSE_DROPOUT)(features)

    # One softmax unit per one-hot label column.
    n_classes = artists_onehot_encoded.shape[1]
    class_probs = Dense(n_classes, activation="softmax")(features)

    lstm = keras.Model(inputs=token_ids, outputs=class_probs)

    # The learning rate is the only hyper-parameter exposed to the tuner.
    chosen_lr = hp.Choice('learning_rate', LEARNING_RATES)
    lstm.compile(
        optimizer=keras.optimizers.Adam(learning_rate=chosen_lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return lstm

In [None]:
def gru_model(hp):
    """
    Build and compile the bidirectional-GRU classifier for one tuner trial.

    Reads the notebook-level ``max_length``, ``vocab_size``, ``EMBED_DIM``,
    ``embedding_matrix``, ``LAYER_UNITS``, ``DENSE_DROPOUT``,
    ``LEARNING_RATES`` and ``artists_onehot_encoded``.

    :param hp: hyper-parameter object supplied by the tuner; only
        ``hp.Choice`` is used, to pick the learning rate
    :return: a compiled ``keras.Model``
    """
    token_ids = Input(shape=(max_length,))

    # Frozen embedding initialised from the GloVe-derived matrix.
    glove_embedding = Embedding(
        vocab_size,
        EMBED_DIM,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    )
    features = glove_embedding(token_ids)

    # Three stacked bidirectional GRU layers, each followed by dropout. The
    # first two emit the full sequence for the next layer; the last emits
    # only its final output.
    for emit_sequence in (True, True, False):
        features = Bidirectional(GRU(LAYER_UNITS, return_sequences=emit_sequence))(features)
        features = Dropout(DENSE_DROPOUT)(features)

    # One softmax unit per one-hot label column.
    n_classes = artists_onehot_encoded.shape[1]
    class_probs = Dense(n_classes, activation="softmax")(features)

    gru = keras.Model(inputs=token_ids, outputs=class_probs)

    # The learning rate is the only hyper-parameter exposed to the tuner.
    chosen_lr = hp.Choice('learning_rate', LEARNING_RATES)
    gru.compile(
        optimizer=keras.optimizers.Adam(learning_rate=chosen_lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return gru

In [None]:
# The LSTM search is currently switched off; uncomment this block (and the
# matching search/summary cells) to tune `lstm_model` as well.
# lstm_tuner = keras_tuner.BayesianOptimization(
#     hypermodel=lstm_model,
#     objective="val_accuracy",
#     max_trials=MAX_TRIALS,
#     seed=42,
#     directory="tuner_results",
#     project_name="lstm"
# )

# Bayesian search over the GRU model's hyper-parameters, maximising
# validation accuracy. Results are written under tuner_results/gru.
# NOTE(review): KerasTuner reloads an existing project from this directory
# unless overwrite=True is passed -- confirm stale trials are not being reused.
gru_tuner = keras_tuner.BayesianOptimization(
    hypermodel=gru_model,
    objective="val_accuracy",
    max_trials=MAX_TRIALS,
    directory="tuner_results",
    project_name="gru",
    seed=42,
)

# Show what the tuner is allowed to vary.
# lstm_tuner.search_space_summary()
print('\n')
gru_tuner.search_space_summary()

In [None]:
# lstm_tuner.search(
#     X_train,
#     y_train,
#     validation_data=(X_val, y_val),
#     epochs=MAX_EPOCHS,
#     batch_size=BATCH_SIZE,
#     callbacks=CALLBACK,
#     use_multiprocessing=True,
# )

In [None]:
# # get best model's hyper parameters
# lstm_tuner.get_best_hyperparameters(num_trials=1)
# print('\n')
# lstm_tuner.results_summary(num_trials=MAX_TRIALS)


In [None]:
# Run the hyper-parameter search for the GRU model. Every trial trains for at
# most MAX_EPOCHS epochs and is cut short by the early-stopping callback.
# `use_multiprocessing` is no longer passed: the tuner forwards it to
# `Model.fit`, where it only applies to generator / Sequence input (the data
# here are in-memory arrays) and newer Keras versions reject the argument.
gru_tuner.search(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=CALLBACK,
)

In [None]:
# Get the best trial's hyper-parameters and actually show them: the call's
# return value was previously discarded, so nothing was displayed.
best_gru_hps = gru_tuner.get_best_hyperparameters(num_trials=1)
if best_gru_hps:
    print(best_gru_hps[0].values)
else:
    # No trial finished, so there is nothing to report yet.
    print('No completed GRU trials found.')
print('\n')
gru_tuner.results_summary(num_trials=MAX_TRIALS)