In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding

from sklearn.model_selection import train_test_split

## Preprocessing

In [10]:
import string
import regex as re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


def preprocess(text):
    # Remove integers
    text = re.sub(r'\d+', '', text)

    # remove newlines as \r and \n
    text = re.sub(r'\r', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    return text


def encode_text_and_labels(df, max_num_words, pre_or_post='post'):
    
    # create a tokenizer
    t = Tokenizer(num_words=max_num_words, oov_token='<unk>')
    t.fit_on_texts(df['text'])
    vocab_size = len(t.word_index) + 1
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(df['text'])
    # pad documents to be as long as the longest sequence in the dataset
    max_length = df['text'].apply(lambda x: len(x.split(' '))).max()
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding=pre_or_post)

    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df['artist'])
    # binary encode
    onehot_encoded = to_categorical(integer_encoded)
    return padded_docs, onehot_encoded, vocab_size, max_length, t


def load_and_preprocess_data(path, max_num_words, pre_or_post='post'):
    """
    Load the data and preprocess it
    :param path: path to the data
    :return: preprocessed data in the form of a pandas dataframe. The first item returned is the data,
    the second is the labels, the third is the vocabulary size, and the fourth is the maximum length of a sequence
    """
    df = pd.read_csv(path)
    
    df = df.groupby('artist').filter(lambda x: len(x) > 100)

    df['text'] = df['text'].apply(preprocess)

    # Identify the rows that contain duplicated text in the 'song' column
    no_covers = ~df['song'].duplicated()

    # Filter the DataFrame to include only the rows with unique text
    df = df[no_covers]

    # prepare text data for a recurrent network
    return encode_text_and_labels(df, max_num_words, pre_or_post='post')


In [11]:
from tqdm import tqdm
embedding_vector = {}
f = open('/kaggle/input/glove-6b/glove.6B.100d.txt') 
for line in tqdm(f):
    vector = line.split(' ')
    word = vector[0]
    coef = np.asarray(vector[1:],dtype = 'float32')
    embedding_vector[word]=coef
f.close()
print('Number of words found ',len(embedding_vector))

400000it [00:08, 47358.12it/s]

Number of words found  400000





In [14]:
padded_docs, artists_onehot_encoded, vocab_size, max_length, token = load_and_preprocess_data("/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv", None)

In [15]:
embedding_matrix = np.zeros((vocab_size,100))
for word,i in tqdm(token.word_index.items()):
    embedding_vectors = embedding_vector.get(word)
    if embedding_vectors is not None:
        embedding_matrix[i] = embedding_vector[word]

100%|██████████| 61310/61310 [00:00<00:00, 472132.77it/s]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, artists_onehot_encoded, test_size=0.2, random_state=42)

In [20]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [14]:
# class TokenAndPositionEmbedding(layers.Layer):
#     def __init__(self, max_length, vocab_size, embed_dim):
#         super().__init__()
#         self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
#         self.pos_emb = layers.Embedding(input_dim=max_length, output_dim=embed_dim)

#     def call(self, x):
#         max_length = tf.shape(x)[-1]
#         positions = tf.range(start=0, limit=max_length, delta=1)
#         positions = self.pos_emb(positions)
#         x = self.token_emb(x)
#         return x + positions

In [22]:
embed_dim = 100  # Embedding size for each token
num_heads = 6  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_length,))
# embedding_layer = TokenAndPositionEmbedding(max_length, vocab_size, embed_dim)
embedding_layer = Embedding(vocab_size, 100, input_length=max_length, 
                        weights = [embedding_matrix], trainable = False)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(artists_onehot_encoded.shape[1], activation="softmax")(x)

transformer = keras.Model(inputs=inputs, outputs=outputs)

In [23]:
transformer.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
transformer.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 950)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 950, 100)          6131100   
_________________________________________________________________
transformer_block_1 (Transfo (None, 950, 100)          248832    
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                3232      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0     

In [None]:
callbacks = [keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5,
                                                    restore_best_weights=True,
                                                    verbose=1)]

history = transformer.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=200,
    batch_size=64,
    callbacks=callbacks,
    use_multiprocessing=True
)

2023-01-13 13:19:32.375980: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/200
Epoch 2/200