In [1]:
import os
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

import tensorflow as tf
from tensorflow import keras
# import keras_nlp
# # recommended by https://keras.io/guides/keras_nlp/getting_started/
# # Use mixed precision for optimal performance
# keras.mixed_precision.set_global_policy("mixed_float16")

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential


import json 
import pickle

from sklearn.model_selection import train_test_split
embed_dim = 300 #50, 100, 200, or 300
min_word_occurence = 0

2023-01-17 22:33:36.416671: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-01-17 22:33:36.416815: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

2023-01-17 22:33:42.899463: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-01-17 22:33:42.899514: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-17 22:33:42.899537: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (8f424cab3d6f): /proc/driver/nvidia/version does not exist
2023-01-17 22:33:42.902482: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-17 22:33:42.924561: I tensorflow/core/platform/profile_utils/cpu_util

## Preprocessing

In [3]:
import string
import regex as re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tokenizers import BertWordPieceTokenizer


def preprocess(text):
    # Remove integers
    text = re.sub(r'\d+', '', text)

    # remove newlines as \r and \n
    text = re.sub(r'\r', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    return text


def encode_text_and_labels(df, max_num_words, pre_or_post='post', subword=False):
    # create a tokenizer
    if subword:
        t = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=False,
            strip_accents=False,
            lowercase=True
        )

        t.train_from_iterator(df['text'])
        vocab_size = t.get_vocab_size()
        # integer encode the documents
        encoded_list = t.encode_batch(df['text'])
        encoded_docs = [x.ids for x in encoded_list]
        # pad documents to be as long as the longest sequence in the dataset
        max_length = max([len(x) for x in encoded_docs])
    else:
        t = Tokenizer(num_words=max_num_words, oov_token='<unk>')
        t.fit_on_texts(df['text'])
        vocab_size = len(t.word_index) + 1
        # integer encode the documents
        encoded_docs = t.texts_to_sequences(df['text'])
        # pad documents to be as long as the longest sequence in the dataset
        max_length = df['text'].apply(lambda x: len(x.split(' '))).max()

    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding=pre_or_post)

    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df['artist'])
    # binary encode
    onehot_encoded = to_categorical(integer_encoded)
    return padded_docs, onehot_encoded, vocab_size, max_length, t


def load_and_preprocess_data(path, max_num_words=None, pre_or_post='post', subword=False):
    """
    Load the data and preprocess it
    :param path: path to the data
    :return: preprocessed data in the form of a pandas dataframe. The first item returned is the data,
    the second is the labels, the third is the vocabulary size, and the fourth is the maximum length of a sequence
    """
    df = pd.read_csv(path)

    df = df.groupby('artist').filter(lambda x: len(x) > 100)

    df['text'] = df['text'].apply(preprocess)

    # Identify the rows that contain duplicated text in the 'song' column
    no_covers = ~df['song'].duplicated()

    # Filter the DataFrame to include only the rows with unique text
    df = df[no_covers]

    # prepare text data for a recurrent network
    return encode_text_and_labels(df, max_num_words, pre_or_post, subword)

In [4]:
from tqdm import tqdm
embedding_vector = {}
f = open(f'/kaggle/input/glove-6b/glove.6B.{embed_dim}d.txt') 
for line in tqdm(f):
    vector = line.split(' ')
    word = vector[0]
    coef = np.asarray(vector[1:],dtype = 'float32')
    embedding_vector[word]=coef
f.close()
print('Number of words found ',len(embedding_vector))

400000it [00:26, 14910.14it/s]

Number of words found  400000





In [5]:
padded_docs, artists_onehot_encoded, vocab_size, max_length, token = load_and_preprocess_data("/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv", max_num_words=None, subword=False)

In [6]:
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word,i in tqdm(token.word_index.items()):
    embedding_vectors = embedding_vector.get(word)
    if embedding_vectors is not None:
        embedding_matrix[i] = embedding_vector[word]

100%|██████████| 61310/61310 [00:00<00:00, 407540.53it/s]


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs, artists_onehot_encoded, 
    stratify=artists_onehot_encoded, 
    test_size=0.2, random_state=42)

In [8]:
# from https://keras.io/examples/nlp/text_classification_with_transformer/
# original source https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [9]:
num_heads = 8  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    inputs = layers.Input(shape=(max_length,))
#     embedding_layer = Embedding(vocab_size, embed_dim, input_length=max_length, 
#                             weights = [embedding_matrix], trainable = False)    
    embedding_layer = Embedding(vocab_size, embed_dim, input_length=max_length, trainable = True)
    x = embedding_layer(inputs)
    x = layers.Dropout(0.25)(x)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
#     x = layers.Dense(ff_dim, activation="relu")(x)
#     x = layers.Dense(embed_dim)(x)
#     transformer_block2 = TransformerBlock(embed_dim, num_heads, ff_dim)
#     x = transformer_block2(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
#     x = layers.Dense(32)(x)
#     x = layers.Dropout(0.1)(x)

    outputs = layers.Dense(artists_onehot_encoded.shape[1], activation="softmax")(x)

    multi_transformer_layers = keras.Model(inputs=inputs, outputs=outputs)

In [10]:
with tpu_strategy.scope():
    inputs = layers.Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embed_dim, input_length=max_length, 
                            weights = [embedding_matrix], trainable = False)    
#     embedding_layer = Embedding(vocab_size, embed_dim, input_length=max_length, trainable = True)
    x = embedding_layer(inputs)
    x = Dropout(0.25)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.25, seed=42)(x)
    x = Bidirectional(LSTM(128))(x)
    outputs = layers.Dense(artists_onehot_encoded.shape[1], activation="softmax")(x)
    lstm = keras.Model(inputs=inputs, outputs=outputs)

In [53]:
# # Model params.
# NUM_LAYERS = 3
# MODEL_DIM = 256
# INTERMEDIATE_DIM = 512
# NUM_HEADS = 4
# DROPOUT = 0.1
# NORM_EPSILON = 1e-5

# # instantiating the model in the strategy scope creates the model on the TPU
# # with tpu_strategy.scope():
# inputs = layers.Input(shape=(max_length,))
# # embedding_layer = TokenAndPositionEmbedding(max_length, vocab_size, embed_dim)
# embedding_layer = Embedding(vocab_size, embed_dim, input_length=max_length, 
#                         weights = [embedding_matrix], trainable = False)
# x = embedding_layer(inputs)

# # Apply layer normalization and dropout to the embedding.
# x = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(x)
# x = keras.layers.Dropout(rate=DROPOUT)(x)

# # Add a number of encoder blocks
# for i in range(NUM_LAYERS):
#     x = keras_nlp.layers.TransformerEncoder(
#         intermediate_dim=INTERMEDIATE_DIM,
#         num_heads=NUM_HEADS,
#         dropout=DROPOUT,
#         layer_norm_epsilon=NORM_EPSILON,
#     )(x)

# outputs = layers.Dense(artists_onehot_encoded.shape[1], activation="softmax")(x)

# multi_transformer_layers = keras.Model(inputs, outputs)
# multi_transformer_layers.summary()

In [11]:
# optimizer = keras.optimizers.Adam(learning_rate=0.00005)
# multi_transformer_layers.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
# multi_transformer_layers.summary()

optimizer = keras.optimizers.Adam(learning_rate=0.001)
lstm.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
lstm.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 950)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 950, 300)          18393300  
_________________________________________________________________
dropout_4 (Dropout)          (None, 950, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 950, 256)          439296    
_________________________________________________________________
dropout_5 (Dropout)          (None, 950, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_3 (Dense)              (None, 268)               6887

In [12]:
callbacks = [keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10,
                                                    restore_best_weights=True,
                                                    verbose=1)]

# history = multi_transformer_layers.fit(
#     X_train,
#     y_train,
#     validation_split=0.1,
#     epochs=200,
#     batch_size=64,
#     callbacks=callbacks,
#     use_multiprocessing=True
# )

history = lstm.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=200,
    batch_size=64,
    callbacks=callbacks,
    use_multiprocessing=True
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Restoring model weights from the end of the best epoch.
Epoch 00032: early stopping


In [None]:
# tuner.search(img_train, label_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# # Get the optimal hyperparameters
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# print(f"""
# The hyperparameter search is complete. The optimal number of units in the first densely-connected
# layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
# is {best_hps.get('learning_rate')}.
# """)

In [None]:
file_name = "transformerv3.1_subword=True"
with open(file_name + "_history", 'w') as f:
    json.dump(history.history, f)

In [None]:
import matplotlib.pyplot as plt
def plot_graphs(history, string, title):
    if type(history) is dict:
        plt.plot(history[string])
        plt.plot(history['val_'+string])
    else:
        plt.plot(history.history[string])
        plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.title(f"{title} {string}")
    plt.show()
    
def plot_model_comparison(history1, history2, string, title):
    plt.plot(history1.history[string])
    plt.plot(history1.history['val_'+string])
    plt.plot(history2.history[string])
    plt.plot(history2.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.title(f"{title} {string}")
    plt.show()
    
plot_graphs(history, 'accuracy', 'transformer')
plot_graphs(history, 'loss', 'transformer')

# plot_graphs(trainable_history, 'accuracy', 'trainable embedding GRU')
# plot_graphs(trainable_history, 'loss', 'trainable embedding GRU')

In [None]:
with open('/kaggle/working/transformerv2.0_history', "rb") as file_pi:
    abc = pickle.load(file_pi)

In [None]:
plot_graphs(abc, 'accuracy', 'transformer')
plot_graphs(abc, 'loss', 'transformer')

In [None]:
transformer.to_json()

In [None]:
type(abc)