In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
import tensorflow_recommenders as tfrs

TF_ENABLE_ONEDNN_OPTS = 0

In [29]:
# df = load_data(file_path='../data/dataset.csv')
PATH = os.getcwd()
df = pd.read_csv(os.path.join(PATH, "data/enriched_synthetic_data.csv"))
# df.drop(["music_id"], axis=1, inplace=True)
df.head()

In [None]:
def check_missing_value():
    # show number of missing values as a dataframe by column
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    missing_values = missing_values.sort_values(ascending=False)
    missing_values = missing_values.reset_index()
    missing_values.columns = ["Feature", "Missing Values"]
    return missing_values


check_missing_value()

In [36]:
df.fillna("None", inplace=True)

In [None]:
# Fill genre missing values with "Unknown"
df['genre']= df["genre"].fillna("Unknown")

In [None]:
check_missing_value()

In [37]:
df.isna().sum()

user_id             0
age                 0
gender              0
music               0
artist_name         0
featured_artists    0
genre               0
plays               0
duration            0
music_id            0
id_artists          0
acousticness        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
explicit            0
rating              0
age_group           0
dtype: int64

In [38]:
# Ensure all columns have consistent df types
df["user_id"] = df["user_id"].astype(str)
df["music_id"] = df["music_id"].astype(str)
df["music"] = df["music"].astype(str)
df["age"] = df["age"].astype(int)
df["gender"] = df["gender"].astype(str)
df["age_group"] = df["age_group"].astype(str)
df["plays"] = df["plays"].astype(int)
df["duration"] = df["duration"].astype(float)
df["acousticness"] = df["acousticness"].astype(float)
df["danceability"] = df["danceability"].astype(float)
df["energy"] = df["energy"].astype(float)
df["key"] = df["key"].astype(int)
df["loudness"] = df["loudness"].astype(float)
df["mode"] = df["mode"].astype(int)
df["speechiness"] = df["speechiness"].astype(float)
df["instrumentalness"] = df["instrumentalness"].astype(float)
df["liveness"] = df["liveness"].astype(float)
df["valence"] = df["valence"].astype(float)
df["tempo"] = df["tempo"].astype(float)
df["time_signature"] = df["time_signature"].astype(float)
df["explicit"] = df["explicit"].astype(bool)
df["rating"] = df["rating"].astype(float)
# df["release_year"] = df["release_year"].astype(int)
# df['release_year'] = df['release_year'].astype(int)

In [39]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd

# df


def df_to_dataset(dataframe, shuffle=True, batch_size=64):
    dataframe = dataframe.copy()
    # labels = dataframe.pop('target').astype('float64')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe)))
    # if shuffle:
    # ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds


# Convert DataFrame to TensorFlow dataset
dataset = df_to_dataset(df, batch_size=64)

# Set seed for reproducibility
tf.random.set_seed(42)

dataset_size = tf.data.experimental.cardinality(dataset)
print(f"Estimated dataset size: {dataset_size.numpy()}")


# Shuffle the dataset with a buffer size of 1000
shuffled_dataset = dataset.shuffle(
    buffer_size=len(df), seed=42, reshuffle_each_iteration=False
)

# Split the shuffled dataset (adjust the split ratio as needed)
train_size = int(0.8 * len(df))
train_dataset = shuffled_dataset.take(train_size)
test_dataset = shuffled_dataset.skip(train_size)
print(
    f"Estimated train size: {train_size}\nEstimated test dataset size: {len(df) - train_size}"
)

# # Print dataset shapes for debugging
# print("Train dataset shapes:")
# for batch in cached_train.take(10):
#   for key, value in batch.items():
#     print(f"{key}: {tf.shape(value)}")

# print("Test dataset shapes:")
# for batch in cached_test.take(10):
#   for key, value in batch.items():
#     print(f"{key}: {tf.shape(value)}")

Estimated dataset size: 313
Estimated train size: 15979
Estimated test dataset size: 3995


In [41]:
# import tensorflow as tf
# import tensorflow_recommenders as tfrs

# Define embedding dimensions
embedding_dimension = 128

# Extract unique values for lookup layers and ensure no duplicates
unique_user_ids = df["user_id"].astype(str).unique().tolist()
unique_ages = df["age"].unique().tolist()
unique_genders = df["gender"].unique().tolist()
# unique_song_ids = df["music_id"].unique().tolist()
unique_music = df["music"].unique().tolist()
unique_genres = df["genre"].unique().tolist()
unique_artist_names = df["artist_name"].unique().tolist()
unique_age_groups = df["age_group"].unique().tolist()


# Define user model
class UserModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.user_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_user_ids, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_user_ids) + 1, embedding_dimension
                ),
            ]
        )
        self.age_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.IntegerLookup(vocabulary=unique_ages, mask_token=None),
                tf.keras.layers.Embedding(len(unique_ages) + 1, embedding_dimension),
            ]
        )
        self.gender_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_genders, mask_token=None
                ),
                tf.keras.layers.Embedding(len(unique_genders) + 1, embedding_dimension),
            ]
        )
        self.age_group_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_age_groups, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_age_groups) + 1, embedding_dimension
                ),
            ]
        )

    def call(self, inputs):
        return tf.concat(
            [
                self.user_embedding(inputs["user_id"]),
                self.age_embedding(inputs["age"]),
                self.gender_embedding(inputs["gender"]),
                self.age_group_embedding(inputs["age_group"]),
            ],
            axis=1,
        )


# Define item model with audio features
class ItemModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.song_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_music, mask_token=None),
                tf.keras.layers.Embedding(len(unique_music) + 1, embedding_dimension),
            ]
        )
        self.genre_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_genres, mask_token=None),
                tf.keras.layers.Embedding(len(unique_genres) + 1, embedding_dimension),
            ]
        )
        self.artist_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_artist_names, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_artist_names) + 1, embedding_dimension
                ),
            ]
        )
        self.audio_features = tf.keras.layers.Dense(embedding_dimension)

    def call(self, inputs):
        song_embedding = self.song_embedding(inputs["music"])
        genre_embedding = self.genre_embedding(inputs["genre"])
        artist_embedding = self.artist_embedding(inputs["artist_name"])

        # Check shape of audio features
        audio_features = tf.stack(
            [
                inputs["acousticness"],
                inputs["danceability"],
                inputs["energy"],
                inputs["key"],
                inputs["loudness"],
                inputs["mode"],
                inputs["speechiness"],
                inputs["instrumentalness"],
                inputs["liveness"],
                inputs["valence"],
                inputs["tempo"],
                inputs["time_signature"],
            ],
            axis=1,
        )

        # Print shapes for debugging
        print(f"Audio features shape before Dense layer: {audio_features.shape}")

        audio_features = self.audio_features(audio_features)

        return tf.concat(
            [song_embedding, genre_embedding, artist_embedding, audio_features], axis=1
        )


# Define the multitask model
class MultitaskModel(tfrs.models.Model):
    def __init__(self, user_model, item_model, candidates):
        super().__init__()
        self.user_model = user_model
        self.item_model = item_model
        self.retrieval_task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=candidates)
        )
        self.plays_prediction_task = tf.keras.Sequential(
            [tf.keras.layers.Dense(128, activation="relu"), tf.keras.layers.Dense(1)]
        )
        self.plays_loss = tf.keras.losses.MeanSquaredError()

    def call(self, features):
        user_embeddings = self.user_model(features)
        item_embeddings = self.item_model(features)
        return user_embeddings, item_embeddings

    def compute_loss(self, features, training=False):
        user_embeddings, item_embeddings = self(features)
        retrieval_loss = self.retrieval_task(
            user_embeddings, item_embeddings, compute_metrics=not training
        )
        plays_predictions = self.rating_prediction_task(
            tf.concat([user_embeddings, item_embeddings], axis=1)
        )
        plays_loss = self.rating_loss(features["plays"], plays_predictions)
        return retrieval_loss + plays_loss


# # Convert DataFrame to TensorFlow datasets
# def df_to_tf_dataset(dataframe):
#     return tf.data.Dataset.from_tensor_slices(dict(dataframe))


# # Prepare datasets
# train_dataset = df_to_tf_dataset(train)
# test_dataset = df_to_tf_dataset(test)

# Check if candidates are not empty
candidates = train_dataset.map(lambda x: x["music"])
for candidate in candidates.take(1):
    print(f"Candidate sample: {candidate.numpy()}")

# Ensure candidates are correctly mapped
if not list(candidates):
    raise ValueError(
        "Candidates set is empty. Ensure 'music' is correctly mapped from the dataset."
    )

# Instantiate models
user_model = UserModel()
item_model = ItemModel()
model = MultitaskModel(user_model, item_model, candidates)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Cache datasets for performance
batch_size = 1024  # Adjust as needed

cached_train = train_dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
cached_test = test_dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

# Print dataset shapes for debugging
print("Train dataset shapes:")
for batch in cached_train.take(1):
    for key, value in batch.items():
        print(f"{key}: {tf.shape(value)}")

print("Test dataset shapes:")
for batch in cached_test.take(1):
    for key, value in batch.items():
        print(f"{key}: {tf.shape(value)}")

# Fit the model
try:
    model.fit(cached_train, epochs=10)
except Exception as e:
    print(f"Error during model fitting: {e}")

# Evaluate the model
try:
    model.evaluate(cached_test, return_dict=True)
except Exception as e:
    print(f"Error during model evaluation: {e}")

Candidate sample: [b'Tequila' b'Altes Kamuffel' b'Wish You Were Here' b'A-Punk' b'Morning'
 b'Homegrown' b'Take Me To Church' b'99 Red Balloons'
 b'Young Dumb & Broke' b'Attention' b'1991' b'Evermore' b'Wait'
 b'How Deep Is Your Love' b'Sex Me (Part I) / Sex Me (Part II)' b'Shining'
 b'Mad' b"I'm Into You" b'Holy Grail' b'Jesus Saves' b'Aloha' b'Party Up'
 b'Cecilia' b'Timber' b'Way Down We Go' b'Free to Be Me' b'Like A Virgin'
 b'POWER' b'Poison & Wine' b'505' b'Yuba Diamond'
 b'Para Sempre (feat. Elin Melgarejo)' b'Limbo' b'Notice' b'Irreplaceable'
 b'B.Y.O.B.' b'Whispers In The Dark' b'Goodies' b'Evil Woman' b'Domino'
 b'Awakening' b'Annie' b'Bulls On Parade' b"She's so High" b'Salute'
 b'Bottoms Up' b'Teenage Dream' b'Landslide' b'Hello, My Name Is'
 b'As You Are' b'I Want a Hippopotamus for Christmas (Hippo the Hero)'
 b'Money Longer' b"Ain't Your Mama" b'Spiderwebs' b'Sorry' b'Firework'
 b'Mi Primer Amor' b'My Love (feat. Major Lazer, WizKid, Dua Lipa)'
 b'I Wanna Dance with Some

2024-07-29 18:56:37.207223: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-29 18:56:37.490918: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 