In [54]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import faiss
from sklearn.metrics import ndcg_score, average_precision_score
from db_connector import music_connector_factory

In [3]:
con = music_connector_factory(
    "DF",
    data_path="/home/artem/grad/mvectorizer/data/gtzan",
    music_location="/home/artem/grad/mvectorizer/data/gtzan/samples",
    music_info_df_name="music_info.csv"
)
emb_map_name = "embeddings"
con.load_map(emb_map_name)

In [11]:
user_num = 1000
syntetic = []
for i in range(user_num):
    user_interactions = np.zeros(1000)
    liked = np.random.choice(list(range(10)), (6))
    primary = liked[0:1]
    secondary = liked[1:3]
    thirdly = liked[3:]

    user_interactions[primary[0] * 100 + np.random.choice(list(range(100)), 40)] = 1
    for second in secondary:
        user_interactions[second * 100 + np.random.choice(list(range(100)), 20)] = 1
    for third in thirdly:
        user_interactions[third * 100 + np.random.choice(list(range(100)), 10)] = 1

    syntetic.append(user_interactions)
    
syntetic = np.vstack(syntetic)
syntetic

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 1., 0., 1.]])

In [15]:
emb_location = Path("/home/artem/grad/mvectorizer/data/gtzan/features")
music_info_df = pd.read_csv("/home/artem/grad/mvectorizer/data/gtzan/music_info.csv")
embeddings_from_jukebox = []
for music_emb in music_info_df["tracks"]:
    embeddings_from_jukebox.append(np.load(emb_location / (music_emb + ".npy")))


embeddings_from_model = np.vstack(map(lambda x: con._get_embedding(emb_map_name, x), range(1000)))
embeddings_from_jukebox = np.vstack(embeddings_from_jukebox)

  embeddings_from_model = np.vstack(map(lambda x: con._get_embedding(emb_map_name, x), range(1000)))


In [23]:
user_vectors_model = []
user_vectors_jukebox = []
for user_emb in syntetic:
    user_music = np.where(user_emb > 0)[0]
    user_vectors_jukebox.append(np.sum(embeddings_from_jukebox[user_music], axis=0))
    user_vectors_model.append(np.sum(embeddings_from_model[user_music], axis=0))
    user_vectors_jukebox[-1] = user_vectors_jukebox[-1] / len(user_music)
    user_vectors_model[-1] = user_vectors_model[-1] / len(user_music)


user_vectors_model = np.vstack(user_vectors_model)
user_vectors_jukebox = np.vstack(user_vectors_jukebox)

In [26]:
user_vectors_jukebox.shape

(1000, 4800)

In [28]:
index_jukebox = faiss.index_factory(user_vectors_jukebox.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)
index_model = faiss.index_factory(user_vectors_model.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)

In [32]:
index_jukebox.add(embeddings_from_jukebox)#, np.array(range(1000)))
index_model.add(embeddings_from_model)#, np.array(range(1000)))

In [46]:
recommendations = []
for user, index in zip([user_vectors_model, user_vectors_jukebox ], [index_model, index_jukebox]):
    recommendations.append(index.search(user, 1000))

In [47]:
recommendations[0] = np.take(recommendations[0][0], recommendations[0][1])
recommendations[1] = np.take(recommendations[1][0], recommendations[1][1])

In [58]:
random_solution = np.random.rand(*syntetic.shape) * 2 - 1
recommendations.append(random_solution)
for solution_type, recommendation in zip(["model", "jukebox", "random"], recommendations):
    print(solution_type, " ndcg score", ndcg_score(syntetic, recommendation, k=100))
    print(solution_type, " average precision score", average_precision_score(syntetic, recommendation))


model  ndcg score 0.11638086044239242
model  average precision score 0.11296332766633338
jukebox  ndcg score 0.09784821429211422
jukebox  average precision score 0.09941748734646913
random  ndcg score 0.0990616609795642
random  average precision score 0.1004856217643217
