Código basado en https://github.com/PUC-RecSys-Class/RecSysPUC-2024-2/blob/master/practicos/FastFM_factorization_machines.ipynb

In [None]:
%%capture
!pip install gdown
!pip3 install fastFM

In [None]:
import numpy as np
import pandas as pd
import fastFM
from fastFM.datasets import make_user_item_regression
from sklearn.model_selection import train_test_split
from fastFM import sgd
from fastFM import als
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix
from fastFM import mcmc
import functools as fct
import itertools as itools
import random, scipy
import gdown
import os
import json
from tqdm import tqdm

In [None]:
# Mount Google Drive into the Colab runtime so the files under `datadir` can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds every input file read by this notebook.
datadir = '/content/drive/MyDrive/UC/Ramos/Sis. Rec./project/data'

## Recolectar los datos

In [None]:
# idx2uri: lookup keyed by the track index written as a string (e.g. '1').
with open(datadir+'/idx2uri.json', 'r') as idx_file:
    idx2uri = json.load(idx_file)

# uri2features: lookup from a track URI to its audio-features record.
with open(datadir+'/uri2features.json', 'r') as features_file:
    uri2features = json.load(features_file)

In [None]:
def get_track_features(uri):
    """Return the audio-features record for a track URI.

    The in-memory ``uri2features`` table is consulted first. On a miss, the
    record is read from
    ``<datadir>/tracks_dataset_spotify_audio_features/<id>.json``, where
    ``<id>`` is the URI with the 'spotify:track:' text removed.

    Raises whatever ``open`` / ``json.load`` raise when the fallback file is
    missing or malformed.
    """
    if uri not in uri2features:
        # Fallback: one JSON file per track, named after the bare track id.
        uri = uri.replace('spotify:track:', '')
        with open(datadir + f'/tracks_dataset_spotify_audio_features/{uri}.json', 'r') as features_file:
            return json.load(features_file)
    return uri2features[uri]

In [None]:
# Sanity check: show the audio-features record of the track stored under index '1'.
uri2features[idx2uri['1']]

{'danceability': 0.712,
 'energy': 0.759,
 'key': 11,
 'loudness': -5.397,
 'mode': 1,
 'speechiness': 0.334,
 'acousticness': 0.016,
 'instrumentalness': 0.0808,
 'liveness': 0.217,
 'valence': 0.738,
 'tempo': 79.103,
 'type': 'audio_features',
 'id': '4OKXvqtfwlvY2fYJ2lzHPH',
 'uri': 'spotify:track:4OKXvqtfwlvY2fYJ2lzHPH',
 'track_href': 'https://api.spotify.com/v1/tracks/4OKXvqtfwlvY2fYJ2lzHPH',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4OKXvqtfwlvY2fYJ2lzHPH',
 'duration_ms': 249333,
 'time_signature': 4}

In [None]:
# Read every "<playlist> <track>" pair of the interaction file and derive:
#   interactions     - list of (playlist, track) tuples, in file order
#   playlists/tracks - sets of the distinct ids seen
#   playlists_tracks - playlist id -> list of its track ids
# NOTE(review): track ids are stored here exactly as read, while the train/test
# loaders subtract 1 from the track id - confirm both files use the same base.
interactions = []
playlists = set()
tracks = set()
playlists_tracks = {}
with open(datadir+'/pid_itemid_1000p.txt', 'r') as f:
    lines = f.readlines()

for line in lines:
    items = line.replace('\n','').split(' ')
    playlist_id, track_id = int(items[0]), int(items[1])
    interactions.append((playlist_id, track_id))
    tracks.add(track_id)
    playlists.add(playlist_id)
    playlists_tracks.setdefault(playlist_id, []).append(track_id)

In [None]:
# Training pairs: each line is "<playlist> <track>"; the track id is shifted
# down by one before being stored.
#   train                  - list of (playlist, track - 1) tuples
#   playlists_train_tracks - playlist id -> list of its shifted training track ids
train = []
playlists_train_tracks = {}
with open(datadir+'/playlistid_itemid_1000p_train.txt', 'r') as f:
    lines = f.readlines()

for line in lines:
    items = line.replace('\n','').split(' ')
    playlist_id = int(items[0])
    track_idx = int(items[1]) - 1
    train.append((playlist_id, track_idx))
    playlists_train_tracks.setdefault(playlist_id, []).append(track_idx)

In [None]:
# Held-out pairs, parsed like the training file (track id shifted down by one).
#   test                  - list of (playlist, track - 1) tuples
#   test_playlists        - set of playlist ids that appear in the test file
#   playlists_test_tracks - playlist id -> list of its shifted test track ids
test = []
test_playlists = set()
playlists_test_tracks = {}
with open(datadir+'/test_challenge_200p.txt', 'r') as f:
    lines = f.readlines()

for line in lines:
    items = line.replace('\n','').split(' ')
    playlist_id = int(items[0])
    track_idx = int(items[1]) - 1
    test.append((playlist_id, track_idx))
    test_playlists.add(playlist_id)
    playlists_test_tracks.setdefault(playlist_id, []).append(track_idx)

In [None]:
# Number of (playlist, track) pairs read from the full interaction file.
len(interactions)

67503

In [None]:
# Number of distinct playlists in the full interaction file.
len(playlists)

1000

In [None]:
# Number of distinct tracks in the full interaction file.
len(tracks)

34443

In [None]:
# Pre-computed negative samples; a later cell unpacks each entry as a
# (playlist, track) pair.
neg_samples = []

with open(datadir+'/neg_samples.json', 'r') as neg_file:
    neg_samples = json.load(neg_file)

# Convertir a formato fastFM

In [None]:
# Candidate audio-feature names (keys of each record in uri2features).
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
# Which features to use: one flag per entry of `features`, matched by position.
use_features = [False, True, False, False, False, False, True, True, False, False, False]
# How many negative samples to create per real (positive) sample.
neg_sample_prop = 5

# The proportion is used to size the matrix and the random draw, so it must not be negative.
assert neg_sample_prop >= 0

In [None]:
# Names of the features whose flag in `use_features` is set, in `features` order.
usable_features = [features[i] for i in range(len(features)) if use_features[i]]

In [None]:
# # Determinamos el tamaño de la matriz csc
# col_shape = len(tracks) + len(playlists) + len(usable_features)
# row_shape = len(interactions) + int(len(interactions) * neg_sample_prop)

# shape = (row_shape, col_shape)
# shape

In [None]:
# Size of the sparse training matrix:
#   columns = one per track + one per playlist + one per selected audio feature
#   rows    = one per training pair + `neg_sample_prop` negatives per training pair
col_shape = len(tracks) + len(playlists) + len(usable_features)
row_shape = len(train) + int(len(train) * neg_sample_prop)

shape = (row_shape, col_shape)
shape

(305328, 35446)

In [None]:
# Column of each selected feature in the sparse matrix: the feature columns
# come after the len(tracks) + len(playlists) indicator columns.
feauture_index = {
    feature: position + len(tracks) + len(playlists)
    for position, feature in enumerate(usable_features)
}

In [None]:
# Show which matrix column each selected feature was assigned.
feauture_index

{'energy': 35443, 'acousticness': 35444, 'instrumentalness': 35445}

## Funciones

In [None]:
# (value, row, column) triplets of the sparse training matrix plus one target
# per row. The negative-sample cell further down appends to these same lists,
# so re-run this cell first whenever that one is re-run.
datalist = []
row_inds, col_inds = [], []
ratings = []

# One matrix row per observed (playlist, track) training pair.
# tqdm wraps `train` itself rather than the enumerate() generator so that it
# can read the length and display a total / ETA instead of a bare counter.
for i, interaction in enumerate(tqdm(train)):
    playlist, track = interaction

    # Playlist indicator: the playlist id is used directly as the column.
    # NOTE(review): assumes playlist ids lie in [0, len(playlists)) - confirm.
    datalist.append(1)
    row_inds.append(i)
    col_inds.append(playlist)

    # Track indicator, offset past the playlist columns.
    datalist.append(1)
    row_inds.append(i)
    col_inds.append(track + len(playlists))

    # Audio features: `track` was shifted down by one when loaded, so add the
    # one back to get the idx2uri key.
    trackid = track + 1
    track_uri = idx2uri[str(trackid)]
    track_features = uri2features[track_uri]
    #track_features = get_track_features(track_uri)

    # A record stored as None contributes no feature columns to the row.
    if track_features is not None:
        for feature in usable_features:
            datalist.append(track_features[feature])
            row_inds.append(i)
            col_inds.append(feauture_index[feature])

    # Every observed interaction is a positive example with target 5.
    ratings.append(5)

50888it [00:00, 103621.54it/s]


In [None]:
# Size of the pool of pre-computed negative samples.
len(neg_samples)

34376279

In [None]:
# Draw `neg_sample_prop` negatives per training pair, without replacement.
# A dedicated seeded generator makes the draw (and therefore the trained model
# and the reported metrics) reproducible after a kernel restart, without
# touching the global `random` state.
neg_random_samples = random.Random(42).sample(neg_samples, int(len(train) * neg_sample_prop))

In [None]:
# Append one matrix row per sampled negative pair, placed after the len(train)
# positive rows. This extends the lists created by the positive-rows cell.
neg_interactions = []
for i, (playlist, track) in enumerate(tqdm(neg_random_samples)):
    row = len(train) + i

    # Playlist indicator column.
    datalist.append(1)
    row_inds.append(row)
    col_inds.append(playlist)

    # Track indicator column, offset past the playlist columns.
    datalist.append(1)
    row_inds.append(row)
    col_inds.append(track + len(playlists))

    # Audio features, looked up through the idx2uri key `track + 1`.
    trackid = track + 1
    track_uri = idx2uri[str(trackid)]
    track_features = uri2features[track_uri]
    #track_features = get_track_features(track_uri)
    if track_features is not None:
        for feature in usable_features:
            datalist.append(track_features[feature])
            row_inds.append(row)
            col_inds.append(feauture_index[feature])

    # Sampled negatives get target 1.
    ratings.append(1)

print('\nDimension of FM input: {}'.format(shape))

print('\nDimension of FM input: {}'.format(shape))

100%|██████████| 254440/254440 [00:02<00:00, 116524.51it/s]


Dimension of FM input: (305328, 35446)





In [None]:
# Assemble the sparse training matrix from the accumulated triplets, and the
# target vector (5 for positive rows, 1 for negative rows).
X = csc_matrix((datalist, (row_inds, col_inds)), shape=shape)
y = np.array(ratings)

In [None]:
# Sanity check: should match `shape` computed above.
X.shape

(305328, 35446)

In [None]:
# Sanity check: one target per matrix row.
y.shape

(305328,)

In [None]:
# (value, row, column) triplets and targets for the held-out test matrix.
datalist_test = []
row_inds_test, col_inds_test = [], []
ratings_test = []

# One matrix row per held-out (playlist, track) pair.
# tqdm wraps `test` itself rather than the enumerate() generator so that it
# can read the length and display a total / ETA instead of a bare counter.
for i, interaction in enumerate(tqdm(test)):
    playlist, track = interaction

    # Playlist indicator column.
    datalist_test.append(1)
    row_inds_test.append(i)
    col_inds_test.append(playlist)

    # Track indicator column, offset past the playlist columns.
    datalist_test.append(1)
    row_inds_test.append(i)
    col_inds_test.append(track + len(playlists))

    # Audio features: `track` was shifted down by one when loaded, so add the
    # one back to get the idx2uri key.
    trackid = track + 1
    track_uri = idx2uri[str(trackid)]
    track_features = uri2features[track_uri]
    #track_features = get_track_features(track_uri)

    # A record stored as None contributes no feature columns to the row.
    if track_features is not None:
        for feature in usable_features:
            datalist_test.append(track_features[feature])
            row_inds_test.append(i)
            col_inds_test.append(feauture_index[feature])

    # Every test pair is an observed interaction, so its target is 5.
    ratings_test.append(5)

6000it [00:00, 120639.80it/s]


In [None]:
# Sparse test matrix with the same column layout as the training matrix, and
# its target vector (every entry is 5: the test set holds only positives).
X_test = csc_matrix((datalist_test, (row_inds_test, col_inds_test)), shape=(len(test), col_shape))
y_test = np.array(ratings_test)

In [None]:
# # split train y test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
# X_train.shape

In [None]:
# X_test.shape

In [None]:
# Train a factorization machine regressor optimized with ALS on the full
# training matrix, then predict a rating for every held-out test row.
fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=32, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X, y)
y_pred = fm.predict(X_test)

In [None]:
# Mean squared error on the test rows. Every test target is 5 (the test set
# contains no negatives), so this measures how far predictions fall from 5.
error_als = mean_squared_error(y_test, y_pred)
print('Mean squared error under ALS: {}'.format(error_als))

Mean squared error under ALS: 15.602580258980277


In [None]:
# For every known playlist, the tracks it has in the training split
# (an empty list when the playlist has no training rows).
playlist_in_train = {
    playlist: playlists_train_tracks.get(playlist, [])
    for playlist in playlists
}

In [None]:
# Predicted rating of every track for one playlist.
def get_ratings_for_playlist(playlist, model):
    """Predict a rating for each track in ``tracks`` for the given playlist.

    One sparse row is built per track with the same column layout used for
    training (playlist indicator, track indicator, selected audio features).
    Tracks whose audio-features record is None are skipped.

    Parameters:
        playlist (int): Playlist id, used directly as the playlist column.
        model: Fitted model exposing ``predict(X)``.

    Returns:
        tuple: ``(y_pred, track_ids)`` where ``y_pred[i]`` is the prediction
        for ``track_ids[i]``; both have the same length.

    NOTE(review): the train/test cells store track ids shifted down by one,
    while ``tracks`` holds the ids exactly as read from the interaction file.
    If that file uses the same id base, ``track + 1`` and
    ``track + len(playlists)`` below are off by one relative to training -
    confirm against the data files.
    """
    datalist = []
    row_inds, col_inds = [], []

    track_ids = []

    for track in tracks:
        # Single feature lookup per track; skip tracks without a record.
        trackid = track + 1
        track_uri = idx2uri[str(trackid)]
        track_features = uri2features[track_uri]
        if track_features is None:
            continue

        row = len(track_ids)
        track_ids.append(track)

        # Playlist indicator column.
        datalist.append(1)
        row_inds.append(row)
        col_inds.append(playlist)

        # Track indicator column, offset past the playlist columns.
        datalist.append(1)
        row_inds.append(row)
        col_inds.append(track + len(playlists))

        # Selected audio features.
        for feature in usable_features:
            datalist.append(track_features[feature])
            row_inds.append(row)
            col_inds.append(feauture_index[feature])

    # Size the matrix by the rows actually emitted, so that skipped tracks do
    # not leave trailing all-zero rows and len(y_pred) == len(track_ids).
    X = csc_matrix((datalist, (row_inds, col_inds)), shape=(len(track_ids), col_shape))
    y_pred = model.predict(X)
    return y_pred, track_ids

# Top-n recommendations for one playlist.
def get_n_best_ratings(playlist, model, n = 10):
    """Return up to ``n`` track ids with the highest predicted rating.

    Tracks that the playlist already has in the training split are excluded.
    Ties keep the order produced by ``get_ratings_for_playlist``.

    Parameters:
        playlist (int): Playlist id to recommend for.
        model: Fitted model forwarded to ``get_ratings_for_playlist``.
        n (int): Maximum number of recommendations (default is 10).

    Returns:
        list: Track ids ordered from highest to lowest predicted rating;
        empty when there is no candidate track.
    """
    y_pred, track_ids = get_ratings_for_playlist(playlist, model)

    # Set for O(1) membership tests; a playlist with no training entry has
    # nothing to exclude.
    seen_in_train = set(playlist_in_train.get(playlist, ()))

    # Stable descending sort on the predicted rating only.
    ranked = sorted(zip(y_pred, track_ids), key=lambda pair: pair[0], reverse=True)

    res = []
    for _, track in ranked:
        if track not in seen_in_train:
            res.append(track)
            if len(res) == n:
                break
    return res

In [None]:
# Example: the 20 best-rated tracks for playlist 0 that are not in its training data.
get_n_best_ratings(0, fm, 20)

[25696,
 18595,
 29892,
 29425,
 3188,
 24427,
 33259,
 4247,
 16216,
 21023,
 28205,
 11265,
 9395,
 22530,
 12943,
 30217,
 27661,
 20710,
 982,
 18767]

## Métricas

In [None]:
def dcg_at_k(r, k):
    """Discounted cumulative gain over the first ``k`` relevance scores.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(position + 1)``
    discount, with positions starting at 1.

    Parameters:
        r (sequence): Relevance scores in ranking order.
        k (int): Number of leading positions to consider.

    Returns:
        float: DCG@k, or 0.0 when there is nothing to score.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is the
    # documented replacement.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum((np.power(2, r) - 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.0

def idcg_at_k(k):
    """Ideal DCG@k: the DCG of ``k`` items that are all relevant (score 1)."""
    ideal_ranking = np.ones(k)
    return dcg_at_k(ideal_ranking, k)

def ndcg_at_k(r, k, max_relevant):
    """Normalized DCG@k.

    The ideal DCG is computed for ``min(k, max_relevant)`` relevant items;
    when that ideal is zero the score is 0.0.
    """
    ideal = idcg_at_k(min(k, max_relevant))
    return dcg_at_k(r, k) / ideal if ideal else 0.0

def calculate_ndcg(recommendations, relevant_items, k=10):
    """
    Calculate ndcg@k given recommendations and relevant items.

    Parameters:
        recommendations (list): List of recommended items (ordered by ranking).
        relevant_items (list): List of relevant items (must be hashable).
        k (int): The value of k for ndcg@k (default is 10).

    Returns:
        float: The ndcg@k score.
    """
    # Set lookup keeps the relevance pass linear in len(recommendations)
    # instead of scanning the relevant list once per recommendation.
    relevant_lookup = set(relevant_items)

    # Relevance vector: 1 if the recommended item is relevant, 0 otherwise.
    relevance = [1 if item in relevant_lookup else 0 for item in recommendations]

    # The ideal ranking is capped by the length of the list as given.
    return ndcg_at_k(relevance, k, len(relevant_items))

def recall_at_k(relevant_items, recommended_items, k):
    """
    Fraction of the distinct relevant items found in the first k recommendations.

    Parameters:
        relevant_items (list): Relevant items.
        recommended_items (list): Recommended items, ordered by ranking.
        k (int): Number of leading recommendations to consider.

    Returns:
        float: Recall@k, or 0.0 when there are no relevant items.
    """
    relevant = set(relevant_items)
    # Nothing to retrieve: define recall as 0.0 instead of dividing by zero.
    if not relevant:
        return 0.0
    hits = relevant.intersection(recommended_items[:k])
    return len(hits) / len(relevant)

def precision_at_k(relevant_items, recommended_items, k):
    """
    Compute the precision over the first k recommended items.

    Parameters:
        relevant_items (list): Relevant items.
        recommended_items (list): Recommended items, ordered by ranking.
        k (int): Number of recommended items to consider.

    Returns:
        float: Precision@k (hits in the top k divided by k), or 0.0 when
        k is not positive.
    """
    # A non-positive cutoff selects no recommendations; return 0.0 rather
    # than dividing by zero.
    if k <= 0:
        return 0.0
    relevant = set(relevant_items)
    hits = sum(1 for item in recommended_items[:k] if item in relevant)
    return hits / k

In [None]:
def get_metrics(recommendations, relevant_items, k=10):
    """Return ``(ndcg, recall, precision)`` at cutoff ``k`` for one ranked list."""
    return (
        calculate_ndcg(recommendations, relevant_items, k),
        recall_at_k(relevant_items, recommendations, k),
        precision_at_k(relevant_items, recommendations, k),
    )

In [None]:
# Evaluate at k = 10 and cache, per test playlist, the recommendations (`rec`)
# and the held-out relevant tracks (`pred`, despite its name) for later cells.
k = 10
ndcgs, recalls, precisions = [], [], []
rec, pred = {}, {}

for playlist in test_playlists:
    relevant_items = playlists_test_tracks[playlist]
    # 25 recommendations are requested because the later cells re-score the
    # cached lists at k = 20 and k = 25.
    recommendations = get_n_best_ratings(playlist, fm, 25)
    rec[playlist] = recommendations
    pred[playlist] = relevant_items

    scores = get_metrics(recommendations, relevant_items, k=k)
    for bucket, score in zip((ndcgs, recalls, precisions), scores):
        bucket.append(score)

# Averages over the test playlists.
ndgc_avg, recall_avg, precision_avg = (
    np.mean(values) for values in (ndcgs, recalls, precisions)
)

print(f"NDCG@10: {ndgc_avg}")
print(f"Recall@10: {recall_avg}")
print(f"Precision@10: {precision_avg}")

NDCG@10: 0.005574979859526343
Recall@10: 0.0016666666666666668
Precision@10: 0.005


In [None]:
# Persist the cached recommendations ('rec') and the held-out relevant tracks
# ('rel') in a FastFMResults folder next to the data directory, in a file
# named after the features used. json.dump writes the integer playlist keys
# as strings.
name = '_'.join(usable_features)
# Parent of the data directory, derived from the path instead of slicing off
# a fixed number of characters.
results_dir = os.path.join(os.path.dirname(datadir.rstrip('/')), 'FastFMResults')
# Create the output folder on first use so the write cannot fail on a missing directory.
os.makedirs(results_dir, exist_ok=True)
with open(os.path.join(results_dir, f'recommendations_{name}.json'), 'w') as f:
    json.dump({
        'rec': rec,
        'rel': pred
    }, f)

In [None]:
# Re-score the cached recommendations (`rec`) at k = 20.
k = 20
ndcgs, recalls, precisions = [], [], []

for playlist in test_playlists:
    relevant_items = playlists_test_tracks[playlist]
    recommendations = rec[playlist]

    scores = get_metrics(recommendations, relevant_items, k=k)
    for bucket, score in zip((ndcgs, recalls, precisions), scores):
        bucket.append(score)

# Averages over the test playlists.
ndgc_avg, recall_avg, precision_avg = (
    np.mean(values) for values in (ndcgs, recalls, precisions)
)

print(f"NDCG@20: {ndgc_avg}")
print(f"Recall@20: {recall_avg}")
print(f"Precision@20: {precision_avg}")

NDCG@20: 0.006783911963729912
Recall@20: 0.004672413793103448
Precision@20: 0.006999999999999999


In [None]:
# Re-score the cached recommendations (`rec`) at k = 25, the full cached length.
k = 25
ndcgs, recalls, precisions = [], [], []

for playlist in test_playlists:
    relevant_items = playlists_test_tracks[playlist]
    recommendations = rec[playlist]

    scores = get_metrics(recommendations, relevant_items, k=k)
    for bucket, score in zip((ndcgs, recalls, precisions), scores):
        bucket.append(score)

# Averages over the test playlists.
ndgc_avg, recall_avg, precision_avg = (
    np.mean(values) for values in (ndcgs, recalls, precisions)
)

print(f"NDCG@25: {ndgc_avg}")
print(f"Recall@25: {recall_avg}")
print(f"Precision@25: {precision_avg}")

NDCG@25: 0.006668566888064784
Recall@25: 0.005696679438058749
Precision@25: 0.0068000000000000005
