In [1]:
import pandas as pd
import numpy as np
import argparse
import os
import logging
import gensim
import pickle
from gensim.models import Doc2Vec, Word2Vec
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
MDL_FOLDER = 'models/'
SRC_FOLDER = 'formatted/dataset/'
dataset = pd.read_feather(SRC_FOLDER + 'dataframe.feather')

In [3]:
def retrieve_track_info(track_ids):
        # If track_ids is not a list, convert it into a list
    if isinstance(track_ids, int):
        track_ids = [track_ids]

    # Search for each track_id in the DataFrame and print the results
    for track_id in track_ids:
        result = dataset[dataset['track_id'] == int(track_id)]

        if not result.empty:
            track_name = result['track_name'].values[0]
            artist_name = result['artist_name'].values[0]
            playlist_name = result['playlist_name'].values[0]
            logging.info(f"Track ID: {track_id} | Track Name: {track_name} | Artist: {artist_name} | Playlist: {playlist_name}")
        else:
            logging.info(f"Track ID: {track_id} not found.")


In [4]:
def mean_vectors(tracks, model):
    vec = []
    for track_id in tracks:
        try:
            vec.append(model.wv(track_id))
        except KeyError:
            continue
    return np.mean(vec, axis=0) if vec else None

In [5]:
def get_similar_tracks(model, track_id, top_n=10):
    if isinstance(model, Word2Vec):
        if track_id not in model.wv:
            logging.warning(f"Track ID '{track_id}' not found in Word2Vec model's vocabulary.")
            return []
        similar_tracks = model.wv.most_similar(track_id, topn=top_n)
    elif isinstance(model, Doc2Vec):
        if track_id not in model.dv:
            logging.warning(f"Track ID '{track_id}' not found in Doc2Vec model's vocabulary.")
            return []
        similar_tracks = model.dv.most_similar(track_id, topn=top_n)
    else:
        raise ValueError("Invalid model type")
    
    return [track[0] for track in similar_tracks]

In [6]:
def get_recommendations_for_playlist(model, playlist_id, test_set, top_n=10):
    tracklist = test_set.loc[test_set['playlist_id'] == playlist_id, 'track_id'].values[0]
    avg_vector = mean_vectors(tracklist, model)  # Calculate the average vector for the playlist
    if avg_vector is None:
        return []
    predicted_tracks = model.wv.similar_by_vector(avg_vector, topn=top_n)  # Get similar tracks

    return [track for track, _ in predicted_tracks]  # Return just track IDs

In [7]:
def calculate_metrics(ground_truth, predictions):
    y_true = []
    y_pred = []

    for _, gt_row in ground_truth.iterrows():
        playlist_id = gt_row['playlist_id']
        actual_tracks = gt_row['actual_track_ids']
        
        pred_row = predictions[predictions['playlist_id'] == playlist_id]
        predicted_tracks = pred_row['predicted_track_ids'].values[0] if not pred_row.empty else []
        
        y_true.extend(actual_tracks)
        y_pred.extend(predicted_tracks)

    lb = LabelBinarizer()
    y_true_bin = lb.fit_transform(y_true)
    y_pred_bin = lb.fit_transform(y_pred)

    precision = precision_score(y_true_bin, y_pred_bin, average='macro')
    recall = recall_score(y_true_bin, y_pred_bin, average='macro')
    f1 = f1_score(y_true_bin, y_pred_bin, average='macro')
    accuracy = accuracy_score(y_true_bin, y_pred_bin, average='macro')
    r_precision = r_precision_score(predictions, ground_truth)
    ndcg = ndcg_score(predictions, ground_truth)

    return precision, recall, f1, accuracy, r_precision, ndcg

In [8]:
def r_precision_score(predictions, ground_truth):
    r_precisions = []
    for _, true_row in ground_truth.iterrows():
        playlist_id = true_row['playlist_id']
        true_tracks = true_row['actual_track_ids']
        pred_tracks = predictions[predictions['playlist_id'] == playlist_id]['predicted_track_ids'].values[0]

        R = len(true_tracks)
        relevant_retrieved = len(set(pred_tracks[:R]) & set(true_tracks))
        r_precisions.append(relevant_retrieved / R if R > 0 else 0)

    return sum(r_precisions) / len(r_precisions) if r_precisions else 0


def ndcg_score(predictions, ground_truth):
    ndcgs = []
    for _, true_row in ground_truth.iterrows():
        playlist_id = true_row['playlist_id']
        true_tracks = true_row['actual_track_ids']
        pred_tracks = predictions[predictions['playlist_id'] == playlist_id]['predicted_track_ids'].values[0]

        dcg = sum(1 / np.log2(i + 2) for i, track in enumerate(pred_tracks) if track in true_tracks)
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(true_tracks), len(pred_tracks))))

        ndcgs.append(dcg / idcg if idcg > 0 else 0)

    return sum(ndcgs) / len(ndcgs) if ndcgs else 0


In [9]:
def build_ground_truth(df):
    ground_truth = []

    for _, row in df.iterrows():
        playlist_id = row['playlist_id']
        tracklist = row['track_id']
        ground_truth.append((playlist_id, tracklist))
    
    return pd.DataFrame(ground_truth, columns=['playlist_id', 'actual_track_ids'])

In [10]:
def load_model(model_type, model_timestamp):
    files = os.listdir(MDL_FOLDER)

    if model_type == 'D2V':
        return Doc2Vec.load(MDL_FOLDER + f'd2v/d2v-trained-model-{model_timestamp}.model')
    elif model_type == 'W2V':
        return Word2Vec.load(MDL_FOLDER + f'w2v/w2v-trained-model-{model_timestamp}.model')
    else:
        raise ValueError("Invalid model type, use: 'W2V' or 'D2V'")

In [11]:
def leave_one_out_evaluation(model, test_set, top_n):
    total_hits = 0
    total_playlists = 0

    for _, row in test_set.iterrows():
        playlist_id = row['playlist_id']
        tracklist = row['track_id']
        hits = 0
        
        for i, song in enumerate(tracklist):
            remaining_tracks = tracklist[:i].tolist() + tracklist[i + 1:].tolist()
            recommendations = get_recommendations_for_playlist(model, playlist_id, remaining_tracks, top_n)
            
            if song in recommendations:
                hits += 1
        
        total_hits += hits
        total_playlists += len(tracklist)

    average_hit_rate = total_hits / total_playlists if total_playlists > 0 else 0
    return average_hit_rate


In [12]:
model = Word2Vec.load('models/w2v/w2v-trained-model-20241002_032052.model')

2024-10-02 18:04:49,456 - INFO - loading Word2Vec object from models/w2v/w2v-trained-model-20241002_032052.model
2024-10-02 18:04:49,547 - INFO - loading wv recursively from models/w2v/w2v-trained-model-20241002_032052.model.wv.* with mmap=None
2024-10-02 18:04:49,549 - INFO - loading vectors from models/w2v/w2v-trained-model-20241002_032052.model.wv.vectors.npy with mmap=None
2024-10-02 18:04:49,932 - INFO - loading syn1neg from models/w2v/w2v-trained-model-20241002_032052.model.syn1neg.npy with mmap=None
2024-10-02 18:04:50,360 - INFO - setting ignored attribute cum_table to None
2024-10-02 18:04:55,205 - INFO - Word2Vec lifecycle event {'fname': 'models/w2v/w2v-trained-model-20241002_032052.model', 'datetime': '2024-10-02T18:04:55.205733', 'gensim': '4.3.3', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}


In [13]:
songs = pd.read_feather('formatted/dataframe.feather')
songs.drop_duplicates(subset=['track_id'], inplace=True)
songs.set_index('track_id', inplace=True)
# Crea una colonna booleana che indica se la canzone è presente nel vocabolario
songs['in_vocab'] = songs.index.isin(model.wv.index_to_key)

# Visualizza le prime 5 righe per controllare
songs.head()
songs

Unnamed: 0_level_0,playlist_id,playlist_name,track_name,track_uri,pos,artist_id,artist_name,artist_uri,in_vocab
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,967000,90's country,meet in the middle,7lUE02KHkZM44BZgjCaWRO,0,1,diamond rio,0svyIX7Xu3DVMkrOtB88i6,True
2,967000,90's country,love a little stronger,19WNVZ9WG2DvHJsPAx7qZM,1,1,diamond rio,0svyIX7Xu3DVMkrOtB88i6,True
3,967000,90's country,how your love makes me feel,0QOEdRACkh52czOmA9b8hi,2,1,diamond rio,0svyIX7Xu3DVMkrOtB88i6,True
4,967000,90's country,keeper of the stars - bonus track,5BSlukklfUTrmzmsuctmMr,3,2,tracy byrd,0xxJO75NeIAF5jYruLYIDT,True
5,967000,90's country,ten rounds with jose cuervo - recall mix,26J61MmE6YCWqIyROZdEgL,4,2,tracy byrd,0xxJO75NeIAF5jYruLYIDT,True
...,...,...,...,...,...,...,...,...,...
459970,844998,soft songs,we found love,48kgZuWwh0GVduspIFGCQa,16,13712,piano tribute players,4Xx6QMLTWppMwdABkN0Afj,False
459971,844998,soft songs,grenade made famous by bruno mars,2StB0ZEwDhkhIcQtMLPdth,17,13712,piano tribute players,4Xx6QMLTWppMwdABkN0Afj,False
459972,844998,soft songs,set fire to the rain,1rWhwmfjLWMWiH3nLP0rfv,19,13712,piano tribute players,4Xx6QMLTWppMwdABkN0Afj,False
459973,844998,soft songs,last friday night t g i f,6vcpSpwtoHPuqq07cpHcaX,21,13712,piano tribute players,4Xx6QMLTWppMwdABkN0Afj,False


In [20]:
def meanVectors(playlist):
    vec = []
    for song_id in playlist:
        try:
            vec.append(model.wv[song_id])
        except KeyError:
            continue
    return np.mean(vec, axis=0)
    
with open('dataset/train.pkl', 'rb') as f:
    playlist_test = pickle.load(f)
playlist_vec = list(map(meanVectors, playlist_test))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [32]:
print(len(playlist_vec))
print(playlist_test[:100])

49000
[[10228, 12428, 82, 10225, 815, 921], [87031, 14393, 7675, 422946, 14398, 111064, 253371, 12354, 40002, 422947, 386928, 314029, 189049, 109821, 29933, 143143, 382762, 422948, 221241, 422949, 422950, 422951, 422952, 28639, 13618], [70667, 4350, 1114, 505, 16737, 1139, 401507, 43538, 518, 91, 2442], [3426, 11793, 33442, 444699, 444700, 38259, 57034, 130921, 444701, 142797, 30784, 6727, 70311, 74403, 45205, 3420, 20536, 8661, 14605, 13445, 88269, 48904, 23576, 17140, 109569, 46812, 39069, 9526, 158143, 9527, 56373, 744, 64419, 4996, 14412, 32596, 4329, 232691, 119143, 3417, 12796, 23160, 7461, 29721, 17134, 32571, 64422, 13559, 235799, 11434, 15964, 444702, 108003, 73233, 222, 92315, 74275, 1862, 11770, 344343, 228353, 11772, 64419, 64422, 15686, 12692, 855, 6010, 45049, 35750, 493, 513, 226, 209507, 29701, 377535, 17694, 444703, 6018, 29583, 8288, 6285, 59146, 29139, 6000, 10437, 6905, 13445, 35123, 16696, 6287, 444704, 7549, 15118, 151796, 380, 3408, 2424, 17688, 2413, 36689, 1145

In [34]:
def similarSongsByVector(vec, n = 10, by_name = True):
    # extract most similar songs for the input vector
    similar_songs = model.wv.similar_by_vector(vec, topn = n)
    
    # extract name and similarity score of the similar products
    if by_name:
        similar_songs = [songs.loc[song_id, ['track_name', 'artist_name']], sim)
                              for song_id, sim in similar_songs]
    
    return similar_songs

In [36]:
def print_recommended_songs(idx, n):
    print("============================")
    print("SONGS PLAYLIST")
    print("============================")
    for song_id in playlist_test[idx]:
        print(songs.loc[song_id, ['track_name', 'artist_name']])
    print()
    print("============================")
    print(f"TOP {n} RECOMMENDED SONGS")
    print("============================")
    for song, sim in similarSongsByVector(playlist_vec[idx], n):
        print(f"[Similarity: {sim:.3f}] {song}")
    print("============================")

In [37]:
print_recommended_songs(82, n=10)

SONGS PLAYLIST
track_name     under pressure - remastered 2011
artist_name                               queen
Name: 5860, dtype: object
track_name     sugar daddy - theme from vinyl
artist_name                  sturgill simpson
Name: 150815, dtype: object
track_name     the world is yours
artist_name             ty taylor
Name: 173703, dtype: object
track_name     mama he treats your daughter mean
artist_name                           ruth brown
Name: 174583, dtype: object
track_name       mr pitiful
artist_name    otis redding
Name: 174584, dtype: object
track_name     stairway to heaven
artist_name          led zeppelin
Name: 949, dtype: object
track_name        easy lover
artist_name    philip bailey
Name: 11232, dtype: object
track_name     forget me nots - 12" version
artist_name                  patrice rushen
Name: 174585, dtype: object
track_name     owner of a lonely heart
artist_name                        yes
Name: 14142, dtype: object
track_name     owner of a lonely heart

KeyError: 'artist - title'

In [14]:
def main(model_type, playlist_id=None, track_id=None, model_timestamp=None):
    #model = load_model(model_type, model_timestamp)
    model = Word2Vec.load(MDL_FOLDER + f'w2v/w2v-trained-model-{model_timestamp}.model')
    logging.info(f"Model loaded: {model_type}")
    
    test_set = pd.read_feather(SRC_FOLDER + 'test.feather')

    if playlist_id is not None:
        logging.info(f"Generating recommendations for playlist ID: {playlist_id}")
        recommendations = get_recommendations_for_playlist(model, playlist_id, test_set, top_n=10)
        logging.info(f"Recommended tracks for playlist {playlist_id}: {recommendations}")
    elif track_id is not None:
        logging.info(f"Finding similar tracks for track ID: {track_id}")
        logging.info(retrieve_track_info(track_id))
        similar_tracks = get_similar_tracks(model, track_id, top_n=10)
        logging.info(f"Similar tracks to {track_id}: {similar_tracks}")
        logging.info(retrieve_track_info(similar_tracks))
    else:
        logging.error("Please provide either a playlist_id or a track_id for recommendations.")
        return

    ground_truth = build_ground_truth(test_set)

    precision, recall, f1, accuracy, r_precision, ndcg = calculate_metrics(test_set, ground_truth)
    logging.info(f'Accuracy: {accuracy:.4f}')
    logging.info(f'Precision: {precision:.4f}')
    logging.info(f'Recall: {recall:.4f}')
    logging.info(f'F1 Score: {f1:.4f}')
    logging.info(f'R-Precision: {r_precision:.4f}')
    logging.info(f'NDCG: {ndcg:.4f}')

    average_hit_rate = leave_one_out_evaluation(model, test_set, 10)
    logging.info(f'Average Hit Rate at 10: {average_hit_rate:.4f}')

# Esegui la funzione direttamente
# Sostituisci questi valori con quelli effettivi che vuoi testare
model_type = 'W2V'  # O 'D2V'
playlist_id = None  # Sostituisci con l'ID della tua playlist
track_id = 270  # Se vuoi usare playlist_id, lascia track_id a None
model_timestamp = '20241002_032052'  # Data o identificativo del modello
main(model_type, playlist_id=playlist_id, track_id=track_id, model_timestamp=model_timestamp)


2024-10-02 17:39:37,721 - INFO - loading Word2Vec object from models/w2v/w2v-trained-model-20241002_032052.model
2024-10-02 17:39:37,792 - INFO - loading wv recursively from models/w2v/w2v-trained-model-20241002_032052.model.wv.* with mmap=None
2024-10-02 17:39:37,794 - INFO - loading vectors from models/w2v/w2v-trained-model-20241002_032052.model.wv.vectors.npy with mmap=None
2024-10-02 17:39:37,966 - INFO - loading syn1neg from models/w2v/w2v-trained-model-20241002_032052.model.syn1neg.npy with mmap=None
2024-10-02 17:39:38,147 - INFO - setting ignored attribute cum_table to None
2024-10-02 17:39:43,691 - INFO - Word2Vec lifecycle event {'fname': 'models/w2v/w2v-trained-model-20241002_032052.model', 'datetime': '2024-10-02T17:39:43.691942', 'gensim': '4.3.3', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}
2024-10-02 17:39:43,693 - INFO - Model loaded: W2V
2024-10-02 17:39:43,817 - INFO - Findin

KeyError: 'actual_track_ids'