In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import ast

In [2]:
genres = pd.read_csv("./dataset/id_genres_mmsr.tsv", sep="\t")
infos = pd.read_csv("./dataset/id_information_mmsr.tsv", sep="\t")
genres.shape

(5148, 2)

In [3]:
def random_sample(title, artist, topK=10):
    idx_to_drop = infos[(infos['song'] == title) & (infos['artist'] == artist)].index
    if len(idx_to_drop) == 0:
        print("Song not found; returning just any random sample")
        return infos.sample(topK)
    return infos.drop(idx_to_drop[0]).sample(topK)

In [4]:
def get_max_scorers(sample, genre_list):
    max_score = max(sample.values())

    # Extract keys with the highest score
    highest_scoring_genres = [genre for genre, score in sample.items() if score == max_score and genre in genre_list]
    return highest_scoring_genres


def create_interaction_matrix(df, genres, inter, num_rows):

    # Cache genre lists for each ID to avoid repeating lookups
    id_to_genres = genres.set_index('id')['genre'].to_dict()

    # Loop through each pair (i, j) to populate the interaction matrix
    for i in tqdm(range(num_rows)):
        # Read the first row (i)
        id_i = df.iloc[i]['id']
        sample_i = ast.literal_eval(df.iloc[i]['(tag, weight)'])
        genre_list_i = ast.literal_eval(id_to_genres.get(id_i, '[]'))  # Safely fetch genres for id_i
        top_i = np.array(get_max_scorers(sample_i, genre_list_i))

        for j in range(i + 1, num_rows):  # Only need to process pairs (i, j) with i < j
            # Read the second row (j)
            id_j = df.iloc[j]['id']
            sample_j = ast.literal_eval(df.iloc[j]['(tag, weight)'])
            genre_list_j = ast.literal_eval(id_to_genres.get(id_j, '[]'))  # Safely fetch genres for id_j
            top_j = np.array(get_max_scorers(sample_j, genre_list_j))

            # Calculate intersection of top genres
            intersection = np.intersect1d(top_i, top_j)
            is_interaction = 1 if len(intersection) > 0 else 0

            # Update the interaction matrix symmetrically
            inter[i][j] = inter[j][i] = is_interaction


In [5]:

# Read the datasets once
df = pd.read_csv("./dataset/id_tags_dict.tsv", sep="\t")
genres = pd.read_csv("./dataset/id_genres_mmsr.tsv", sep="\t")

# Initialize the interaction matrix
num_rows = len(df)
inter = np.zeros((num_rows, num_rows))

create_interaction_matrix(df, genres, inter, num_rows)

  0%|          | 16/5148 [00:25<2:17:54,  1.61s/it]


KeyboardInterrupt: 

In [None]:
np.savetxt("./dataset/interaction_matrix.csv", inter, delimiter="\t")

In [None]:
song_title = ""
song_artist = ""
length = 5148

randomTopK = random_sample(song_title, song_artist, topK=10)
print(randomTopK.drop(columns=['id']))

Song not found; returning just any random sample
                 artist                    song  \
1646  Angels & Airwaves            Young London   
4286     Jennifer Lopez              Do It Well   
387      Sunrise Avenue  Somebody Will Find You   
3614          Powerwolf          Saturday Satan   
4878        The Rapture         Never Die Again   
3993         Deafheaven                    Luna   
2958        Post Malone                    Stay   
2163      Azealia Banks          Skylar Diggins   
1853    Electric Wizard         Electric Wizard   
3500           Converge            On My Shield   

                                        album_name  
1646                               Love, Pt. 1 & 2  
4286                                         Brave  
387                             Acoustic Tour 2010  
3614                                     Lupus Dei  
4878                     In the Grace of Your Love  
3993                                   New Bermuda  
2958              

In [None]:
ndcg_at_k(inter, length, 10)

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])