In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import ast
from similarity_measures import *
import os

In [2]:
def get_max_scorers(tags, genre_list):
    """Return the genres from ``genre_list`` that carry the highest tag weight.

    Args:
        tags: Mapping from tag name to its weight. Every entry of
            ``genre_list`` is looked up in it with ``tags[genre]``.
        genre_list: Genre names to rank. Duplicate names are collapsed.

    Returns:
        A list with every genre whose weight equals the maximum weight found
        among ``genre_list`` (ties are all returned). An empty list is
        returned when ``genre_list`` is empty.

    Raises:
        KeyError: If ``tags`` is a dict and a genre is missing from it.
    """
    # tags[genre] is a hard lookup on purpose: each genre is expected to be
    # present among the tags as well.
    genre_weights = {genre: tags[genre] for genre in genre_list}

    # Without any genre there is no top genre; max() would raise ValueError
    # on an empty sequence.
    if not genre_weights:
        return []

    max_score = max(genre_weights.values())

    return [genre for genre, weight in genre_weights.items() if weight == max_score]

def create_top_genres(infos, tags, genres):
    """Build a table holding the top-scoring genre(s) of every song in ``infos``.

    Args:
        infos: DataFrame with an ``id`` column; one output row is produced
            per row of this frame, in the same order.
        tags: DataFrame with the columns ``id`` and ``(tag, weight)``. The
            latter holds Python-literal strings that are parsed with
            ``ast.literal_eval`` and handed to ``get_max_scorers`` as the
            tag-weight lookup.
        genres: DataFrame with the columns ``id`` and ``genre``. ``genre``
            holds Python-literal strings that are parsed with
            ``ast.literal_eval`` into the song's genre list.

    Returns:
        DataFrame with the columns ``id`` and ``top_genre``, where
        ``top_genre`` is the list returned by ``get_max_scorers``.
    """
    # all genres / tags per song; stored as dicts for easy retrieval
    id_to_genres = genres.set_index('id')['genre'].to_dict()
    id_to_tags = tags.set_index('id')['(tag, weight)'].to_dict()

    rows = []
    # Iterate the id column directly (as jaccard_similarity_matrix does)
    # instead of materialising a full row with .iloc on every step.
    for song_id in tqdm(infos['id'].tolist()):
        # Distinct local names: the loop no longer rebinds the `tags`
        # parameter or shadows the builtin `id`.
        # The tags fallback is an empty dict literal because the parsed value
        # is indexed by genre name in get_max_scorers.
        tag_weights = ast.literal_eval(id_to_tags.get(song_id, '{}'))
        genre_list = ast.literal_eval(id_to_genres.get(song_id, '[]'))

        rows.append({'id': song_id, 'top_genre': get_max_scorers(tag_weights, genre_list)})

    # Explicit columns keep the schema stable even when `infos` is empty.
    return pd.DataFrame(rows, columns=['id', 'top_genre'])

def jaccard_similarity_matrix(infos, top_genres):
    """Compute the pairwise top-genre similarity matrix for all songs.

    Args:
        infos: DataFrame with an ``id`` column; its row order defines the
            row/column order of the result.
        top_genres: DataFrame with the columns ``id`` and ``top_genre``.
            Songs whose id is missing here are treated as having ``[]``.

    Returns:
        Square ``numpy`` array of shape ``(len(infos), len(infos))``. Entry
        ``[i, j]`` is ``jaccard_similarity`` of the top genres of songs ``i``
        and ``j``; the matrix is filled symmetrically and the diagonal is
        left at zero.
    """
    n_songs = len(infos)
    top_genre_by_id = top_genres.set_index('id')['top_genre'].to_dict()
    song_ids = infos['id'].tolist()

    matrix = np.zeros((n_songs, n_songs))

    for row in tqdm(range(n_songs)):
        row_genres = top_genre_by_id.get(song_ids[row], [])

        # Start one past the diagonal so a song is never compared with
        # itself; only the upper triangle is computed and then mirrored.
        for col in range(row + 1, n_songs):
            col_genres = top_genre_by_id.get(song_ids[col], [])
            matrix[row, col] = matrix[col, row] = jaccard_similarity(row_genres, col_genres)

    return matrix

def generate_binary_relevancy(inter_base, threshold):
    """Binarise a similarity matrix against a threshold.

    Args:
        inter_base: Array of similarity scores.
        threshold: Cut-off value; the comparison is strict (``>``).

    Returns:
        Integer array of the same shape with 1 where the score is strictly
        greater than ``threshold`` and 0 elsewhere.
    """
    above_threshold = inter_base > threshold
    return above_threshold.astype(int)

In [4]:
# Output folder for all matrices written below. exist_ok=True creates it only
# when missing, without a separate (racy) existence check.
directory = "./predictions"
os.makedirs(directory, exist_ok=True)

# Load the song table plus the per-song tag and genre tables.
infos = pd.read_csv("./dataset/id_information_mmsr.tsv", sep="\t")
tags = pd.read_csv("./dataset/id_tags_dict.tsv", sep="\t")
genres = pd.read_csv("./dataset/id_genres_mmsr.tsv", sep="\t")

# Derive the top genre(s) per song and persist them next to the dataset.
top_genres = create_top_genres(infos, tags, genres)
top_genres.to_csv("./dataset/top_genres.tsv", sep="\t", index=False)

# Pairwise similarity of the songs' top genres.
inter = jaccard_similarity_matrix(infos, top_genres)
np.savetxt(os.path.join(directory, "jaccard_similarity_matrix.csv"), inter, delimiter="\t")

# Binary relevancy at several cut-offs. generate_binary_relevancy compares
# with a strict ">", so 0.99999 presumably stands in for "similarity == 1"
# (assumes jaccard_similarity tops out at 1 - confirm in similarity_measures).
inter_binar_10 = generate_binary_relevancy(inter, 0.99999)
inter_binar_075 = generate_binary_relevancy(inter, 0.75)
inter_binar_05 = generate_binary_relevancy(inter, 0.5)
inter_binar_025 = generate_binary_relevancy(inter, 0.25)
inter_binar_00 = generate_binary_relevancy(inter, 0)

# One file per threshold; the suffix encodes the threshold as in the variable names.
for suffix, matrix in (
    ("10", inter_binar_10),
    ("075", inter_binar_075),
    ("05", inter_binar_05),
    ("025", inter_binar_025),
    ("00", inter_binar_00),
):
    np.savetxt(
        os.path.join(directory, "binary_relevancy_matrix_{}.csv".format(suffix)),
        matrix,
        delimiter="\t",
    )

100%|██████████| 5148/5148 [00:01<00:00, 4321.31it/s]
100%|██████████| 5148/5148 [00:16<00:00, 303.92it/s] 
