In [8]:
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
import matplotlib.pyplot as plt
from networkx.algorithms.community import louvain_communities
from metadata_graph import *
matplotlib.use('TkAgg')

  from tqdm.autonotebook import tqdm


In [9]:
infos = pd.read_csv('dataset/id_information_mmsr.tsv', sep='\t')
metadata = pd.read_csv('dataset/id_metadata_mmsr.tsv', sep='\t')
tags = pd.read_csv('dataset/id_tags_dict.tsv', sep='\t')
genres_base = pd.read_csv('dataset/top_genres.tsv', sep='\t')

## Metadata Information



| **Attribute**   | **Description**                                                                                                                                     |
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|
| **id**          | Unique 16-character identifier for the song in the database.                                                                                       |
| **artist**       | Name of the artist that published the song on last.fm. There are 16,269 unique artists in the database.                                            |
| **song**         | Name of the song.                                                                                                                                  |
| **lang**         | Language assigned to the lyrics by the tool langdetect. There are 46 unique languages in the database.                                             |
| **spotify id**   | Song identifier in the Spotify application.                                                                                                       |
| **popularity**   | Integer value ranging from 0 to 100 representing how popular a song is on Spotify. This value is based on the total number of plays and recency.   |
| **album name**   | Name of the album that the song is in. There are 38,363 different albums in the database.                                                          |
| **release**      | Year in which the song was released.                                                                                                               |
| **danceability** | Real value ranging from 0.0 to 1.0 representing how suitable the song is for dancing, based on musical elements provided by the Spotify API.       |
| **energy**       | Real value ranging from 0.0 to 1.0 provided by the Spotify API that is a perceptual measure of intensity and activity.                             |
| **key**          | Overall key of the song, using standard Pitch Class notation, provided by the Spotify API.                                                        |
| **mode**         | Binary value provided by the Spotify API corresponding to the modality of the song, where major is represented by 1 and minor by 0.                |
| **valence**      | Real value ranging from 0.0 to 1.0 provided by the Spotify API that measures how positive a song is.                                               |
| **tempo**        | Speed or pace of the song, measured in beats per minute (BPM), provided by the Spotify API.                                                        |
| **genres**       | List of genre tags associated with the song. There are 853 unique genre tags in the database.                                                      |
| **tags**         | User-given tags from the last.fm application.                                                                                                      |


In [10]:
genres = genres_base.copy()
genres['top_genre'] = genres_base['top_genre'].apply(ast.literal_eval)
exp_genres = genres.explode('top_genre').reset_index(drop=True)
genre_amount = exp_genres['top_genre'].value_counts()
unique_genres_amount = len(list(set(list(genre_amount))))

threshold = 50
filtered_genre_counts = genre_amount[genre_amount >= threshold]

In [11]:
graph = GraphMetaData()
#graph.generate_graph()
graph.generate_multigraph()

communities = louvain_communities(graph.graph, weight='weight')

100%|██████████| 5148/5148 [01:34<00:00, 54.25it/s] 


In [18]:
nn_df = graph.batch_nearest_neighbor(infos, topK=100)
com_df = graph.batch_community_search(infos["id"].values, topK=100)

nn_df.to_csv('./predictions/ui/rets_mgm_nn_100.csv', sep=',', index=False)
com_df.to_csv('./predictions/ui/rets_mgm_comm_100.csv', sep=',', index=False)

Processing for UI:   0%|          | 1/5148 [00:00<15:37,  5.49it/s]

Processing for UI: 100%|██████████| 5148/5148 [01:05<00:00, 78.57it/s] 
Processing for UI: 100%|██████████| 5148/5148 [01:05<00:00, 78.91it/s] 


In [16]:
id_to_index = {row['id']: idx for idx, row in infos.iterrows()}
inter_mat = np.zeros((len(infos), len(infos)))

# Transform to matrix using the precomputed dictionary
for idx, row in tqdm(nn_df.iterrows(), total=len(nn_df), desc="Transforming to matrix"):
    source_idx = id_to_index[row['source_id']]
    target_idx = id_to_index[row['target_id']]
    inter_mat[source_idx, target_idx] = row['similarity']

np.savetxt('./predictions/rets_mgm_nn_100_matrix.csv', inter_mat, delimiter=",")

Transforming to matrix: 100%|██████████| 51480/51480 [00:05<00:00, 10102.17it/s]


In [17]:
id_to_index = {row['id']: idx for idx, row in infos.iterrows()}
inter_mat = np.zeros((len(infos), len(infos)))

# Transform to matrix using the precomputed dictionary
for idx, row in tqdm(com_df.iterrows(), total=len(com_df), desc="Transforming to matrix"):
    source_idx = id_to_index[row['source_id']]
    target_idx = id_to_index[row['target_id']]
    inter_mat[source_idx, target_idx] = row['similarity']

np.savetxt('./predictions/rets_mgm_comm_100_matrix.csv', inter_mat, delimiter=",")

Transforming to matrix: 100%|██████████| 51480/51480 [00:04<00:00, 10705.17it/s]
