In [77]:
import numpy as np
import pandas as pd
import networkx as nx

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm, trange
import plotly.express as px
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [5]:
# Build a lookup from musician id to main genre out of the influence edge list.
# Every row contributes two musicians: the influencer and the follower.
raw_style_data = pd.read_csv("influence_data.csv")
m = np.shape(raw_style_data)[0]
musician_dict = dict()

# Pull the four columns out as NumPy arrays once and walk them in lockstep,
# instead of doing a label-based Series lookup per cell inside the loop.
influence_rows = zip(
    raw_style_data["influencer_id"].to_numpy(),
    raw_style_data["influencer_main_genre"].to_numpy(),
    raw_style_data["follower_id"].to_numpy(),
    raw_style_data["follower_main_genre"].to_numpy(),
)
for influencer_id, influencer_genre, follower_id, follower_genre in influence_rows:
    # The first genre recorded for an id wins; later rows never overwrite it.
    if influencer_id not in musician_dict:
        musician_dict[influencer_id] = influencer_genre
    if follower_id not in musician_dict:
        musician_dict[follower_id] = follower_genre

In [6]:
# Audio-feature columns that make up each artist's feature vector in later cells.
info_columns = ["danceability", "energy", "valence", "tempo", "loudness", "mode", "key", "acousticness",
                "instrumentalness", "liveness", "speechiness", "duration_ms"]

# Label every song with the genre of the first of its artists that has a known
# genre in `musician_dict`; songs with no such artist are labelled "Unknown".
raw_full_data = pd.read_csv("full_music_data.csv")
artists_id = raw_full_data["artists_id"].values
genre_list = []
n = np.shape(artists_id)[0]
for id_field in artists_id:
    song_genre = "Unknown"
    # Drop the first and last character (presumably the surrounding brackets of
    # a textual id list — TODO confirm against the CSV) and split on commas.
    for token in id_field[1:-1].split(','):
        token = token.strip()
        if not token:
            # An empty list yields a single empty token; int('') would raise.
            continue
        artist = int(token)
        if artist in musician_dict:
            song_genre = musician_dict[artist]
            break
    genre_list.append(song_genre)

# Distinct genres in order of first appearance. The list is wrapped in an
# ndarray because passing a plain list to pd.unique is deprecated in pandas.
genres = pd.unique(np.array(genre_list, dtype=object))

In [66]:
# Load the per-artist table; `info_columns` selects its feature columns.
raw_data_artist = pd.read_csv("data_by_artist.csv")

# Min-max scale every feature column so that no single feature dominates the
# distance / similarity computations purely because of its numeric range.
artist_features = raw_data_artist.loc[:, info_columns].to_numpy()
scaler = MinMaxScaler()
X = scaler.fit_transform(artist_features)

I tried both the raw data $X$ and the transformed data $XC^T$, where the coefficient matrix $C^T$ comes from the Factor Analysis (FA) in Problem 2.  
The results show that, after the FA linear transform, the divergence between genres is larger and the divergence within a genre is smaller.

In [70]:
# Project every artist with a known genre through the factor-analysis
# coefficients and group the artist ids by genre.
fa_coef = pd.read_csv("P2_transform_coefficients.csv", index_col = 0)
musician_embedding = dict()
genre_dict = {g: [] for g in genres}

# Convert the coefficient table to an ndarray once; passing the DataFrame to
# np.dot inside the loop would repeat that conversion for every artist.
fa_matrix = fa_coef.to_numpy()
for row, artist_id in enumerate(raw_data_artist["artist_id"].to_numpy()):
    if artist_id not in musician_dict:
        # Artists absent from the influence data have no genre label; skip them.
        continue
    musician_embedding[artist_id] = np.dot(X[row, :], fa_matrix)
    # Raises KeyError if this artist's genre is not one of `genres`.
    genre_dict[musician_dict[artist_id]].append(artist_id)

In [96]:
# Mean pairwise cosine similarity between the artist embeddings of every pair
# of genres. The table is created here so the cell works on a fresh kernel and
# can be re-run safely; entries that cannot be computed stay NaN.
similarity_matrix = pd.DataFrame(np.nan, index=genres, columns=genres, dtype=float)

# Stack each genre's embeddings (and their row-wise L2 norms) once, instead of
# rebuilding both arrays for every genre pair inside the double loop.
genre_embedding, genre_norm = {}, {}
for g in genres:
    if not genre_dict[g]:
        # No embedded artists: norm(..., axis=1) would fail on an empty array.
        continue
    genre_embedding[g] = np.array([musician_embedding[artist] for artist in genre_dict[g]])
    genre_norm[g] = np.linalg.norm(genre_embedding[g], 2, axis = 1, keepdims = True)

for g1 in genres:
    for g2 in genres:
        if g1 not in genre_embedding or g2 not in genre_embedding:
            continue
        g1_embedding, g2_embedding = genre_embedding[g1], genre_embedding[g2]

#       Gaussian similarity (alternative, kept for reference)
#         sim = np.zeros((len(g1_embedding), len(g2_embedding)))
#         for i in range(len(g1_embedding)):
#             sim[i, :] = np.exp(-np.linalg.norm(g1_embedding[i]-g2_embedding, 2, axis = 1)**2 / 0.18)
#         similarity_matrix.loc[g2, g1] = np.mean(sim[:])

#       Cosine similarity
        # Outer product of the norms gives |a_i| * |b_j| for every artist pair.
        denominator = np.dot(genre_norm[g1], genre_norm[g2].T)
        pairwise_cosine = np.dot(g1_embedding, g2_embedding.T) / denominator
        # Column g1 / row g2, written with a single .loc call: chained
        # indexing (df[g1][g2] = ...) may assign to a temporary copy.
        # Diagonal entries include each artist paired with itself.
        similarity_matrix.loc[g2, g1] = np.mean(pairwise_cosine)

In [118]:
# Plain-text dump of the genre-by-genre similarity table filled in by the previous cell.
print(similarity_matrix)

                Electronic      R&B;     Vocal  Pop/Rock  Religious     Blues  \
Electronic        0.757994  0.800035  0.356712  0.796576   0.739828  0.670787   
R&B;              0.800035  0.912202  0.542164  0.871274   0.852849  0.814321   
Vocal             0.356712  0.542164  0.828474  0.398491   0.510668  0.705910   
Pop/Rock          0.796576  0.871274  0.398491  0.861008   0.812561  0.726816   
Religious         0.739828  0.852849  0.510668  0.812561   0.799240  0.760781   
Blues             0.670787  0.814321  0.705910  0.726816   0.760781  0.826867   
Country           0.740361  0.872926  0.620734  0.809529   0.817225  0.823535   
Unknown           0.799540  0.909668  0.546850  0.867889   0.849714  0.815742   
Jazz              0.442060  0.549905  0.681034  0.448407   0.505824  0.659232   
Latin             0.744510  0.869425  0.619469  0.807146   0.812066  0.823025   
New Age          -0.021641  0.007611  0.423562 -0.080618  -0.007625  0.211812   
Folk              0.434905  

In [121]:
# Interactive heatmap of the genre-by-genre similarity table. The title and
# the axis / colour-bar labels let the figure be read without the code.
fig = px.imshow(
    similarity_matrix,
    labels=dict(x="Genre", y="Genre", color="Mean cosine similarity"),
    title="Mean pairwise cosine similarity between genres",
)
fig.show()

# Static matplotlib alternative (kept for renderers that cannot show plotly):
# fig, ax = plt.subplots()
# im = ax.imshow(similarity_matrix)
# ax.set_xticks(np.arange(len(genres)))
# ax.set_yticks(np.arange(len(genres)))
# ax.set_xticklabels(genres)
# ax.set_yticklabels(genres)
# plt.setp(ax.get_xticklabels(), rotation=60, ha="right",
#          rotation_mode="anchor")
# cbar = ax.figure.colorbar(im, ax=ax)
# cbar.ax.set_ylabel("", rotation=-90, va="bottom")
# plt.show()