In [1]:
import fastavro
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import seaborn as sns

In [2]:
avro_file = 'data/easyNormalized.avro'

with open(avro_file, 'rb') as f:
    avro_data = list(fastavro.reader(f))

df = pd.DataFrame.from_records(avro_data)

In [3]:
df_selected = df.loc[:,['danceability', 'duration', 'end_of_fade_in', 'loudness', 'tempo',
       'hotness','year', 'time_signature']]

# K-mean

## Model training

In [4]:
k = 100  # Number of clusters

kmeans = KMeans(n_clusters=k)
kmeans.fit(df_selected)

cluster_centers = kmeans.cluster_centers_

labeled_df = df.copy()
labeled_df['Cluster'] = kmeans.labels_

## Get the most k related center

In [116]:
track_id = b"TRAXLZU12903D05F94"

In [109]:
def getKMostRelatedCenter(track_id, labeled_df, cluster_centers, k=3)-> list:
    """
    Return the list of k most close centers

    Args: 
        track id, labeled_df, cluster_centers, k
    
    Returns:
        a list
    """

    song = labeled_df[labeled_df["track_id"] == track_id]
    song = song.loc[:,['danceability', 'duration', 'end_of_fade_in', 'loudness', 'tempo',
       'hotness','year', 'time_signature']].to_numpy()
    dis = []
    for center in cluster_centers:
        dis.append(np.dot(song, center) / (np.linalg.norm(song) * np.linalg.norm(center)))
    
    sorted_np = np.argsort(np.array(dis), axis=0)[:,0] < k
    index = range(0, np.shape(cluster_centers)[0])

    cluster_centers_withindex = np.hstack((cluster_centers, np.array(index).reshape(-1, 1)))
    selected_centers = cluster_centers_withindex[sorted_np][:, -1]

    return list(selected_centers)

In [110]:
# select the n most related centers
selected_center_num = 10

related_centers = getKMostRelatedCenter(b"TRAXLZU12903D05F94", labeled_df, cluster_centers, selected_center_num)

In [111]:
df_related = labeled_df[labeled_df["Cluster"].isin(related_centers)]

In [112]:
features_numpy = df_related.loc[:,['danceability', 'duration', 'end_of_fade_in', 'loudness', 'tempo',
       'hotness','year', 'time_signature']].to_numpy()

song = labeled_df[labeled_df["track_id"] == b"TRAXLZU12903D05F94"]
song_numpy = song.loc[:,['danceability', 'duration', 'end_of_fade_in', 'loudness', 'tempo',
       'hotness','year', 'time_signature']].to_numpy()

song_numpy_duplicate = np.tile(song_numpy, (features_numpy.shape[0], 1))
cos_numpy = np.transpose(np.dot(features_numpy, np.transpose(song_numpy))) / np.transpose(np.linalg.norm(features_numpy, axis=1)) / np.transpose(np.linalg.norm(song_numpy_duplicate, axis=1))
cos_numpy = np.transpose(cos_numpy)

df_related["Cos_correlation_factor"] = cos_numpy


In [115]:
def selected_n_most_related_in_groupByDataFrame(df : pd.DataFrame):
    # default select 10 songs
    n = 1
    return (df.sort_values(by= "Cos_correlation_factor", ascending=False).iloc[:n, :])

df_related.groupby(by = "Cluster").apply(selected_n_most_related_in_groupByDataFrame)
    

Unnamed: 0_level_0,Unnamed: 1_level_0,danceability,duration,end_of_fade_in,loudness,tempo,hotness,song_id,track_id,album_id,year,title,time_signature,Cluster,Cos_correlation_factor
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
24,294093,0.0,0.056866,0.000227,0.632392,0.283683,0.669305,b'SOFSFRQ12A8C140A14',b'TRHQBBE128F42B15CE',238334,0.41573,b'Caress Me Baby',0.333333,24,0.99242
26,82820,0.0,0.080952,0.000162,0.789091,0.446416,0.157274,b'SOWVYOI12A8C1392C2',b'TRCDRFH128F42779B7',158796,0.73971,b'\xc3\x89\xc3\xa9n Nacht Met Jou',0.5,26,0.884449
37,797796,0.0,0.001601,0.000104,0.687228,0.527149,0.542865,b'SODHWOU12AC9DCB042',b'TRUSVCS128F92E7941',434456,0.692023,b'Outbound',0.211311,37,0.957317
41,167759,0.0,0.082165,0.001925,0.650858,0.293377,0.365144,b'SOUMBUR12A670216F0',b'TREITVF128E0786E89',8912,0.730337,b'Maybe It Was Magic',0.333333,41,0.954277
65,725045,0.0,0.020634,0.00032,0.483263,0.271223,0.556969,b'SOIXSDM12A8C13573A',b'TRSVDUX128F426F39F',91413,0.322092,b'Marcia Disperata',0.0,65,0.932654
66,481439,0.0,0.063038,0.000143,0.646009,0.266219,0.531722,b'SOBNRBJ12A8C13C78F',b'TRMMTKC128F42821B7',217657,0.606505,"b""Giz Starts Buggin' (LP Version)""",0.5,66,0.987778
69,919543,0.0,0.050875,0.0,0.766209,0.373787,0.544001,b'SOLGCQG12A8C13CC94',b'TRXXMYZ128F427788E',251359,0.743123,b'Marcha Real',0.0,69,0.925604
78,668626,0.0,0.051933,0.000332,0.55794,0.258997,0.59437,b'SOPTVNF12A6D4F6327',b'TRRIXCH128EF35CC91',80509,0.529575,b'Los llantos del alba',0.333333,78,0.999636
85,1286,0.0,0.064803,0.002439,0.864055,0.536194,0.550624,b'SORPJVW12AB01866BA',b'TRAAVMC128F932F388',505526,0.831545,b'World Premier',0.5,85,0.968891
91,968645,0.0,0.074289,0.000341,0.69294,0.482617,0.773615,b'SORIVNZ12A8AE48994',b'TRZESLM128F4237EE6',185020,0.270922,b'Sto Piangendo',0.5,91,0.951392


In [119]:
labeled_df[labeled_df["track_id"] == track_id]

Unnamed: 0,danceability,duration,end_of_fade_in,loudness,tempo,hotness,song_id,track_id,album_id,year,title,time_signature,Cluster
35330,0.0,0.069658,0.000119,0.8069,0.359387,0.864249,b'SOCWJDB12A58A776AF',b'TRAXLZU12903D05F94',786795,0.730337,b'Never Gonna Give You Up',0.5,46


In [124]:
def Cosine(data_A, data_B):
    sumData = np.dot(data_A, data_B)
    denom = np.linalg.norm(data_A) * np.linalg.norm(data_B)
    # normalized to [0,1]
    return (1 - sumData / denom) / 2