In [1]:
from code.data_utils import get_token, get_auth_header, get_recs, get_audio_features
import pandas as pd

5835805e22a345948373bd954162324c 41b27d9b0cbe44deb1d9edd7d306160d


In [2]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

# Define the circle of fifths order
CIRCLE_OF_FIFTHS = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]

# Create a mapping from pitch classes to their positions in the circle of fifths
PITCH_CLASS_TO_POSITION = {pc: idx for idx, pc in enumerate(CIRCLE_OF_FIFTHS)}

# Number of pitch classes
N_PITCH_CLASSES = len(CIRCLE_OF_FIFTHS)

# Calculate the angle for each pitch class
ANGLES = {pc: (2 * np.pi * idx) / N_PITCH_CLASSES for pc, idx in PITCH_CLASS_TO_POSITION.items()}

def angle_to_vector(angle):
    return np.array([np.cos(angle), np.sin(angle)])

# Create vectors for each pitch class
PITCH_CLASS_VECTORS = {pc: angle_to_vector(angle) for pc, angle in ANGLES.items()}

# def cosine_similarity(vec1, vec2):
#     return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def normalize_cosine_similarity(cosine_similarity):
    """
    Normalize cosine similarity from [-1, 1] to [0, 1].
    """
    return (cosine_similarity + 1) / 2
    
def normalize_audio_feats(feats_df, include_categorical=True, circle_5 = True):
    # Normalize continuous features
    scaler = MinMaxScaler()
    continuous_features = ['tempo', 'valence', 'liveness', 'instrumentalness', 'acousticness', 'speechiness', 'loudness', 'energy', 'danceability']
    feats_df[continuous_features] = scaler.fit_transform(feats_df[continuous_features])

    # Encode categorical features
    if include_categorical:
        if circle_5: 
            categorical_encoded = np.array([PITCH_CLASS_VECTORS[k] for k in feats_df['key'].tolist()])
            categorical_encoded_df = pd.DataFrame(categorical_encoded)
            categorical_encoded_df.columns = ['C5_0', 'C5_1']
        else:
            encoder = OneHotEncoder(sparse_output=False)
            categorical_features = feats_df[['key']]
            categorical_encoded = encoder.fit_transform(categorical_features)
            categorical_encoded_df = pd.DataFrame(categorical_encoded)
        
        # # Combine all features
        feats_df = pd.concat([feats_df[continuous_features], categorical_encoded_df], axis=1)
    # print(feats_df)    
    # scaler = MinMaxScaler()
    # feats_df = pd.DataFrame(scaler.fit_transform(feats_df))
    # feats_df.columns = continuous_features+['C5_0', 'C5_1']
    # print(feats_df)
    return feats_df

def compare_seeds_rec(seed_audio_feats_df, rec_audio_feats_df, include_categorical=True, circle_5 = True):
    n_seeds = len(seed_audio_feats_df)
    audio_feats_df = pd.concat([seed_audio_feats_df, rec_audio_feats_df], axis=0, ignore_index=True)
    audio_feats_df = normalize_audio_feats(audio_feats_df, include_categorical, circle_5)
    d = cosine_similarity(audio_feats_df.iloc[n_seeds:,:], audio_feats_df.iloc[:n_seeds,:])
    d_max = np.max(d, axis=1)
    return d_max

In [3]:
from requests import post, get
import json 

def xlist(x):
    return (x or [])

def xtolist(x):
    #handle None type
    x = xlist(x)

    # Check if x is a string
    if isinstance(x, str):
        # Convert the string to a list containing the string
        x = [x]
    elif isinstance(x, list):
        # Check if each item in the list is a string, if not, raise an error
        if not all(isinstance(item, str) for item in x):
            raise ValueError("Each item in the input list must be a string")
    else:
        # If the input is neither a string nor a list, raise an error or handle it as needed
        raise ValueError("input-x must be a string or a list of strings")

    return x


def get_recs(token, seed_tracks=None, seed_artists=None, limit=None, max_popularity=None, 
             min_tempo=None, max_tempo=None, min_key=None, max_key=None, target_valence=None, 
             target_key=None, target_tempo=None, time_signature=None, mode=None,
            min_energy=None, max_energy=None, min_valence=None, min_danceability=None):
    seed_tracks = xtolist(seed_tracks)
    seed_artists = xtolist(seed_artists)

    if not seed_tracks and not seed_artists:
        raise ValueError("Must provide at least one seed track or seed artist")

    headers = get_auth_header(token)
    query_url = "https://api.spotify.com/v1/recommendations?"
    if seed_tracks:
        query_url += f"seed_tracks={'%2C'.join(seed_tracks)}"
    if seed_artists:
        query_url += f"&seed_artists={'%2C'.join(seed_artists)}"
    if isinstance(limit, int):
        query_url += f"&limit={limit}"
    if isinstance(max_popularity, int):
        query_url += f"&max_popularity={max_popularity}"
    if isinstance(min_tempo, int):
        query_url += f"&min_tempo={min_tempo}"
    if isinstance(max_tempo, int):
        query_url += f"&max_tempo={max_tempo}"
    if isinstance(min_key, int):
        query_url += f"&min_key={min_key}"
    if isinstance(max_key, int):
        query_url += f"&max_key={max_key}"
    if isinstance(target_key, int):
        query_url += f"&target_key={target_key}"
    if isinstance(target_valence, float):
        query_url += f"&target_valence={target_valence}"
    if isinstance(min_valence, float):
        query_url += f"&min_valence={min_valence}"
    if isinstance(min_danceability, float):
        query_url += f"&min_danceability={min_danceability}"
    if isinstance(target_tempo, float):
        query_url += f"&target_tempo={target_tempo}"
    if isinstance(time_signature, int):
        query_url += f"&time_signature={time_signature}"
    if isinstance(mode, int):
        query_url += f"&mode={mode}"
    if isinstance(min_energy, float):
        query_url += f"&min_energy={min_energy}"
    if isinstance(max_energy, float):
        query_url += f"&max_energy={max_energy}"

    result = get(query_url, headers=headers)
    print(result)
    js_result = json.loads(result.content)['tracks']
    
    # print(js_result)
    ids = [res['artists'][0]['name']  for res in js_result]
    artists = [res['artists'][0]['name']  for res in js_result]
    release_dates = [res['album']['release_date'] for res in js_result]
    df_track_recs = pd.DataFrame(js_result)[['id', 'name', 'popularity']]
    df_track_recs.insert(0, 'artist', artists)
    df_track_recs.insert(0, 'release_date', release_dates)
    df_track_recs['query'] = [f"artist:\042{a}\042 track:\042{t}\042" for a,t in zip(df_track_recs['artist'].tolist(), df_track_recs['name'].tolist())]
    return df_track_recs

In [388]:
token=get_token()
print(token)

BQDpwb3-MBTXERckRvR0NEhQMhp2bCUReSshZ89k5HJFf4PQhoruSvlqPOCdBqJV320wriyrDXxVqaWZ785kpjd20WQbXx6eOruaEqExyDY9Yu3Z7qg


In [409]:
#https://open.spotify.com/track/5BckPAYcKEJuYs1eV1BHHe?si=673046463e194519
#https://open.spotify.com/track/1y5V5qja332UyMeUurFhDS?si=dd94312627474256
#https://open.spotify.com/track/1Pq47iFLC5U7j8xeNiNcuS?si=4c3805d42336452c 
#https://open.spotify.com/track/1Pq47iFLC5U7j8xeNiNcuS?si=3261d74375214611
#https://open.spotify.com/track/2grjqo0Frpf2okIBiifQKs?si=e5776c0fb52548b1


seed_tracks = ['2grjqo0Frpf2okIBiifQKs', '1Pq47iFLC5U7j8xeNiNcuS']
seed_artists = None #['4Z8W4fKeB5YxbusRsdQVPb']
seed_audio_feats = get_audio_features(token, seed_tracks)
seed_audio_feats_df = pd.DataFrame(seed_audio_feats)
seed_audio_feats_df = seed_audio_feats_df.loc[:,['tempo', 'valence', 'liveness', 'instrumentalness',
       'acousticness', 'speechiness', 'mode', 'loudness', 'key', 'energy',
       'danceability']]
n_seeds = len(seed_tracks)

2
request 1 - 2 tracks completed
Elapsed Time: 0.8174185752868652 seconds


In [410]:
# seed_audio_feats_df.tempo = [86.851*2, 121.178]
seed_audio_feats_df
#CIRCLE_OF_FIFTHS = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,125.926,0.979,0.269,0.00131,0.168,0.0298,1,-7.264,9,0.832,0.697
1,128.388,0.961,0.0393,0.0,0.368,0.0625,1,-11.708,9,0.501,0.692


In [416]:
rec_df = get_recs(token, seed_tracks=seed_tracks, seed_artists=seed_artists, limit=100, max_popularity=80,
                 min_tempo=None, max_tempo=None, time_signature=None, target_key=None, mode=None, min_energy=None, max_energy=None,
                 min_valence=None, min_danceability=None)
with pd.option_context('display.max_colwidth', None): 
    # display(rec_df.sort_values('release_date', ascending=True))
    display(rec_df.head())

rec_track_ids = rec_df['id'].tolist()
rec_audio_feats = get_audio_features(token, rec_track_ids)
rec_audio_feats_df = pd.DataFrame(rec_audio_feats)

rec_audio_feats_df = rec_audio_feats_df.loc[:,['tempo', 'valence', 'liveness', 'instrumentalness',
       'acousticness', 'speechiness', 'mode', 'loudness', 'key', 'energy',
       'danceability']]


rec_track_ids = rec_df['id'].tolist()
rec_audio_feats = get_audio_features(token, rec_track_ids)
rec_audio_feats_df = pd.DataFrame(rec_audio_feats)

rec_audio_feats_df = rec_audio_feats_df.loc[:,['tempo', 'valence', 'liveness', 'instrumentalness',
       'acousticness', 'speechiness', 'mode', 'loudness', 'key', 'energy',
       'danceability']]

<Response [200]>


Unnamed: 0,release_date,artist,id,name,popularity,query
0,2022-04-20,"Grover Washington, Jr.",4Wfxqaksd2ohhZM57DagnA,Just the Two of Us (feat. Bill Withers) - Edit,28,"artist:""Grover Washington, Jr."" track:""Just the Two of Us (feat. Bill Withers) - Edit"""
1,1980-11-17,John Lennon,5URfZHMlUWTWxPvvSBWcPk,Beautiful Boy (Darling Boy) - Remastered 2010,68,"artist:""John Lennon"" track:""Beautiful Boy (Darling Boy) - Remastered 2010"""
2,1968-08-03,Engelbert Humperdinck,03MOcbkkoNLjP0G91iODSB,Quando Quando Quando,61,"artist:""Engelbert Humperdinck"" track:""Quando Quando Quando"""
3,1983-08-19,"Earth, Wind & Fire",2qTmwq9fbpGVnMHQIhLSgF,Dance Dance Dance,28,"artist:""Earth, Wind & Fire"" track:""Dance Dance Dance"""
4,1975-01-10,Little Nell,4WFeJTXNHIS2wURtwlAkhu,Time Warp,58,"artist:""Little Nell"" track:""Time Warp"""


50
request 1 - 50 tracks completed
Next request in ~ 2 seconds
Elapsed Time: 2.9829976558685303 seconds
100
request 2 - 100 tracks completed
Elapsed Time: 3.60513973236084 seconds
50
request 1 - 50 tracks completed
Next request in ~ 1 seconds
Elapsed Time: 1.7184162139892578 seconds
100
request 2 - 100 tracks completed
Elapsed Time: 2.4010775089263916 seconds


In [412]:
# sub_rec_audio_feats_df = rec_audio_feats_df[rec_audio_feats_df.key.isin([0, 1, 2, 6, 7, 8, 9, 11])].copy()
seed_audio_feats_df_sub = seed_audio_feats_df.iloc[[0],:].copy()
seed_audio_feats_df_sub

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,125.926,0.979,0.269,0.00131,0.168,0.0298,1,-7.264,9,0.832,0.697


In [156]:
# 11,0
# 7, 1
# 6, 0
# 2, 1
# 0, 0
# 1, 0
seed_audio_feats_df_sub = pd.DataFrame(np.repeat(seed_audio_feats_df_sub.values, 2, axis=0))
seed_audio_feats_df_sub.columns = seed_audio_feats_df.columns
# seed_audio_feats_df_sub.key = np.array([0, 4]).astype(int)
# # seed_audio_feats_df_sub['mode']= np.array([1, 0]).astype(int)

seed_audio_feats_df_sub.key = np.array([2, 7]).astype(int)
seed_audio_feats_df_sub['mode']= np.array([1, 1]).astype(int)


# np.array([7, 2, 9, 11, 6, 2]).reshape(len(seed_audio_feats_df_sub), -1)
seed_audio_feats_df_sub

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,2,0.413,0.364
1,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,7,0.413,0.364


In [169]:
seed_audio_feats_df_sub

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,2,0.413,0.364
1,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,7,0.413,0.364


In [417]:
half_time=True
d_max = compare_seeds_rec(seed_audio_feats_df_sub.copy(), rec_audio_feats_df.copy(), include_categorical=True, circle_5 = True)
if half_time:
    # seed_audio_feats_df.at[0, 'tempo'] = seed_audio_feats_df.at[0, 'tempo']/ 2
    # seed_audio_feats_df.at[1, 'tempo'] = seed_audio_feats_df.at[1, 'tempo']/ 2
    # seed_audio_feats_df.at[2, 'tempo'] = seed_audio_feats_df.at[2, 'tempo']/ 2
    # seed_audio_feats_df.at[3, 'tempo'] = seed_audio_feats_df.at[3, 'tempo']/ 2
    # seed_audio_feats_df.at[4, 'tempo'] = seed_audio_feats_df.at[4, 'tempo']/ 2
    seed_audio_feats_df_sub.tempo = seed_audio_feats_df_sub.tempo.values/2 
    
    # seed_audio_feats_df_sub = seed_audio_feats_df.iloc[:2,:].copy()
    d_max_half = compare_seeds_rec(seed_audio_feats_df_sub.copy(), rec_audio_feats_df.copy(), include_categorical=True, circle_5 = True)
    # seed_audio_feats_df.at[0, 'tempo'] = seed_audio_feats_df.at[0, 'tempo']* 2
    # seed_audio_feats_df.at[1, 'tempo'] = seed_audio_feats_df.at[1, 'tempo']* 2
    # seed_audio_feats_df.at[2, 'tempo'] = seed_audio_feats_df.at[2, 'tempo']* 2
    # seed_audio_feats_df.at[3, 'tempo'] = seed_audio_feats_df.at[3, 'tempo']* 2
    # seed_audio_feats_df.at[4, 'tempo'] = seed_audio_feats_df.at[4, 'tempo']* 2
    seed_audio_feats_df_sub.tempo = seed_audio_feats_df_sub.tempo.values*2
    
    d_max = np.vstack([d_max, d_max_half]).max(0)
    
print(np.sort(d_max)[::-1][:15])
print(np.argsort(d_max)[::-1][:15])

# rec_df.iloc[[0],:].copy()
top_sim_ix = np.argsort(d_max.flatten())[::-1][:25] #[1,None]
top_sims = d_max[top_sim_ix, None]

# sub_rec_df = rec_df.iloc[sub_rec_audio_feats_df.index.tolist(),:].copy()
with pd.option_context('display.max_colwidth', None): 
    # display(rec_df.sort_values('release_date', ascending=True))
    display(rec_df.iloc[top_sim_ix,:])



[0.97064385 0.95888762 0.95826224 0.95336237 0.95245123 0.94609881
 0.94523016 0.94104791 0.90553739 0.90512107 0.90471012 0.89843453
 0.89413535 0.88950261 0.88903643]
[13 89  3 24 86 27 63 65 87 14 50 84 57 64 41]


Unnamed: 0,release_date,artist,id,name,popularity,query
13,1970-01-01,Edison Lighthouse,5Ts1DYOuouQLgzTaisxWYh,Love Grows (Where My Rosemary Goes),72,"artist:""Edison Lighthouse"" track:""Love Grows (Where My Rosemary Goes)"""
89,1973-07-16,Elvis Presley,7zMUCLm1TN9o9JlLISztxO,Burning Love,77,"artist:""Elvis Presley"" track:""Burning Love"""
3,1983-08-19,"Earth, Wind & Fire",2qTmwq9fbpGVnMHQIhLSgF,Dance Dance Dance,28,"artist:""Earth, Wind & Fire"" track:""Dance Dance Dance"""
24,1978-11-24,Rod Stewart,4z7maGZkAonDlXlwo8q69f,Da Ya Think I'm Sexy?,71,"artist:""Rod Stewart"" track:""Da Ya Think I'm Sexy?"""
86,1973-01-22,Elton John,6WCeFNVAXUtNczb7lqLiZU,Crocodile Rock,73,"artist:""Elton John"" track:""Crocodile Rock"""
27,1977,Boney M.,1BqnZOkYJbvYLOhN0qPJDm,Ma Baker,64,"artist:""Boney M."" track:""Ma Baker"""
63,1976-01-01,Rose Royce,2pbWkjtGtjkzBdZ95GFINm,Car Wash - Long Version,50,"artist:""Rose Royce"" track:""Car Wash - Long Version"""
65,1999-01-01,Blondie,4qO03RMQm88DdpTJcxlglY,Call Me,73,"artist:""Blondie"" track:""Call Me"""
87,1981,George Benson,6a899bXlCuViXW2Qs8Rdqo,Turn Your Love Around,60,"artist:""George Benson"" track:""Turn Your Love Around"""
14,1982-07-22,Dexys Midnight Runners,3MrWxJaD2AT0W9DjWF64Vm,Come On Eileen,76,"artist:""Dexys Midnight Runners"" track:""Come On Eileen"""


In [178]:
seed_audio_feats_df_sub


Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,2,0.413,0.364
1,173.702,0.249,0.118,0.371,0.0946,0.0262,1,-12.602,7,0.413,0.364


In [175]:
# CIRCLE_OF_FIFTHS = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]
rec_audio_feats_df.iloc[top_sim_ix,:].head(25)#.sort_values('tempo', ascending=False)

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
46,173.32,0.251,0.114,0.481,0.148,0.0299,1,-16.378,2,0.301,0.278
59,93.512,0.218,0.101,0.709,0.00101,0.0263,1,-11.764,2,0.532,0.33
50,73.543,0.135,0.202,0.0997,0.167,0.0297,1,-12.592,2,0.229,0.463
75,94.843,0.204,0.14,5.2e-05,0.0783,0.0265,1,-14.918,7,0.434,0.439
41,74.807,0.324,0.172,9.3e-05,0.0638,0.0258,1,-9.129,7,0.501,0.36
21,82.495,0.623,0.0999,0.849,0.000413,0.0584,1,-6.281,7,0.682,0.406
97,204.105,0.353,0.206,0.0558,0.491,0.0327,1,-11.861,2,0.299,0.351
12,183.87,0.651,0.14,0.0,0.0286,0.0343,1,-7.287,2,0.493,0.386
54,149.508,0.491,0.11,0.635,0.0277,0.0574,0,-2.352,2,0.9,0.303
98,91.608,0.312,0.363,0.141,0.0115,0.0658,1,-8.065,7,0.755,0.23


In [13]:
sub_rec_audio_feats_df = rec_audio_feats_df[rec_audio_feats_df.key.isin([2, 6, 7, 9, 11])].copy()
sub_rec_audio_feats_df = sub_rec_audio_feats_df.sort_values('valence', ascending=False).head(20).copy()
sub_rec_audio_feats_df

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
35,163.559,0.956,0.344,0.649,0.0142,0.0326,1,-11.797,9,0.797,0.587
57,155.997,0.873,0.0711,0.0139,0.0232,0.0389,1,-10.591,9,0.575,0.557
27,119.822,0.843,0.259,0.475,0.231,0.0352,0,-15.663,9,0.451,0.743
84,105.04,0.766,0.11,0.531,0.0973,0.0407,0,-7.457,7,0.912,0.574
3,138.039,0.698,0.045,0.521,0.00954,0.0932,1,-7.058,2,0.872,0.703
79,116.036,0.673,0.101,0.784,0.00404,0.0483,0,-6.096,11,0.842,0.737
36,108.011,0.657,0.444,0.92,0.0044,0.0754,0,-9.618,6,0.714,0.659
4,130.992,0.609,0.0158,0.838,0.0346,0.0424,0,-8.718,11,0.732,0.661
23,126.003,0.566,0.064,0.822,0.00182,0.067,0,-4.776,11,0.977,0.802
58,128.004,0.544,0.362,0.905,0.00356,0.057,1,-18.451,11,0.424,0.705


In [260]:
with pd.option_context('display.max_colwidth', None): 
    # display(rec_df.sort_values('release_date', ascending=True))
    display(rec_df.iloc[sub_rec_audio_feats_df.index.tolist(),:])

Unnamed: 0,release_date,artist,id,name,popularity,query
33,1977-01-01,The Greg Kihn Band,69V4Pshg17LP3cowmxZzWg,For You,18,"artist:""The Greg Kihn Band"" track:""For You"""
73,2019-06-19,Supermax,0yXJhOVoKmaeZVoO6rEwcS,Lovemachine,19,"artist:""Supermax"" track:""Lovemachine"""
96,2010-04-27,Urban Heroes,7FTeQEsOYCwQ64kp1w6EfY,Get It,7,"artist:""Urban Heroes"" track:""Get It"""
68,1986,Talk Talk,03d3DCqwvt65Orfdomjs6e,Living in Another World - 1997 Remaster,49,"artist:""Talk Talk"" track:""Living in Another World - 1997 Remaster"""
49,2011-05-30,Flash and the Pan,2xhHL62mvRlTVzyByKGQAN,Ayla,5,"artist:""Flash and the Pan"" track:""Ayla"""
14,1977-07-19,John Miles,07V8ZQUzN8kxm5E3rONkD7,Stranger In The City - Remastered 1985,36,"artist:""John Miles"" track:""Stranger In The City - Remastered 1985"""
81,1984-02-20,The Smiths,5bSASu4W0HJx6CuG8rbRcA,Reel Around the Fountain - 2011 Remaster,53,"artist:""The Smiths"" track:""Reel Around the Fountain - 2011 Remaster"""
85,2000-10-23,Blur,5CeL9C3bsoe4yzYS1Qz8cw,Girls & Boys,68,"artist:""Blur"" track:""Girls & Boys"""
26,2005-07-19,Iggy Pop,3Xws3f95Sv9XGWHTWuKgXm,Home,7,"artist:""Iggy Pop"" track:""Home"""
67,1969-10-22,Led Zeppelin,3MODES4TNtygekLl146Dxd,Ramble On - 1990 Remaster,68,"artist:""Led Zeppelin"" track:""Ramble On - 1990 Remaster"""


In [32]:
seed_audio_feats_df

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,134.092,0.395,0.156,0.0,0.589,0.907,1,-22.075,6,0.172,0.695
1,99.15,0.732,0.0403,0.00816,0.0189,0.042,0,-6.706,6,0.607,0.86


In [33]:
# CIRCLE_OF_FIFTHS = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]
rec_audio_feats_df.iloc[top_sim_ix,:].head(25)#.sort_values('tempo', ascending=False)

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
19,114.96,0.913,0.233,0.0024,0.389,0.0282,0,-7.843,6,0.678,0.874
87,106.929,0.799,0.241,0.0,0.0507,0.13,1,-13.497,6,0.543,0.863
5,105.894,0.732,0.043,0.000249,0.0194,0.0287,1,-10.727,1,0.642,0.717
70,118.881,0.947,0.183,0.0461,0.126,0.089,1,-9.875,11,0.821,0.878
83,103.912,0.751,0.0784,0.0,0.00509,0.0436,1,-5.9,11,0.846,0.611
11,135.03,0.677,0.0427,0.000326,0.00979,0.0456,1,-13.045,11,0.692,0.699
63,128.175,0.918,0.431,0.0702,0.000244,0.035,0,-3.762,6,0.963,0.554
93,124.945,0.507,0.0625,0.000169,0.0158,0.0444,0,-11.402,11,0.726,0.625
29,114.934,0.688,0.0462,1.1e-05,0.327,0.0394,0,-13.396,11,0.693,0.676
78,136.213,0.84,0.067,0.00512,0.0314,0.0353,1,-11.767,1,0.393,0.617


In [15]:
seed_audio_feats_df

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,133.096,0.975,0.0945,0.14,0.000104,0.0614,1,-7.937,7,0.733,0.792


In [121]:
ix_by_key = np.where(rec_audio_feats_df.key.isin([2]))[0]
sub_rec_audio_feats_df = rec_audio_feats_df.iloc[ix_by_key,:].sort_values('tempo', ascending=False)
sub_rec_audio_feats_df[sub_rec_audio_feats_df['mode'].isin([0])]

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
29,153.917,0.6,0.112,0.859,0.471,0.0656,0,-13.095,2,0.417,0.739
57,130.069,0.322,0.0751,0.88,0.00357,0.0372,0,-13.551,2,0.314,0.622
6,126.004,0.276,0.0645,0.738,0.00314,0.0343,0,-9.929,2,0.615,0.666
1,125.02,0.835,0.204,0.67,0.0143,0.0484,0,-6.818,2,0.979,0.683
79,124.866,0.523,0.0629,0.928,0.00534,0.129,0,-9.421,2,0.611,0.824
15,121.965,0.934,0.359,0.948,0.0523,0.0498,0,-15.004,2,0.681,0.839
73,121.007,0.47,0.0928,0.863,0.0041,0.0439,0,-8.656,2,0.845,0.656
45,120.626,0.743,0.897,0.126,0.00426,0.0494,0,-10.465,2,0.811,0.776
58,120.007,0.265,0.0847,0.906,0.00269,0.144,0,-14.703,2,0.551,0.811
68,119.012,0.488,0.0731,0.716,0.00766,0.0465,0,-10.95,2,0.789,0.689


In [110]:

with pd.option_context('display.max_colwidth', None): 
    # display(rec_df.sort_values('release_date', ascending=True))
    display(rec_df.iloc[sub_rec_audio_feats_df.index.tolist(),:])

Unnamed: 0,release_date,artist,id,name,popularity,query
27,1994-02-28,Banco De Gaia,2SzesHmjy2aeZYPXiR8ao5,Sheesha,9,"artist:""Banco De Gaia"" track:""Sheesha"""
21,1995-05-01,Banco De Gaia,3j1ZnSGeRxNccBaAodOg5N,Eagle,9,"artist:""Banco De Gaia"" track:""Eagle"""
66,2003-01-01,Martina Topley-Bird,4E74qeu2SgY5Q6L1WgILcM,I Still Feel,19,"artist:""Martina Topley-Bird"" track:""I Still Feel"""
54,1999,Dot Allison,39m8TmbpJZ1WF2UDv9XCsu,Morning Sun,7,"artist:""Dot Allison"" track:""Morning Sun"""
56,2006-12-04,Language Lab,0uLWjkPhVrI4MXa0sbtSIn,Burning Disaster - Groove Armada Bedtime Story Mix,26,"artist:""Language Lab"" track:""Burning Disaster - Groove Armada Bedtime Story Mix"""
...,...,...,...,...,...,...
22,1996-10-14,Olive,0e7brL8ZZQ5Ej2EdqjejiP,This Time,11,"artist:""Olive"" track:""This Time"""
24,1996-02-23,Babble,3TVCez6L6UDJcxHDdTck7D,Into Ether,4,"artist:""Babble"" track:""Into Ether"""
0,1994,Babble,4yRmUfelrTjcuTEUVJVjYt,Take Me Away,7,"artist:""Babble"" track:""Take Me Away"""
33,2007,Smith & Mudd,5tksl9Z2lg267512BX6AFb,Blue River,9,"artist:""Smith & Mudd"" track:""Blue River"""


In [179]:
n_seeds = len(seed_audio_feats_df)
audio_feats_df = pd.concat([seed_audio_feats_df, rec_audio_feats_df], axis=0, ignore_index=True)
audio_feats_df.head()

Unnamed: 0,tempo,valence,liveness,instrumentalness,acousticness,speechiness,mode,loudness,key,energy,danceability
0,121.309,0.48,0.142,0.859,0.0627,0.0352,1,-7.861,8,0.713,0.298
1,120.268,0.17,0.153,0.189,0.0194,0.049,1,-5.07,8,0.835,0.383
2,123.627,0.4,0.129,0.561,0.00581,0.0295,0,-13.758,8,0.481,0.635
3,121.309,0.48,0.142,0.859,0.0627,0.0352,1,-7.861,8,0.713,0.298
4,105.675,0.352,0.0894,0.0238,0.0262,0.0265,0,-14.317,8,0.417,0.66


In [189]:
pd.DataFrame(np.array([pitch_class_vectors[k] for k in audio_feats_df['key'].tolist()]))

Unnamed: 0,0,1
0,-0.5,-0.866025
1,-0.5,-0.866025
2,-0.5,-0.866025
3,-0.5,-0.866025
4,-0.5,-0.866025
5,-0.5,-0.866025
6,-0.5,-0.866025
7,-0.5,-0.866025
8,-0.5,-0.866025
9,-0.5,-0.866025


In [181]:
audio_feats_df['key'].tolist()

[8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 7,
 8,
 8,
 8,
 8,
 8,
 9,
 8,
 8,
 8,
 8,
 8,
 2,
 8,
 8,
 8,
 8,
 8,
 5,
 8,
 5,
 8,
 8,
 8,
 9,
 8]

In [187]:
pitch_class_vectors[5]

array([ 0.8660254, -0.5      ])

In [None]:
# res_df = rec_df.iloc[[34],:].copy()
# res_df[['score']] = d[34]
# res_df.reset_index(drop=True, inplace=True)

top_sim_df = rec_df.iloc[top_sim_ix,:].copy()
top_sim_df[['score']] = top_sims

res_df = pd.concat([res_df, top_sim_df], axis=0, ignore_index=True)
with pd.option_context('display.max_colwidth', None): 
    # display(rec_df.sort_values('release_date', ascending=True))
    display(res_df)

In [None]:
res_df.drop_duplicates(subset=['id'], inplace=True, ignore_index=True)
with pd.option_context('display.max_colwidth', None): 
    display(res_df.sort_values('score', ascending=False))
    # display(res_df)

In [144]:
def circle_of_fifths_distance(pc1, pc2):
    # Define the circle of fifths order
    circle_of_fifths = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]
    
    # Get the positions of the pitch classes in the circle of fifths
    pos1 = circle_of_fifths.index(pc1)
    pos2 = circle_of_fifths.index(pc2)
    
    # Calculate the clockwise and counterclockwise distances
    clockwise_distance = (pos2 - pos1) % 12
    counterclockwise_distance = (pos1 - pos2) % 12
    
    # Return the minimum distance
    return min(clockwise_distance, counterclockwise_distance)

#(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) to (0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5)




In [157]:
def similarity_metric(distance):
    # Maximum possible steps in a 12-pitch system
    max_distance = 6
    # Calculate similarity (1 - normalized distance)
    similarity = 1 - (distance / max_distance)
    return similarity


# Example usage
pc1 = 8  # C
pc2 = 1 # G
distance = circle_of_fifths_distance(pc1, pc2)
print(f"The distance between pitch class {pc1} and {pc2} is {distance} steps.")


# Example: Compute the similarity between C (0) and G (7)
similarity = similarity_metric(distance)
print(f"Similarity between pitch class {pc1} and {pc2} is {similarity}")


The distance between pitch class 8 and 1 is 1 steps.
Similarity between pitch class 8 and 1 is 0.8333333333333334


In [163]:
import numpy as np
from numpy.linalg import norm

# Define the circle of fifths order
circle_of_fifths = [0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]

# Create a mapping from pitch classes to their positions in the circle of fifths
pitch_class_to_position = {pc: idx for idx, pc in enumerate(circle_of_fifths)}

# Number of pitch classes
n_pitch_classes = len(circle_of_fifths)

# Calculate the angle for each pitch class
angles = {pc: (2 * np.pi * idx) / n_pitch_classes for pc, idx in pitch_class_to_position.items()}

def angle_to_vector(angle):
    return np.array([np.cos(angle), np.sin(angle)])

# Create vectors for each pitch class
pitch_class_vectors = {pc: angle_to_vector(angle) for pc, angle in angles.items()}

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

#[0, 7, 2, 9, 4, 11, 6, 1, 8, 3, 10, 5]
# Example: Compute cosine similarity between C (0) and G (7)
pc1 = 8  # C
pc2 = 0  # G
vector1 = pitch_class_vectors[pc1]
vector2 = pitch_class_vectors[pc2]

similarity = cosine_similarity(vector1, vector2)
print(f"Cosine similarity between pitch class {pc1} and {pc2}: {similarity}")



Cosine similarity between pitch class 8 and 0: -0.5000000000000004


In [175]:
pitch_class_vectors

{0: array([1., 0.]),
 7: array([0.8660254, 0.5      ]),
 2: array([0.5      , 0.8660254]),
 9: array([6.123234e-17, 1.000000e+00]),
 4: array([-0.5      ,  0.8660254]),
 11: array([-0.8660254,  0.5      ]),
 6: array([-1.0000000e+00,  1.2246468e-16]),
 1: array([-0.8660254, -0.5      ]),
 8: array([-0.5      , -0.8660254]),
 3: array([-1.8369702e-16, -1.0000000e+00]),
 10: array([ 0.5      , -0.8660254]),
 5: array([ 0.8660254, -0.5      ])}