In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2 = reload all modules
# before executing each cell).
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Remember the directory the kernel started in. A bare os.chdir("../") climbs
# one level higher every time this cell is re-executed, which silently breaks
# the relative paths used by later cells (e.g. "cached_data/...").
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Always resolve the parent relative to the remembered start directory so the
# cell is idempotent: running it once or ten times ends in the same place.
os.chdir(os.path.join(NOTEBOOK_START_DIR, "../"))

In [3]:
import pickle as p

# np and csr_matrix are used directly in this notebook; import them explicitly
# instead of relying on them leaking out of the star imports below.
import numpy as np
from scipy.sparse import csr_matrix

from ease_recommender import *
from npmi_recommender import *

In [4]:
def create_mat(row, col, bool_to_int=True):
    """Build an int64 CSR matrix with an entry at every (row[i], col[i]) pair.

    Args:
        row: Row index of each entry.
        col: Column index of each entry, aligned with ``row``.
        bool_to_int: When true, the entries are created as booleans and only
            converted to int64 afterwards, so a (row, col) pair that occurs
            several times still yields 1. When false, the entries are int64
            ones from the start and repeated pairs add up to their count.

    Returns:
        A ``csr_matrix`` of dtype int64 whose shape is inferred from the
        indices.
    """
    # The value dtype decides how repeated (row, col) pairs are combined.
    value_dtype = bool if bool_to_int else np.int64
    values = np.ones_like(row, dtype=value_dtype)
    matrix = csr_matrix((values, (row, col)))

    if bool_to_int:
        matrix = matrix.astype(np.int64)
    return matrix

def check_if_all_terms_in_str(q, terms):
    """Tell whether every term in ``terms`` is contained in ``q``.

    Containment uses the ``in`` operator, so for strings it is a
    case-sensitive substring test. An empty ``terms`` yields True.
    """
    return all(term in q for term in terms)

def get_cat2idx(category_type, D):
    """Pick the name-to-index mapping for one category out of ``D``.

    Args:
        category_type: One of "track", "album" or "artist".
        D: Mapping that holds the "track2idx", "album2idx" and "artist2idx"
            entries.

    Returns:
        The entry of ``D`` that corresponds to ``category_type``.

    Raises:
        NotImplementedError: ``category_type`` is not one of the supported
            categories.
    """
    # (category label, key inside D) pairs, checked in order.
    supported = (
        ("track", "track2idx"),
        ("album", "album2idx"),
        ("artist", "artist2idx"),
    )
    for label, mapping_key in supported:
        if category_type == label:
            return D[mapping_key]

    raise NotImplementedError

def find_match_using_terms(terms, cat2idx):
    """Return the single key of ``cat2idx`` that contains every search term.

    Matching is a case-sensitive substring test: a key matches when each
    entry of ``terms`` occurs somewhere inside it.

    Args:
        terms: Iterable of substrings that must all appear in the key.
        cat2idx: Mapping whose keys are the candidate names.

    Returns:
        The one matching key.

    Raises:
        Exception: More than one key matches; the second exception argument
            lists the matching keys so the terms can be narrowed down.
        IndexError: No key matches; the second exception argument holds the
            terms that were searched for.
    """
    matches = [
        name
        for name in cat2idx.keys()
        if all(term in name for term in terms)
    ]

    if len(matches) > 1:
        raise Exception("Multiple matches found, filter down to a single match", matches)

    if not matches:
        # An empty result used to fall through to matches[0] and surface as a
        # bare "list index out of range". Keep the IndexError type for
        # compatibility, but say what actually went wrong.
        raise IndexError("No matches found, check the spelling and case of the terms", terms)

    return matches[0]

In [5]:
print("loading cache data...")
# NOTE: unpickling can execute arbitrary code, so only load cache files that
# were produced by this project's own preprocessing.
# The context manager closes the file handle as soon as the load finishes;
# the previous p.load(open(...)) form left it open.
with open("cached_data/spotify_preprocessed.p", "rb") as cache_file:
    D = p.load(cache_file)

print("building csr matrices...")

# TODO: finish implementing track and album level recommendations

# track_mat = create_mat(D["playlist_indices"], D["track_indices"])
# album_mat = create_mat(D["playlist_indices"], D["album_indices"])

# Rows are indexed by playlist, columns by artist; with create_mat's default
# bool_to_int=True a repeated (playlist, artist) pair still yields 1.
artist_mat = create_mat(D["playlist_indices"], D["artist_indices"])

print("done")

loading cache data...
building csr matrices...
done


In [6]:
# Artist name -> index mapping, plus its inverse for turning ranked indices
# back into readable names.
cat2idx = get_cat2idx("artist", D)
idx2cat = {index: name for name, index in cat2idx.items()}

In [19]:
# Pick two artists that you believe are similar; the next cell tunes lambda_ against this pair.
# Each lookup must resolve to exactly one name, so add a second term (e.g. part
# of the Spotify artist id) whenever the name alone matches several artists.

# Alternative choice for `a` (kept for experimentation):
# a = cat2idx[find_match_using_terms(["sgeir", "7xUZ4069zcyBM4Bn10NQ1c"], cat2idx)]
a = cat2idx[find_match_using_terms(["Fleet Foxes"], cat2idx)]

# Alternative choices for `b` are the commented-out lines around the active one:
# b = cat2idx[find_match_using_terms(["Fleet Foxes"], cat2idx)]
b = cat2idx[find_match_using_terms(["Bon Iver", "4LEiUm1SRbFMgfqnQTwUbQ"], cat2idx)]
# b = cat2idx[find_match_using_terms(["SOHN"], cat2idx)]

In [20]:
# Tune lambda_ for the reference pair (a, b). The helper is not defined in this
# notebook (it arrives via the star imports) and prints each lambda_/error
# value it evaluates, as the cell output shows.
# NOTE(review): presumably the error measures how far b ranks from the top of
# a's matches - confirm against the helper's implementation.
lambda_ = optimize_lambda_using_a_to_b_matching_npmi(artist_mat, a, b)

lambda_: 0.0
error: 12.0
lambda_: 1.0
error: 15
lambda_: -1.618034
error: 63543
lambda_: -0.618033974844
error: 61721
lambda_: 0.381966
error: 11.381966
lambda_: 0.19099110267712452
error: 8.190991102677124
lambda_: 0.19941405018599068
error: 8.199414050185991
lambda_: 0.18939210311958657
error: 8.189392103119587
lambda_: 0.11705075905941056
error: 8.11705075905941
lambda_: 0.07234134882452375
error: 8.072341348824525
lambda_: 0.04470941317941571
error: 10.044709413179415
lambda_: 0.08941882341430253
error: 8.089418823414302
lambda_: 0.061786888893904414
error: 9.061786888893904
lambda_: 0.0788643634836832
error: 8.078864363483683
lambda_: 0.0683099039826648
error: 9.068309903982664
lambda_: 0.07483291864182424
error: 8.074832918641825
lambda_: 0.07080147396405825
error: 8.070801473964059
lambda_: 0.06984977894454532
error: 9.069849778944546
lambda_: 0.07138965380501082
error: 8.07138965380501
lambda_: 0.07043795882423498
error: 8.070437958824234
lambda_: 0.07021329410830944
error: 8.0

In [21]:
# Number of best-scoring artists kept by each ranking cell below.
top_k = 20

In [22]:
# Popularity-weighted normalized pointwise mutual information, queried for artist `a`.

similarity_scores = npmi_batch_popularity_weighted(artist_mat, a, lambda_)

# Negating the scores makes argsort list the highest-scoring indices first.
top_k_matches = [
    idx2cat[artist_idx]
    for artist_idx in np.argsort(-similarity_scores)[:top_k].tolist()
]

top_k_matches

['Bon Iver (spotify:artist:4LEiUm1SRbFMgfqnQTwUbQ)',
 'Iron & Wine (spotify:artist:4M5nCE77Qaxayuhp3fVn4V)',
 'The Shins (spotify:artist:4LG4Bs1Gadht7TCrMytQUO)',
 'The Head and the Heart (spotify:artist:0n94vC3S9c3mb2HyNAOcjg)',
 'Sufjan Stevens (spotify:artist:4MXUO7sVCaFgFjoTI5ox5c)',
 'The Tallest Man On Earth (spotify:artist:2BpAc5eK7Rz5GAwSp9UYXa)',
 'José González (spotify:artist:6xrCU6zdcSTsG2hLrojpmI)',
 'Gregory Alan Isakov (spotify:artist:5sXaGoRLSpd7VeyZrLkKwt)',
 'Grizzly Bear (spotify:artist:2Jv5eshHtLycR6R8KQCdc4)',
 'The Middle East (spotify:artist:6imbHAlhHrFwtsOgqpeBK2)',
 'The Avett Brothers (spotify:artist:196lKsA13K3keVXMDFK66q)',
 'Radical Face (spotify:artist:5EM6xJN2QNk0cL7EEm9HR9)',
 'Band of Horses (spotify:artist:0OdUWJ0sBjDrqHygGUXeCF)',
 'Beirut (spotify:artist:6pmxr66tMAePxzOLfjGNcX)',
 'Ben Howard (spotify:artist:5schNIzWdI9gJ1QRK8SBnc)',
 'Local Natives (spotify:artist:75dQReiBOHN37fQgWQrIAJ)',
 'Andrew Bird (spotify:artist:4uSftVc3FPWe6RJuMZNEe9)',
 'Th

In [23]:
# Popularity-weighted normalized pointwise mutual information, queried for artist `b`.

similarity_scores = npmi_batch_popularity_weighted(artist_mat, b, lambda_)

# Negating the scores makes argsort list the highest-scoring indices first.
top_k_matches = [
    idx2cat[artist_idx]
    for artist_idx in np.argsort(-similarity_scores)[:top_k].tolist()
]

top_k_matches

['Ben Howard (spotify:artist:5schNIzWdI9gJ1QRK8SBnc)',
 'Iron & Wine (spotify:artist:4M5nCE77Qaxayuhp3fVn4V)',
 'The Head and the Heart (spotify:artist:0n94vC3S9c3mb2HyNAOcjg)',
 'José González (spotify:artist:6xrCU6zdcSTsG2hLrojpmI)',
 'The Lumineers (spotify:artist:16oZKvXb6WkQlVAjwo2Wbg)',
 'City and Colour (spotify:artist:74gcBzlQza1bSfob90yRhR)',
 'James Vincent McMorrow (spotify:artist:7FDlvgcodNfC0IBdWevl4u)',
 'The Paper Kites (spotify:artist:79hrYiudVcFyyxyJW0ipTy)',
 'Fleet Foxes (spotify:artist:4EVpmkEwrLYEg6jIsiPMIb)',
 'Daughter (spotify:artist:46CitWgnWrvF9t70C2p1Me)',
 'Gregory Alan Isakov (spotify:artist:5sXaGoRLSpd7VeyZrLkKwt)',
 'Band of Horses (spotify:artist:0OdUWJ0sBjDrqHygGUXeCF)',
 'Benjamin Francis Leftwich (spotify:artist:7D5oTJSXSHf51auG0106CQ)',
 'The Middle East (spotify:artist:6imbHAlhHrFwtsOgqpeBK2)',
 'Kodaline (spotify:artist:4BxCuXFJrSWGi1KHcVqaU4)',
 'Angus & Julia Stone (spotify:artist:4tvKz56Tr39bkhcQUTO0Xr)',
 'Sufjan Stevens (spotify:artist:4MXUO7s