In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Switch the working directory to the parent folder; later cells use paths
# such as "cached_data/..." relative to it.
# NOTE(review): a relative chdir is not idempotent -- re-running this cell moves
# up one more level each time. Run it exactly once per kernel session.
os.chdir("../")

In [72]:
import pickle as p

from ease_recommender import *
from npmi_recommender import *

from scipy.optimize import minimize_scalar, minimize

In [4]:
def create_mat(row, col, bool_to_int=True):
    """Build an int64 CSR matrix with an entry for every (row, col) pair.

    Args:
        row: Row index of each entry.
        col: Column index of each entry (same length as ``row``).
        bool_to_int: When true, entries are first stored as booleans, so a
            (row, col) pair that appears several times collapses to a single 1
            (presence/absence weighting). When false, entries are int64 ones,
            so repeated pairs add up to a count.

    Returns:
        A ``csr_matrix`` of dtype int64; no explicit shape is passed, so the
        shape is left to ``csr_matrix`` to infer from the indices.
    """
    entry_dtype = bool if bool_to_int else np.int64
    entries = np.ones_like(row, dtype=entry_dtype)
    matrix = csr_matrix((entries, (row, col)))

    # Only the boolean variant needs converting back to integers.
    return matrix.astype(np.int64) if bool_to_int else matrix

def check_if_all_terms_in_str(q, terms):
    """Return True when every element of ``terms`` is contained in ``q``.

    Containment uses the ``in`` operator, so for strings this is a
    case-sensitive substring test. An empty ``terms`` yields True.
    """
    return all(term in q for term in terms)

def get_cat2idx(category_type, D):
    """Fetch the name-to-index mapping for one category from ``D``.

    Args:
        category_type: One of ``"track"``, ``"album"`` or ``"artist"``.
        D: Mapping holding the ``"track2idx"``, ``"album2idx"`` and
            ``"artist2idx"`` entries.

    Returns:
        The object stored in ``D`` under the key for that category.

    Raises:
        NotImplementedError: For any other ``category_type``.
    """
    key_by_category = (
        ("track", "track2idx"),
        ("album", "album2idx"),
        ("artist", "artist2idx"),
    )
    for category, key in key_by_category:
        if category_type == category:
            return D[key]

    raise NotImplementedError

def find_match_using_terms(terms, cat2idx):
    """Return the single key of ``cat2idx`` that contains every search term.

    Args:
        terms: Substrings that must all occur (case-sensitively) in the key.
        cat2idx: Mapping whose keys are the candidate names.

    Returns:
        The one matching key.

    Raises:
        IndexError: If no key contains all terms. The type matches what the
            previous ``matches[0]`` lookup raised, but now carries a message.
        Exception: If more than one key matches; the matches are attached so
            the caller can add terms to narrow the search.
    """
    matches = [
        name for name in cat2idx.keys()
        if all(term in name for term in terms)
    ]

    if len(matches) > 1:
        raise Exception("Multiple matches found, filter down to a single match", matches)

    if not matches:
        # Fail with an explanation instead of a bare "list index out of range".
        raise IndexError("No match found for the given terms", terms)

    return matches[0]

In [82]:
def a_to_b_error_metric(mat, a, b, lambda_, verbose=True):
    """Measure how highly EASE ranks ``a`` and ``b`` against each other.

    Scores are computed for ``a`` and for ``b`` with
    ``calculate_ease_for_item_cg``, items are ordered by descending score, and
    the position of ``b`` in ``a``'s ordering is averaged with the position of
    ``a`` in ``b``'s ordering.

    Args:
        mat: Matrix passed to ``calculate_ease_for_item_cg``. Previously this
            argument was ignored and the global ``artist_mat`` was used.
        a: Index of the first item.
        b: Index of the second item.
        lambda_: Value forwarded as the third argument of
            ``calculate_ease_for_item_cg`` (presumably the EASE regularisation
            strength -- confirm against ``ease_recommender``).
        verbose: When true, print ``lambda_`` and the error.

    Returns:
        float: Mean of the two 0-based rank positions; lower is better.

    Raises:
        ValueError: If ``a`` or ``b`` is missing from the other's ranking.
    """
    # assumes the scorer returns a 1-D score array indexed by item -- TODO confirm
    similarity_scores_a = calculate_ease_for_item_cg(mat, a, lambda_)
    ranking_a = np.argsort(-similarity_scores_a)

    similarity_scores_b = calculate_ease_for_item_cg(mat, b, lambda_)
    ranking_b = np.argsort(-similarity_scores_b)

    # lower is better
    error = (ranking_a.tolist().index(b) + ranking_b.tolist().index(a)) / 2

    if verbose:
        print("lambda_:", lambda_)
        print("error:", error)

    return error

In [83]:
def optimize_lambda_using_a_to_b_matching(mat, a, b, fast_approximation=True):
    """Search for the ``lambda_`` that minimises ``a_to_b_error_metric``.

    A coarse search starts at 1,000,000 and divides by ten until the error
    stops improving. The last value that improved the error is then either
    returned directly or used as the midpoint of a bracket for
    ``minimize_scalar``.

    Args:
        mat: Matrix forwarded to ``a_to_b_error_metric``.
        a: Index of the first reference item.
        b: Index of the second reference item.
        fast_approximation: When true, return the best power-of-ten candidate;
            otherwise refine it with ``scipy.optimize.minimize_scalar``.

    Returns:
        int: The selected ``lambda_``, rounded.
    """
    lambda_ = 1000000
    lowest_error = np.inf

    # Coarse search: keep shrinking lambda_ while the error strictly improves.
    while True:
        current_error = a_to_b_error_metric(mat, a, b, lambda_)
        if current_error >= lowest_error:
            break
        lowest_error, lambda_ = current_error, lambda_ / 10

    # lambda_ is now the first candidate that failed to improve; the best
    # candidate seen is one decade above it.
    if fast_approximation:
        return round(lambda_*10)

    def objective(candidate):
        return a_to_b_error_metric(mat, a, b, candidate)

    # NOTE(review): minimize_scalar presumably needs the middle bracket point
    # to score strictly lower than both ends; a tie in the coarse search, or an
    # upper end that was never evaluated, may violate that -- confirm.
    result = minimize_scalar(objective,
                             bracket=(lambda_, lambda_*10, lambda_*100))

    return round(result.x)

In [93]:
def a_to_b_error_metric_npmi(mat, a, b, temp, verbose=True):
    """Measure how highly ``npmi_batch`` ranks ``a`` and ``b`` against each other.

    Scores are computed for ``a`` and for ``b`` with ``npmi_batch``, items are
    ordered by descending score, and the position of ``b`` in ``a``'s ordering
    is averaged with the position of ``a`` in ``b``'s ordering.

    Args:
        mat: Matrix passed to ``npmi_batch``. Previously this argument was
            ignored and the global ``artist_mat`` was used.
        a: Index of the first item.
        b: Index of the second item.
        temp: Value forwarded as the third argument of ``npmi_batch``
            (presumably a temperature -- confirm against ``npmi_recommender``).
        verbose: When true, print ``temp`` and the error.

    Returns:
        float: Mean of the two 0-based rank positions; lower is better.

    Raises:
        ValueError: If ``a`` or ``b`` is missing from the other's ranking.
    """
    # assumes npmi_batch returns a 1-D score array indexed by item -- TODO confirm
    similarity_scores_a = npmi_batch(mat, a, temp)
    ranking_a = np.argsort(-similarity_scores_a)

    similarity_scores_b = npmi_batch(mat, b, temp)
    ranking_b = np.argsort(-similarity_scores_b)

    # lower is better
    error = (ranking_a.tolist().index(b) + ranking_b.tolist().index(a)) / 2

    if verbose:
        print("temp:", temp)
        print("error:", error)

    return error

In [8]:
print("loading cache data...")
# NOTE(review): unpickling can execute arbitrary code -- only load cache files
# that this project produced itself.
# The context manager closes the file handle once loading finishes.
with open("cached_data/spotify_preprocessed.p", "rb") as cache_file:
    D = p.load(cache_file)

print("building csr matrices...")
# Only the artist matrix is used below; the track and album variants are kept
# for reference.
# track_mat = create_mat(D["playlist_indices"], D["track_indices"])
# album_mat = create_mat(D["playlist_indices"], D["album_indices"])
artist_mat = create_mat(D["playlist_indices"], D["artist_indices"])

print("done")

loading cache data...
building csr matrices...
done


In [9]:
# Artist name -> index mapping, plus the inverse used to turn ranked indices
# back into readable names.
cat2idx = get_cat2idx("artist", D)
idx2cat = {index: name for name, index in cat2idx.items()}

In [31]:
# Pick two items you believe are similar; lambda_ is tuned so that each one
# ranks as high as possible in the other's recommendations.
# Add extra terms (e.g. part of the Spotify id) when a name alone matches more
# than one entry -- find_match_using_terms raises on multiple matches.

# Alternative choice for `a`:
# a = cat2idx[find_match_using_terms(["sgeir", "7xUZ4069zcyBM4Bn10NQ1c"], cat2idx)]
a = cat2idx[find_match_using_terms(["Fleet Foxes"], cat2idx)]

# Alternative choices for `b` are left commented out around the active one:
# b = cat2idx[find_match_using_terms(["Fleet Foxes"], cat2idx)]
b = cat2idx[find_match_using_terms(["Bon Iver", "4LEiUm1SRbFMgfqnQTwUbQ"], cat2idx)]
# b = cat2idx[find_match_using_terms(["SOHN"], cat2idx)]

In [59]:
# Tune lambda_ on the (a, b) pair; with the default fast_approximation=True only
# the coarse power-of-ten search runs. The result is reused by the EASE cell below.
lambda_ = optimize_lambda_using_a_to_b_matching(artist_mat, a, b)

lambda_: 1000000
error: 9.5
lambda_: 100000.0
error: 4.5
lambda_: 10000.0
error: 3.0
lambda_: 1000.0
error: 11.5
lambda_: 7639.320225002102
error: 3.0
lambda_: 12360.679774997896
error: 3.0
lambda_: 15278.640450004204
error: 4.0
lambda_: 10000.0
error: 3.0
lambda_: 9098.300562505257
error: 3.5
lambda_: 10901.699437494743
error: 3.0
lambda_: 11458.980337503153
error: 3.0


KeyboardInterrupt: 

In [144]:
# Same mutual-rank error for the NPMI scorer at temp=1, for comparison with the
# EASE errors printed during the lambda_ search.
a_to_b_error_metric_npmi(artist_mat, a, b, temp=1)

temp: 1
error: 7.0


7.0

In [147]:
# Number of highest-scoring items listed by the recommendation cells below.
top_k = 20

In [159]:
# Recommendations for item `a` using normalized pointwise mutual information
# (third argument fixed at 1, as in the error check above).

similarity_scores = npmi_batch(artist_mat, a, 1)

# Highest score first; translate the leading top_k indices back to names.
top_k_matches = [
    idx2cat[item_idx]
    for item_idx in np.argsort(-similarity_scores)[:top_k].tolist()
]

top_k_matches

['Iron & Wine (spotify:artist:4M5nCE77Qaxayuhp3fVn4V)',
 'The Shins (spotify:artist:4LG4Bs1Gadht7TCrMytQUO)',
 'Sufjan Stevens (spotify:artist:4MXUO7sVCaFgFjoTI5ox5c)',
 'The Tallest Man On Earth (spotify:artist:2BpAc5eK7Rz5GAwSp9UYXa)',
 'Bon Iver (spotify:artist:4LEiUm1SRbFMgfqnQTwUbQ)',
 'The Head and the Heart (spotify:artist:0n94vC3S9c3mb2HyNAOcjg)',
 'The Middle East (spotify:artist:6imbHAlhHrFwtsOgqpeBK2)',
 'Gregory Alan Isakov (spotify:artist:5sXaGoRLSpd7VeyZrLkKwt)',
 'Radical Face (spotify:artist:5EM6xJN2QNk0cL7EEm9HR9)',
 'Beirut (spotify:artist:6pmxr66tMAePxzOLfjGNcX)',
 'José González (spotify:artist:6xrCU6zdcSTsG2hLrojpmI)',
 'Grizzly Bear (spotify:artist:2Jv5eshHtLycR6R8KQCdc4)',
 'Andrew Bird (spotify:artist:4uSftVc3FPWe6RJuMZNEe9)',
 'The Avett Brothers (spotify:artist:196lKsA13K3keVXMDFK66q)',
 'Local Natives (spotify:artist:75dQReiBOHN37fQgWQrIAJ)',
 'Band of Horses (spotify:artist:0OdUWJ0sBjDrqHygGUXeCF)',
 'Blind Pilot (spotify:artist:6qiGjRyN7TJ1GA2nXF68Hi)',
 'B

In [160]:
# Recommendations for item `a` using EASE with the lambda_ tuned earlier.

similarity_scores = calculate_ease_for_item_cg(artist_mat, a, lambda_)

# Highest score first; translate the leading top_k indices back to names.
top_k_matches = [
    idx2cat[item_idx]
    for item_idx in np.argsort(-similarity_scores)[:top_k].tolist()
]

top_k_matches

['The Shins (spotify:artist:4LG4Bs1Gadht7TCrMytQUO)',
 'Sufjan Stevens (spotify:artist:4MXUO7sVCaFgFjoTI5ox5c)',
 'Iron & Wine (spotify:artist:4M5nCE77Qaxayuhp3fVn4V)',
 'Bon Iver (spotify:artist:4LEiUm1SRbFMgfqnQTwUbQ)',
 'Grizzly Bear (spotify:artist:2Jv5eshHtLycR6R8KQCdc4)',
 'The Tallest Man On Earth (spotify:artist:2BpAc5eK7Rz5GAwSp9UYXa)',
 'The Head and the Heart (spotify:artist:0n94vC3S9c3mb2HyNAOcjg)',
 'Beirut (spotify:artist:6pmxr66tMAePxzOLfjGNcX)',
 'Local Natives (spotify:artist:75dQReiBOHN37fQgWQrIAJ)',
 'Father John Misty (spotify:artist:2kGBy2WHvF0VdZyqiVCkDT)',
 'Andrew Bird (spotify:artist:4uSftVc3FPWe6RJuMZNEe9)',
 'Band of Horses (spotify:artist:0OdUWJ0sBjDrqHygGUXeCF)',
 'Radical Face (spotify:artist:5EM6xJN2QNk0cL7EEm9HR9)',
 'The Avett Brothers (spotify:artist:196lKsA13K3keVXMDFK66q)',
 'First Aid Kit (spotify:artist:21egYD1eInY6bGFcniCRT1)',
 'Arcade Fire (spotify:artist:3kjuyTCjPG1WMFCiyc5IuB)',
 'Neutral Milk Hotel (spotify:artist:2ooIqOf4X2uz4mMptXCtie)',
 '