## ALS Implicit Collaborative Filtering - binary ratings

https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

In [1]:
import import_ipynb

In [2]:
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision
from time import time

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


In [3]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit

from sklearn.preprocessing import MinMaxScaler
from scipy.sparse.linalg import spsolve

# Recommendation and evaluation functions

In [18]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

def similar_items(seed_track, top_n):
    """Return the URIs of the tracks the ALS model ranks closest to a seed track.

    Parameters
    ----------
    seed_track : str
        Track URI, used as a key into ``D_track_id``. A URI that is not in
        that dictionary raises ``KeyError``.
    top_n : int
        Number of neighbours requested from ``model.similar_items``.

    Returns
    -------
    list
        One track URI per ``(item_id, score)`` pair produced by the model,
        in the model's order. The scores are discarded.
    """
    track_id = D_track_id[seed_track]

    # Use implicit to get the most similar items as (item_id, score) pairs.
    similar = model.similar_items(track_id, top_n)

    similar_uris = []
    for idx, _score in similar:
        # Translate the numeric item id back to its URI. The boolean mask
        # scans the whole frame, so do it only once per item.
        track_uri = data.track_uri.loc[data.track_uri_id == idx].iloc[0]
        similar_uris.append(track_uri)
    return similar_uris

In [20]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def create_recs(pid,N):
    """Return the track URIs of the top-N ALS recommendations for one playlist.

    Parameters
    ----------
    pid : int
        Passed unchanged to ``model.recommend`` as the user index into
        ``sparse_user_item``.
        NOTE(review): the rows of ``sparse_user_item`` are built from the
        ``pid_id`` category codes, not from raw ``pid`` values -- confirm
        that callers pass an index that matches.
    N : int
        Number of recommendations to request.

    Returns
    -------
    list
        Track URIs in the order returned by the model; scores are discarded.
    """
    # Use the implicit recommender; it yields (item_id, score) pairs.
    recommended = model.recommend(pid, sparse_user_item, N=N)

    # Translate each numeric item id back to its track URI.
    return [
        data.track_uri.loc[data.track_uri_id == idx].iloc[0]
        for idx, _score in recommended
    ]

In [22]:
#----------------------------------------------
# CREATE USER RECOMMENDATIONS WITH DESCRIPTION
#----------------------------------------------

def create_recs_with_description(pid,N):
    """Return the top-N ALS recommendations for one playlist as a DataFrame.

    Parameters
    ----------
    pid : int
        Passed unchanged to ``model.recommend`` as the user index into
        ``sparse_user_item``.
        NOTE(review): the rows of ``sparse_user_item`` are built from the
        ``pid_id`` category codes -- confirm that callers pass a matching
        index.
    N : int
        Number of recommendations to request.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_uris``, ``score`` and ``description`` (the
        ``D_desc`` entry for each URI), one row per recommendation in the
        model's order. A URI missing from ``D_desc`` raises ``KeyError``.
    """
    # Use the implicit recommender; it yields (item_id, score) pairs.
    recommended = model.recommend(pid, sparse_user_item, N=N)

    tracks = []
    scores = []
    desc = []

    for idx, score in recommended:
        # Resolve the numeric item id to its URI once and reuse it for the
        # description lookup; the mask scans the whole frame.
        track_uri = data.track_uri.loc[data.track_uri_id == idx].iloc[0]
        tracks.append(track_uri)
        scores.append(score)
        desc.append(D_desc[track_uri])

    return pd.DataFrame({'track_uris': tracks, 'score': scores, 'description': desc})

In [26]:
#----------------------------------
# GET RECOMMENDATIONS AND EVALUATE
#----------------------------------

def als_predict_and_evaluate_top_n(pid, top_n=100):
    """Recommend tracks for one playlist and score them against its hold-out set.

    Parameters
    ----------
    pid : int
        Playlist id. It is forwarded to ``create_recs`` and also matched
        against the first column of ``ev_set_arr``.
    top_n : int, default 100
        Number of recommendations to request.

    Returns
    -------
    tuple
        ``(L_pred, ground_truth, R_Prec, NDCG, res)`` where

        * ``L_pred``       -- predicted track URIs from ``create_recs``;
        * ``ground_truth`` -- second column of the ``ev_set_arr`` rows whose
          first column equals ``pid``;
        * ``R_Prec``       -- ``R_Precision`` of the first
          ``len(ground_truth)`` predictions against ``ground_truth``;
        * ``NDCG``         -- element ``[1]`` of ``nDCG(res)``;
        * ``res``          -- list of 0/1 flags, one per prediction, set to 1
          when the prediction appears in ``ground_truth``.
    """
    L_pred = create_recs(pid, top_n)

    ground_truth = ev_set_arr[ev_set_arr[:, 0] == pid][:, 1]

    R_Prec = R_Precision(L_pred[:len(ground_truth)], ground_truth)

    # Build the lookup once: testing membership directly against the array
    # rescans it for every prediction.
    truth_lookup = set(ground_truth)
    res = [int(el in truth_lookup) for el in L_pred]

    NDCG = nDCG(res)[1]

    return L_pred, ground_truth, R_Prec, NDCG, res

In [27]:
#-----------------------------------
# SAVE R-PRECISION AND NDCG BY PID
#-----------------------------------

def save_als_res_k_n(n = 10, top_n=20):
    """Evaluate random playlists and save R-Precision / nDCG per pid to CSV.

    Parameters
    ----------
    n : int, default 10
        Number of playlists drawn without replacement from
        ``evaluation_pids``. ``random.sample`` raises ``ValueError`` when
        ``n`` exceeds the number of available pids.
    top_n : int, default 20
        Number of recommendations requested for each playlist.

    Returns
    -------
    dict
        ``{pid: [R-Precision, nDCG]}`` for every sampled playlist.

    Side effects
    ------------
    Prints the loop index and the elapsed seconds after each playlist, and
    writes ``../evaluation/ALS_bin_topn_{top_n}_{n}.csv`` with columns
    ``pid``, ``R-Precision``, ``nDCG``, ``rating`` and ``model``.
    """
    time0 = time()
    RES = {}
    for i, pid in enumerate(random.sample(evaluation_pids, n)):
        predictions = als_predict_and_evaluate_top_n(pid, top_n)
        # Keep only the two metrics (positions 2 and 3 of the result tuple).
        RES[pid] = [predictions[2], predictions[3]]
        print(i)
        print(time() - time0)

    # Build the frame from one record per pid; unlike transposing a
    # dict-of-lists and renaming columns, this also works when RES is empty.
    df = pd.DataFrame(
        [(pid, r_prec, ndcg) for pid, (r_prec, ndcg) in RES.items()],
        columns=['pid', 'R-Precision', 'nDCG'],
    )
    df['rating'] = 'bin'
    df['model'] = 'ALS'
    df.to_csv(f'../evaluation/ALS_bin_topn_{top_n}_{n}.csv', index=False)
    print(time() - time0)
    return RES

# Load data

In [4]:
# Path of the training CSV read with pd.read_csv in the next cell.
file_path = '../data-processed/full-data/pid-track-binary-rating-train-data.csv'

In [5]:
# Read the training CSV at file_path into a DataFrame and preview its first rows.
raw_data = pd.read_csv(file_path)
raw_data.head()

Unnamed: 0,pid,track_uri,binary_rating
0,491000,spotify:track:3giQ7393501IRNrd8iHugf,1
1,491000,spotify:track:3jpcVaeyNjWgjqIxAiWasz,1
2,491000,spotify:track:1uuqRaSJAiQ6VB8BWblXWJ,1
3,491000,spotify:track:7gXpcXwtmEiQzskYJmtGgk,1
4,491000,spotify:track:5wtIWwOtowY2howCZ7Veq2,1


In [6]:
# Rename the columns to the names used in the rest of the notebook.
raw_data.columns = ['pid', 'track_uri', 'rating']
# `data` is a second name for the same DataFrame (no copy is made), so the
# columns added to `data` below are added to `raw_data` as well.
data = raw_data

In [8]:
# Create numeric playlist (pid_id) and track (track_uri_id) columns by
# converting pid and track_uri to categoricals and taking their category codes.
data['pid'] = data['pid'].astype("category")
data['track_uri'] = data['track_uri'].astype("category")
data['pid_id'] = data['pid'].cat.codes
data['track_uri_id'] = data['track_uri'].cat.codes
data.head()

In [10]:
# Dictionary from track URI to its numeric track_uri_id; similar_items uses it
# to translate a seed URI into the item id passed to the model.
D_track_id = data.groupby('track_uri')['track_uri_id'].min().to_dict()

# Build ALS model

In [11]:
# The implicit library expects data as an item-user matrix, so we
# create two matrices: one for fitting the model (item-user, rows are
# track_uri_id) and one for recommendations (user-item, rows are pid_id).
sparse_item_user = sparse.csr_matrix((data['rating'].astype(float), (data['track_uri_id'], data['pid_id'])))
sparse_user_item = sparse.csr_matrix((data['rating'].astype(float), (data['pid_id'], data['track_uri_id'])))

In [12]:
# Initialize the ALS model (20 factors, regularization 0.1, 20 iterations).
# This only constructs it; fitting is done by the separate model.fit call.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)



In [13]:
# Calculate the confidence matrix by multiplying the item-user ratings by
# alpha and casting to double; this is the matrix passed to model.fit.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

In [14]:
# Fit the model on the alpha-scaled item-user confidence matrix.
model.fit(data_conf)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




# Load dictionary with tracks

In [15]:
import json

# Load the track-URI -> description dictionary. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open('../data-processed/full-data/track_descriptions.json', encoding='utf-8') as json_file:
    D_desc = json.load(json_file)

# Show the description stored for one example track.
D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

# Find similar items

In [19]:
# Ask the model for the 5 tracks most similar to the example seed track.
similar_items('spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',5)

['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
 'spotify:track:3jagJCUbdqhDSPuxP8cAqF',
 'spotify:track:6zsk6uF3MxfIeHPlubKBvR',
 'spotify:track:04KTF78FFg8sOHC1BADqbY',
 'spotify:track:7nR3jmYBf3e9TEf6zpJiZb']

# Create recommendations 

In [21]:
pid = 1000
rec = create_recs(pid, 100)
# Track URIs in this playlist's training rows. The filter must be on the
# pid column: comparing track_uri with a pid never matches.
# NOTE(review): create_recs uses pid as the row index of sparse_user_item,
# which is built from pid_id codes -- confirm both refer to the same playlist.
train = set(data.loc[data['pid'] == pid, 'track_uri'])
# Count recommendations already present in the training data. Membership
# is tested against the URI values, not against DataFrame column labels.
sum(r in train for r in rec)

0

In [None]:
# The first argument is a playlist id (it is passed to model.recommend as the
# user index), not a track URI; reuse the playlist id set in the previous cell.
create_recs_with_description(pid, 3)

# Evaluation

In [23]:
# Load the evaluation playlists with their ground truth; the hold_out column
# is used below to select the tracks the model has to predict.
evaluation_set = pd.read_csv('../data-processed/full-data/evaluation-pids-ground-truth.csv')
evaluation_set.head()

Unnamed: 0,pid,pos,track_uri,hold_out,seed_pattern
0,491004,0,spotify:track:5dNfHmqgr128gMY2tc5CeJ,0,first n
1,491004,1,spotify:track:557un1HgwYMuqfWGSTmnxw,0,first n
2,491004,2,spotify:track:6Ms01Gqi8gVBs14YrNUlVZ,0,first n
3,491004,3,spotify:track:7J41dYQolQJEtj3UmKLu5r,0,first n
4,491004,4,spotify:track:6LGwYMXXgURfaequXipzHx,0,first n


In [24]:
# Keep only the held-out rows and the columns needed for evaluation.
ev_set = evaluation_set[evaluation_set['hold_out'] == 1][['pid','track_uri','hold_out']]
# Drop rows with missing values. Indexing with a boolean DataFrame mask only
# blanks individual cells and leaves the rows in place.
ev_set = ev_set.dropna()

# Array form used by als_predict_and_evaluate_top_n: column 0 is the pid,
# column 1 the track URI.
ev_set_arr = ev_set.to_numpy()

# Playlists that can be sampled for evaluation.
evaluation_pids = list(ev_set.pid.unique())

ev_set.head()

Unnamed: 0,pid,track_uri,hold_out
38,491004,spotify:track:7i7UIbm5E0DD7aSOYvwp2v,1
39,491004,spotify:track:0FZvjrHpAmLKj574M4VwrF,1
40,491004,spotify:track:0rE8OLQpoOFe2xa35twgve,1
41,491004,spotify:track:4rY3KONqdimczgu2NhxI8k,1
42,491004,spotify:track:6DkXLzBQT7cwXmTyzAB1DJ,1


In [None]:
# Evaluate 3000 randomly sampled playlists with 500 recommendations each and
# write the per-playlist metrics to ../evaluation/ALS_bin_topn_500_3000.csv.
save_als_res_k_n(3000,500)

0
32.06257152557373
