In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Read the Last.fm play-count TSV, discard its second column and give the
# three remaining columns short names.
# NOTE(review): read_table treats the first line of the file as column names by
# default; if this TSV has no header row, that first record is lost -- confirm.
raw_data = pd.read_table(r"F:\Data_Repository\lastfm\usersha1-artmbid-artname-plays.tsv")
raw_data = raw_data.drop(columns=raw_data.columns[1])
raw_data.columns = ['user', 'artist', 'plays']

In [2]:
# Restrict the analysis to the first 2,000,000 rows of the raw data
raw_data1 = raw_data.iloc[:2000000]

In [3]:
# Count the distinct values in each column; unique() keeps NaN as a value,
# so a missing artist name (if any) is included in the artist count.
user_count = len(raw_data1['user'].unique())
artist_count = len(raw_data1['artist'].unique())
print('Total number of users in the data:', user_count)
print('Total number of artists in the data:', artist_count)

Total number of users in the data: 40913
Total number of artists in the data: 110821


There are about 204 nulls in the artist column. Let's drop the rows containing nulls from the dataset.

In [4]:
# Drop every row that contains a missing value; the explicit copy makes `data`
# an independent frame that later cells can add columns to.
data = raw_data1.dropna().copy()

In [5]:
# Store users and artists as pandas categoricals so that every long hash /
# name string is represented by a compact integer code.
user_cat = data['user'].astype("category")
artist_cat = data['artist'].astype("category")
data['user'], data['artist'] = user_cat, artist_cat

# cat.codes is the integer id pandas assigned to each category
data['user_id'] = user_cat.cat.codes
data['artist_id'] = artist_cat.cat.codes

# Two orientations of the same play-count data are built: item-user
# (rows = artist_id, columns = user_id), which is the one passed on to
# model fitting, and user-item (rows = user_id, columns = artist_id).
play_counts = data['plays'].astype(float)

sparse_item_user = sparse.csr_matrix((play_counts, (data['artist_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((play_counts, (data['user_id'], data['artist_id'])))

In [15]:
# Rows are indexed by artist_id and columns by user_id, so this is (artists, users)
sparse_item_user.shape

(110820, 40913)

In [6]:
# Percentage of the user x artist grid that holds no recorded interaction
n_rows, n_cols = sparse_user_item.shape
matrix_size = n_rows * n_cols  # number of possible interactions in the matrix
num_purchases = int(sparse_user_item.count_nonzero())  # number of non-zero (observed) interactions
sparsity = 100 * (1 - num_purchases / matrix_size)
sparsity

99.95588936009682

We have very high sparsity in our data, which might not lead to favorable results in the end. One thing we can be fairly sure of, however, is finding similar artists from this data, since each artist has enough listeners to learn from. Recommendations to individual users might be weaker than expected.

### Creating train and test data

In [7]:
import random

def make_train(ratings, pct_test = 0.2, seed = 0):
    """Split an interaction matrix into a masked training set and a binary test set.

    A fraction ``pct_test`` of the non-zero entries of ``ratings`` is picked at
    random and set to zero in a copy that becomes the training set. The test
    set is a copy of the complete original matrix in which every non-zero
    entry is replaced by 1. ``ratings`` itself is not modified.

    Parameters
    ----------
    ratings : scipy.sparse matrix
        Interaction matrix. The local names below treat rows as items and
        columns as users (item-user orientation).
    pct_test : float, default 0.2
        Fraction of the non-zero entries to mask. The number of masked
        entries is rounded up (``ceil``).
    seed : int, default 0
        Seed of the private random generator used for sampling. The default
        selects the same entries as seeding the global ``random`` module
        with 0, but leaves the global generator state untouched.

    Returns
    -------
    training_set : scipy.sparse matrix
        Copy of ``ratings`` with the sampled entries removed.
    test_set : scipy.sparse matrix
        Binarised copy of the full ``ratings`` matrix.
    altered_users : list
        Unique column indices that had at least one entry masked.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when the computed sample count exceeds
        the number of non-zero entries.
    """
    test_set = ratings.copy()
    test_set[test_set != 0] = 1  # binary preference matrix

    training_set = ratings.copy()

    # (row, column) coordinates of every observed interaction
    row_inds, col_inds = training_set.nonzero()
    nonzero_pairs = list(zip(row_inds, col_inds))

    # A private generator keeps the split reproducible without reseeding the
    # process-wide `random` state as a side effect.
    rng = random.Random(seed)

    num_samples = int(np.ceil(pct_test * len(nonzero_pairs)))  # rounded up
    samples = rng.sample(nonzero_pairs, num_samples)  # without replacement

    item_inds = [pair[0] for pair in samples]  # row indices of masked entries
    user_inds = [pair[1] for pair in samples]  # column indices of masked entries

    if samples:
        # Nothing to assign when no entry was sampled (e.g. pct_test == 0)
        training_set[item_inds, user_inds] = 0
    training_set.eliminate_zeros()  # drop the zeroed entries from sparse storage

    return training_set, test_set, list(set(user_inds))

Let's use the above function to create the training and test sets for our analysis.

In [8]:
# 5% of the observed interactions are masked for this exercise (pct_test = 0.05)
product_train, product_test, product_users_altered = make_train(sparse_item_user, pct_test = 0.05)

### Building the recommendation system

Here we will use the `implicit` Python package to build the recommendation engine.

In [9]:
%%time
# ALS hyper-parameters used below:
#   factors        = 20  -- latent factors in each user and item vector
#   regularization = 0.1 -- regularization constant used in the cost function
#   iterations     = 40  -- number of iterations to run while fitting the data
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=40,
)

# The confidence values are the masked training matrix scaled by alpha.
# alpha = 15 was settled on after trying several different alpha values.
alpha_val = 15
data_conf = (product_train * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)

100%|████████████████████████████████████████████████████████████████████████████████| 40.0/40 [00:07<00:00,  5.05it/s]


In [10]:
# Latent-factor matrices learned by the fitted model
item_vecs, user_vecs = model.item_factors, model.user_factors

In [11]:
# Report the dimensions of the two factor matrices taken from the fitted model
print('Shape of Artist vector matrix : ', item_vecs.shape)
print('Shape of User vector matrix : ', user_vecs.shape)

Shape of Artist vector matrix :  (110820, 20)
Shape of User vector matrix :  (40913, 20)


### Evaluating the recommendation system using AUC-ROC curve


In [12]:
from sklearn import metrics
import matplotlib.pylab as plt
def auc_score(predictions, test):
    '''Area under the ROC curve for one set of scores.

    predictions: - scores to rank by (passed to roc_curve as the score argument)
    test:        - true labels (passed to roc_curve as the label argument)
    returns:     - AUC (area under the Receiver Operating Characteristic curve)
    '''
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [13]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    '''Mean per-user AUC of the model scores and of a popularity baseline.

    For every user in ``altered_users`` only the items that are zero in that
    user's training column are scored, so the masked interactions are ranked
    against items the user never interacted with.

    training_set:  - sparse matrix, rows = items, columns = users, with the
                     held-out interactions removed
    altered_users: - iterable of column indices of users that had at least
                     one interaction masked
    predictions:   - [user_factors, item_factors_transposed], both sparse;
                     ``predictions[0][user, :].dot(predictions[1])`` must give
                     one score per item
    test_set:      - sparse binary matrix of all original interactions, same
                     orientation as ``training_set``
    returns:       - (mean model AUC, mean popularity AUC), each rounded to
                     three decimal places
    '''
    # CSC keeps each column contiguous, so slicing one user's column per
    # iteration is cheap; slicing columns out of CSR is not.
    training_cols = training_set.tocsc()
    test_cols = test_set.tocsc()

    user_vecs, item_vecs = predictions[0], predictions[1]

    # Item popularity = number of interactions per item in the full data
    pop_items = np.array(test_set.sum(axis = 1)).reshape(-1)

    store_auc = []       # model AUC for each altered user
    popularity_auc = []  # popularity-baseline AUC for each altered user
    for user in altered_users:
        training_column = training_cols[:, user].toarray().reshape(-1)
        # Items this user has no interaction with in the training data
        zero_inds = np.where(training_column == 0)[0]

        # Model scores for those items
        pred = user_vecs[user, :].dot(item_vecs).toarray()[0, zero_inds].reshape(-1)

        # Ground truth for the same items, taken from the full binarised data
        actual = test_cols[:, user].toarray()[zero_inds, 0].reshape(-1)

        # Popularity score for the same items
        pop = pop_items[zero_inds]

        store_auc.append(auc_score(pred, actual))
        popularity_auc.append(auc_score(pop, actual))

    # Mean AUC rounded to three decimal places for both model and baseline
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [14]:
%%time
calc_mean_auc(product_train, product_users_altered,
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)
# Result: (mean AUC of our recommender system, mean AUC of the popularity baseline)

(0.962, 0.934)

### Example for recommendation

### Finding similar artists

In [16]:
# Sample rows for this artist; the artist_id column shows the integer code assigned to it
data.loc[data['artist'] == 'red hot chili peppers'].head(5)

Unnamed: 0,user,artist,plays,user_id,artist_id
4,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,0,80876
1422,000429493d9716b66b02180d208d09b5b89fbe64,red hot chili peppers,234,29,80876
2139,0007e26aafcfc0b6dcb87d7041583fbb7cced88a,red hot chili peppers,159,44,80876
3284,000b0bb32f149504e1df3cce85b6bfd20cef3dd0,red hot chili peppers,46,68,80876
3322,000b2ee840cbda56e0f41c8f248c4fb7ee275db3,red hot chili peppers,87,69,80876


In [17]:
# Find the 10 most similar artists to red hot chili peppers.
# artist_id values are the category codes of data['artist'], so the id is
# looked up from the artist name instead of being hard-coded, and a code maps
# back to its name by indexing the categories directly.
artist_names = data['artist'].cat.categories
artist_id = artist_names.get_loc('red hot chili peppers')
n_similar = 10 # getting the top ten similar items

# Use implicit to get similar items.
similar = model.similar_items(artist_id, n_similar)
# Print the names of our most similar artists (the similarity score is not shown)
for idx, _score in similar:
    print(artist_names[idx])

red hot chili peppers
muse
nirvana
coldplay
placebo
queen
the killers
foo fighters
the beatles
oasis


In a similar way, let's look at another band.

In [18]:
# Sample rows for this artist; the artist_id column shows the integer code assigned to it
data.loc[data['artist'] == 'die Ärzte'].head(5)

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,30264
2943,000a1585c5f65532a9c9187a882892982d345a5c,die Ärzte,148,61,30264
3787,000cb6427411006fe9a6193d3c4f59efed53fbef,die Ärzte,7,78,30264
6295,0014ffc91d3a5b59cce9bceaf22ef0d72e5711b8,die Ärzte,88,128,30264
13513,003059a886782e4d7936da913d3f064f637d0b2b,die Ärzte,5,274,30264


In [19]:
# Find the 10 most similar artists to die Ärzte.
# artist_id values are the category codes of data['artist'], so the id is
# looked up from the artist name instead of being hard-coded, and a code maps
# back to its name by indexing the categories directly.
artist_names = data['artist'].cat.categories
artist_id = artist_names.get_loc('die Ärzte')
n_similar = 10 # getting the top ten similar items

# Use implicit to get similar items.
similar = model.similar_items(artist_id, n_similar)
# Print the names of our most similar artists (the similarity score is not shown)
for idx, _score in similar:
    print(artist_names[idx])

die Ärzte
mando diao
guano apes
guns n roses
billy talent
limp bizkit
evanescence
bloodhound gang
him
3 doors down
