In [5]:
""" An example of using this library to calculate related artists
from the last.fm dataset. More details can be found
at http://www.benfrederickson.com/matrix-factorization/

The dataset here can be found at
http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

Note there are some invalid entries in this dataset, running
this function will clean it up so pandas can read it:
https://github.com/benfred/bens-blog-code/blob/master/distance-metrics/musicdata.py#L39
"""

from __future__ import print_function

import logging
import argparse
import time

import numpy
import pandas
from scipy.sparse import coo_matrix
import annoy

from implicit import alternating_least_squares



In [6]:
OPENBLAS_NUM_THREADS=1

In [7]:
logging.basicConfig(level=logging.DEBUG)

In [29]:
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    data = pandas.read_table(filename,
                             usecols=[0, 1, 3],
                             names=['artist', 'user', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(float),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))

    return data, plays


def bm25_weight(X, K1=3000, B=0.8):
    """ Weighs each row of the sparse matrix of the data by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)
    N = X.shape[0]
    idf = numpy.log(float(N) / (1 + numpy.bincount(X.col)))

    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X


class TopRelated(object):
    def __init__(self, artist_factors):
        # fully normalize artist_factors, so can compare with only the dot product
        norms = numpy.linalg.norm(artist_factors, axis=-1)
        self.factors = artist_factors / norms[:, numpy.newaxis]

    def get_related(self, artistid, N=10):
        scores = self.factors.dot(self.factors[artistid])
        best = numpy.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])


class ApproximateTopRelated(object):
    def __init__(self, artist_factors, treecount=20):
        index = annoy.AnnoyIndex(artist_factors.shape[1], 'angular')
        for i, row in enumerate(artist_factors):
            index.add_item(i, row)
        index.build(treecount)
        self.index = index

    def get_related(self, artistid, N=5):
        neighbours = self.index.get_nns_by_item(artistid, N)
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                      for other in neighbours), key=lambda x: -x[1])


def calculate_similar_artists(input_filename, output_filename,
                              factors=30, regularization=0.01,
                              iterations=40,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    print("Calculating similar artists. This might take a while")
    print("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    print("read data file in %s", time.time() - start)

    print("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype,
                                                             use_cg=cg)
    print("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    print(artist_factors)
    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    print("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))


In [30]:
calculate_similar_artists('data/bill_1.csv', 'data/billPerson_2.output')


DEBUG:implicit:finished iteration 0 in 0.0304811000824
DEBUG:implicit:finished iteration 1 in 0.0305988788605
DEBUG:implicit:finished iteration 2 in 0.0348169803619


Calculating similar artists. This might take a while
reading data from %s data/bill_1.csv
read data file in %s 0.0613839626312
weighting matrix by bm25
calculating factors


DEBUG:implicit:finished iteration 3 in 0.0368061065674
DEBUG:implicit:finished iteration 4 in 0.0348420143127
DEBUG:implicit:finished iteration 5 in 0.0414049625397
DEBUG:implicit:finished iteration 6 in 0.0366909503937
DEBUG:implicit:finished iteration 7 in 0.0382311344147
DEBUG:implicit:finished iteration 8 in 0.0441970825195
DEBUG:implicit:finished iteration 9 in 0.0385420322418
DEBUG:implicit:finished iteration 10 in 0.0412511825562
DEBUG:implicit:finished iteration 11 in 0.0369868278503
DEBUG:implicit:finished iteration 12 in 0.041825056076
DEBUG:implicit:finished iteration 13 in 0.0379238128662
DEBUG:implicit:finished iteration 14 in 0.0388481616974
DEBUG:implicit:finished iteration 15 in 0.0366549491882
DEBUG:implicit:finished iteration 16 in 0.0393178462982
DEBUG:implicit:finished iteration 17 in 0.0413429737091
DEBUG:implicit:finished iteration 18 in 0.0427808761597
DEBUG:implicit:finished iteration 19 in 0.0398151874542
DEBUG:implicit:finished iteration 20 in 0.0408370494843


calculated factors in %s 1.72389793396
calculating top artists
[[ -5.90747261e-03   9.08448306e-04   4.62682983e-02 ...,   6.95666457e-03
    1.24884337e-02   1.31453909e-02]
 [  5.46602572e-02  -3.51383271e-02   3.49669028e-02 ...,  -1.44070672e-02
   -1.58458732e-02  -3.23981164e-02]
 [  4.44312404e-02  -3.01737955e-02   1.25254121e-02 ...,   1.57609402e-02
    2.89474566e-02  -3.28779494e-02]
 ..., 
 [ -9.94665304e-03  -4.61746317e-02   9.78075528e-03 ...,  -2.70238470e-02
   -5.15347830e-03   4.67938562e-02]
 [  5.07836787e-02   1.34726745e-02  -6.57552030e-02 ...,   2.69928599e-02
   -5.61087256e-04   3.00529806e-02]
 [  8.79245562e-03   3.55183673e-03   3.45596127e-03 ...,  -1.66531111e-03
    1.27808735e-03   7.17304646e-05]]
writing top related to %s data/billPerson_2.output
