In [3]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
user_plays = pd.read_table("usersha1-artmbid-artname-plays.tsv",header = None, nrows = 2e7,
                          names = ['users', 'musicbrainz-artist-id', 'artist-name', 'tot_plays'],
                          usecols = ['users', 'artist-name', 'tot_plays'])

In [5]:
user_info = pd.read_table("usersha1-profile.tsv",header = None,
                          names = ['users', 'gender', 'age', 'country', 'signup_date'],
                          usecols = ['users', 'country'])

In [6]:
user_plays.head(20)

Unnamed: 0,users,artist-name,tot_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


Popular Artists

In [7]:
if user_plays['artist-name'].isnull().sum() > 0:
    user_plays = user_plays.dropna(axis = 0, subset = ['artist-name'])

In [8]:
artist_plays = (user_plays.
     groupby(by = ['artist-name'])['tot_plays'].
     sum().
     reset_index().
     rename(columns = {'tot_plays': 'total_artist_plays'})
     [['artist-name', 'total_artist_plays']]
    )

In [9]:
artist_plays.head()

Unnamed: 0,artist-name,total_artist_plays
0,04)],6
1,2,1606
2,58725ab=>,23
3,80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari,70
4,amy winehouse,23


In [10]:
user_plays_up = user_plays.merge(artist_plays, left_on = 'artist-name', right_on = 'artist-name', how = 'left')

Remove Possible Noise

In [11]:
user_play_final = user_plays_up.query('total_artist_plays >= 4000')
user_play_final.head()

Unnamed: 0,users,artist-name,tot_plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


More Filtering

In [12]:
combined = user_play_final.merge(user_info, left_on = 'users', right_on = 'users', how = 'left')
us_data = combined.query('country == \'United States\'')
us_data.head()

Unnamed: 0,users,artist-name,tot_plays,total_artist_plays,country
180,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,2366807,United States
181,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,6115545,United States
182,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,2194862,United States
183,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,4248296,United States
184,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,3495537,United States


remove duplicates

In [13]:
if not us_data[us_data.duplicated(['users', 'artist-name'])].empty:
    initial_rows = us_data.shape[0]
    
    print ('Initial dataframe shape {0}'.format(us_data.shape))
    us_data = us_data.drop_duplicates(['users', 'artist-name'])
    current_rows = us_data.shape[0]
    print ('New dataframe shape {0}'.format(us_data.shape))
    print ('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (3155035, 5)
New dataframe shape (3155028, 5)
Removed 7 rows


In [14]:
wide_artist_data = us_data.pivot(index = 'artist-name', columns = 'users', values = 'tot_plays').fillna(0)

In [15]:
#wide_artist_data.to_csv('out.csv')

In [16]:
#wide_artist_data = pd.read_csv('out.csv')

In [17]:
#wide_artist_data_sparse = csr_matrix(wide_artist_data.values)

In [18]:
from scipy.sparse import csr_matrix

def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

#save_sparse_csr('lastfm_sparse_artist_matrix.npz', wide_artist_data_sparse)

In [19]:
wide_artist_data_sparse = load_sparse_csr("lastfm_sparse_artist_matrix.npz")

In [20]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn.fit(wide_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [21]:
#query_index = np.random.choice(wide_artist_data.shape[0])
#distances, indices = knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

#for i in range(0, len(distances.flatten())):
#    if i == 0:
#        print ('Recommendations for {0}:\n'.format(wide_artist_data.index[query_index]))
#    else:
#        print ('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

fuzzy character matching

In [43]:
import string
from fuzzywuzzy import fuzz

In [44]:
list1 = []
def print_artist_recommendations(query_artist, artist_plays_matrix, knn_model, k):
    query_index = None
    ratio_tuples = []
    
    for i in artist_plays_matrix.index:
        ratio = fuzz.ratio(i.lower(), query_artist.lower())
        if ratio >= 75:
            current_query_index = artist_plays_matrix.index.tolist().index(i)
            ratio_tuples.append((i, ratio, current_query_index))
    
    print ('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))
    
    try:
        query_index = max(ratio_tuples, key = lambda x: x[1])[2] # get the index of the best artist match in the data
    except:
        print ('Your artist didn\'t match any artists in the data. Try again')
        return None
    
    distances, indices = knn_model.kneighbors(artist_plays_matrix.iloc[query_index, :].values.reshape(1, -1), n_neighbors = k + 1)
    
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print ('Recommendations for {0}:\n'.format(artist_plays_matrix.index[query_index]))           
        else:
            print ('{0}: {1}, with distance of {2}'.format(i, artist_plays_matrix.index[indices.flatten()[i]], distances.flatten()[i]))
            list1.append(artist_plays_matrix.index[indices.flatten()[i]]) 
    return None

In [45]:
a = []
input_string = input("Enter artists separated by comma ")
artist_list  = input_string.split(",")
print("Print all artists names")
for name in artist_list:
    a.append(name)
print(a)

Enter artists separated by comma eminem, green day, fleet foxes
Print all artists names
['eminem', ' green day', ' fleet foxes']


In [46]:
for i in range(len(a)):
    print_artist_recommendations(a[i], wide_artist_data, knn, k = 10)

Possible matches: [('eminem', 100)]

Recommendations for eminem:

1: 2pac, with distance of 0.5966543583257085
2: d12, with distance of 0.598203539778597
3: 50 cent, with distance of 0.6113038949594899
4: dr. dre, with distance of 0.6479407762511962
5: ludacris, with distance of 0.6794799361735397
6: lil wyte, with distance of 0.6997210186814351
7: obie trice, with distance of 0.706351213491091
8: bone thugs-n-harmony, with distance of 0.729129261249249
9: the game, with distance of 0.7294030377946903
10: ja rule, with distance of 0.732213961914209
Possible matches: [('green day', 95), ('u2 and green day', 77)]

Recommendations for green day:

1: foxboro hot tubs, with distance of 0.25733563636575385
2: the network, with distance of 0.2860840742261094
3: pinhead gunpowder, with distance of 0.5279067271690281
4: the frustrators, with distance of 0.6759663463358357
5: lucky boys confusion, with distance of 0.7554772221436907
6: nofx, with distance of 0.809201802366501
7: the offspring, w

In [47]:
#final_list = zip()
print("The Recommendations are :", list1[0:3],list1[10:13],list1[20:23])

The Recommendations are : ['2pac', 'd12', '50 cent'] ['foxboro hot tubs', 'the network', 'pinhead gunpowder'] ['stile antico', 'greg maroney', 'liz story']
