In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv',sep = '\t', header=None) 

In [2]:
# delete data with missing values
df.dropna(subset=[2], inplace=True)
df.reset_index(drop=True, inplace=True)

# add my own data
my_df = pd.read_csv('janelleek_lastfm_data.txt',sep = '\t', header=None) 
frames = [my_df, df]
final_df = pd.concat(frames, ignore_index=True)
#display(df)

# map artist ids to artist names
artist_ids = dict(zip(final_df[1].tolist(), final_df[2].tolist()))

In [23]:
# convert listen counts to artist ratings for each user

data_idx = 0
ratings = [0] * len(final_df)
counts = final_df[0].value_counts()

cur_user = None
count = 0
for idx, data in final_df.iterrows():
    
    if data[0] != cur_user:
        cur_user = data[0]
        count = 0
        
    if count < 5:
        ratings[idx] = 5
        
    elif count < 10:
        ratings[idx] = 4.5
        
    elif count < 20:
        ratings[idx] = 4
    
    elif count < 30:
        ratings[idx] = 3.5
        
    else:
        ratings[idx] = 3
        
    count += 1
    
final_df[4] = ratings

In [24]:
display(final_df)

Unnamed: 0,0,1,2,3,4
0,janelleek,20244d07-534f-4eff-b4d4-930878889970,Taylor Swift,1867,5.0
1,janelleek,9f81247f-7f57-42f3-a8ba-75bef554e591,Big Thief,1106,5.0
2,janelleek,763cb144-afdb-471e-bd86-d4f5b9b58641,Adrianne Lenker,824,5.0
3,janelleek,4a42d940-ad63-4b06-a871-f6e37eab4c6f,Field Medic,789,5.0
4,janelleek,01d3c51b-9b98-418a-8d8e-37f6fab59d8c,Sufjan Stevens,778,5.0
...,...,...,...,...,...
17535494,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12,3.0
17535495,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11,3.0
17535496,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11,3.0
17535497,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10,3.0


In [25]:
from surprise import Dataset, NormalPredictor, Reader, accuracy, SVD
from surprise.model_selection import cross_validate, train_test_split

reader = Reader(rating_scale=(1,5))
# final_df column 0 is user id, column 1 is artist id, and column 4 is rating
data = Dataset.load_from_df(final_df[[0, 1, 4]], reader)

In [26]:
algo = SVD()

cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6745  0.6747  0.6752  0.6747  0.6750  0.6748  0.0002  
MAE (testset)     0.5706  0.5708  0.5712  0.5708  0.5711  0.5709  0.0002  
Fit time          93.60   118.00  98.65   123.30  117.51  110.21  11.79   
Test time         9.51    38.71   9.94    10.34   31.29   19.96   12.51   


{'test_rmse': array([0.67452989, 0.67469305, 0.67521328, 0.67471303, 0.67496352]),
 'test_mae': array([0.57057833, 0.57080495, 0.57124021, 0.57082237, 0.57109521]),
 'fit_time': (93.59975123405457,
  118.00345492362976,
  98.65175819396973,
  123.3032009601593,
  117.5145628452301),
 'test_time': (9.510699033737183,
  38.71044301986694,
  9.938780307769775,
  10.341867923736572,
  31.29385209083557)}

In [27]:
from collections import defaultdict

# code from surprise documentation 
# https://surprise.readthedocs.io/en/stable/FAQ.html?highlight=read_item_names()#raw-inner-note
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [28]:
#mean = trainset._global_mean

artists_rated = [None] * 48

for idx, data in final_df[:48].iterrows():
    
    artists_rated[idx] = data[1]

mean = 3.52
my_testset = [None] * (trainset.n_items - 48)
idx = 0
for i_id in trainset.all_items():
    
    r_id = trainset.to_raw_iid(i_id)
    if r_id not in artists_rated:
        my_testset[idx] = ('janelleek', r_id, mean)
        idx += 1

#print(my_testset[:5])

In [29]:
testset = trainset.build_testset()
predictions_test = algo.test(testset)
predictions = algo.test(my_testset)

rmse_test = accuracy.rmse(predictions_test)
my_rmse = accuracy.rmse(predictions)

#print(f"Test RMSE: {rmse_test}, My RMSE: {my_rmse}")

RMSE: 0.6429
RMSE: 0.1507


In [30]:
top_n = get_top_n(predictions, n=25)

ct = 0
for artist, rate in top_n['janelleek']:
   
    ct+= 1
    print(f"{ct}. {rate}: {artist_ids[artist]}")

1. 4.5300003737281225: 동방신기
2. 4.355627327920331: zen café
3. 4.3249087958275645: perkele
4. 4.31106822124001: la polla records
5. 4.3043146200338525: die Ärzte
6. 4.286490354245064: vetusta morla
7. 4.277251129346024: rush limbaugh
8. 4.268251682639694: costas andreou
9. 4.264273845518622: böhse onkelz
10. 4.263643680101826: züri west
11. 4.258713425230485: z-ro
12. 4.254065325476954: block out
13. 4.248389293975972: brian houston
14. 4.24511957766629: los hermanos
15. 4.243181881364825: wirtz
16. 4.242404172401508: los hermanos
17. 4.239494508753434: whiskeytown
18. 4.236831920295854: music together
19. 4.23598560916699: simone dinnerstein
20. 4.2354544292823695: sort stue
21. 4.225619237656125: big bang
22. 4.225249215337129: 上海アリス幻樂団
23. 4.224429329908503: super junior
24. 4.218649785560569: doctor deseo
25. 4.213363129047933: ortega cartel
