# Building a song recommender

In [1]:
%matplotlib inline

import pandas
from sklearn.cross_validation import train_test_split
import numpy as np
import time
from sklearn.externals import joblib
import Recommenders as Recommenders
import Evaluation as Evaluation



# Load the data

In [2]:
#This step might take time to download data from external sources
triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt' # user information
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'# song information

song_df_1 = pandas.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

#Read song metadata
song_df_2 =  pandas.read_csv(songs_metadata_file)

#Merge the two dataframes above to create input dataframe for recommender systems
song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left") 

  """


# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [3]:
song_df_2.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


## Length of the dataset

In [4]:
len(song_df)

2000000

## Create a subset of the dataset

In [5]:
song_df = song_df.head(10000)

#Merge song title and artist_name columns to make a merged column
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

## Top most popular songs in the dataset

In [6]:
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch - Harmonia,45,0.45
4678,Undo - Björk,32,0.32
5105,You're The One - Dwight Yoakam,32,0.32
1071,Dog Days Are Over (Radio Edit) - Florence + Th...,28,0.28
3655,Secrets - OneRepublic,28,0.28
4378,The Scientist - Coldplay,27,0.27
4712,Use Somebody - Kings Of Leon,27,0.27
3476,Revelry - Kings Of Leon,26,0.26
1387,Fireflies - Charttraxx Karaoke,24,0.24
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.23


## Count number of unique users

In [7]:
users = song_df['user_id'].unique()
len(users) # Total number of unique users

365

## Count number of unique songs

In [8]:
songs = song_df['song'].unique()
len(songs) # Total number of unique songs

5151

## Count number of unique albums

In [9]:
albums = song_df['release'].unique()
len(albums) # Total number of unique albums

3103

## Count number of unique artists

In [10]:
artists = song_df['artist_name'].unique()
len(artists) # Total number of unique artists

1994

## Count number of total play counts

In [11]:
listen_counts = song_df['listen_count']
sum(listen_counts) # Total number of play counts

29911

# Building a Recommender System

## Split the data into train and test (test = 20%)

In [12]:
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0) # Split the data
print(train_data.head(5)) # Visualize a sample of the training data

                                       user_id             song_id  \
7389  94d5bdc37683950e90c56c9b32721edb5d347600  SOXNZOW12AB017F756   
9275  1012ecfd277b96487ed8357d02fa8326b13696a5  SOXHYVQ12AB0187949   
2995  15415fa2745b344bce958967c346f2a89f792f63  SOOSZAZ12A6D4FADF8   
5316  ffadf9297a99945c0513cd87939d91d8b602936b  SOWDJEJ12A8C1339FE   
356   5a905f000fc1ff3df7ca807d57edb608863db05d  SOAMPRJ12A8AE45F38   

      listen_count                 title  \
7389             2      Half Of My Heart   
9275             1  The Beautiful People   
2995             1     Sanctify Yourself   
5316             4     Heart Cooks Brain   
356             20                 Rorol   

                                                release      artist_name  \
7389                                     Battle Studies       John Mayer   
9275             Antichrist Superstar (Ecopac Explicit)   Marilyn Manson   
2995                             Glittering Prize 81/92     Simple Minds   
5316  Ever

## Popularity-Based Model

### Train the Popularity-Based Model

In [13]:
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'user_id', 'song')

### Use the popularity model to make some recommendations

In [14]:
user_id = users[49] # test with user: dd67a78d5f9d8140a0d83849441ca1807f7ea790
user_test = song_df['user_id']==user_id
#song_df[user_test] # Entire listinig history of the user
#train_data[user_test] # Training data for the respective user
#test_data[user_test] # Test data to the respective user
pm.recommend(user_id) # Recommendations

Unnamed: 0,user_id,song,score,Rank
3194,2aa3b8c9f60070025940183cdd44602086d7b535,Sehr kosmisch - Harmonia,37,1.0
4083,2aa3b8c9f60070025940183cdd44602086d7b535,Undo - Björk,27,2.0
931,2aa3b8c9f60070025940183cdd44602086d7b535,Dog Days Are Over (Radio Edit) - Florence + Th...,24,3.0
4443,2aa3b8c9f60070025940183cdd44602086d7b535,You're The One - Dwight Yoakam,24,4.0
3034,2aa3b8c9f60070025940183cdd44602086d7b535,Revelry - Kings Of Leon,21,5.0
3189,2aa3b8c9f60070025940183cdd44602086d7b535,Secrets - OneRepublic,21,6.0
4112,2aa3b8c9f60070025940183cdd44602086d7b535,Use Somebody - Kings Of Leon,21,7.0
1207,2aa3b8c9f60070025940183cdd44602086d7b535,Fireflies - Charttraxx Karaoke,20,8.0
1577,2aa3b8c9f60070025940183cdd44602086d7b535,Hey_ Soul Sister - Train,19,9.0
1626,2aa3b8c9f60070025940183cdd44602086d7b535,Horn Concerto No. 4 in E flat K495: II. Romanc...,19,10.0


## Build a CF Music Recommender System

We now create an item similarity based collaborative filtering model that allows us to make personalized recommendations to each user. 

### Train the CF Model

In [20]:
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(test_data, 'user_id', 'song')

### Use the CF model to make some song recommendations

In [16]:
#Print the songs for the user in training data -- # test with user: dd67a78d5f9d8140a0d83849441ca1807f7ea790
user_id = users[49]
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: 2aa3b8c9f60070025940183cdd44602086d7b535:
------------------------------------------------------------------------------------
The Laws Have Changed - The New Pornographers
Prisoner Of Love - Seventh Key
Ray Gun - the bird and the bee
It's Christmas So We'll Stop - Frightened Rabbit
The Bleeding Heart Show - The New Pornographers
Tuesday Moon - Neutral Milk Hotel
They Might Follow You - Tiny Vipers
Spider Cider - Man Man
I'm Not A Loser - Descendents
Mia - Emmy The Great
Whalebones - Man Man
Bratislava - Beirut
Illuminati - Fatboy Slim
The Prize - Soltero
Cassius (album version) - Foals
Star Bodies - The New Pornographers
Songs Of The Season - Soltero
Footloose And Fancy Free - Camera Obscura
Cat Claw - The Kills
The List - Metric
Chemo Limo (Remastered Version) - Regina Spektor
Somebody Told Me - The Killers
It's Christmas So We'll Stop (Choir Version) - The Pr

Unnamed: 0,user_id,song,score,rank
0,2aa3b8c9f60070025940183cdd44602086d7b535,Poison Oak (Album Version) - Bright Eyes,0.063179,1
1,2aa3b8c9f60070025940183cdd44602086d7b535,Last Day Of Magic - The Kills,0.062765,2
2,2aa3b8c9f60070025940183cdd44602086d7b535,The Flowers (Album Version) - Regina Spektor,0.062765,3
3,2aa3b8c9f60070025940183cdd44602086d7b535,Again & Again - the bird and the bee,0.058417,4
4,2aa3b8c9f60070025940183cdd44602086d7b535,Your Hand In Mine - Explosions In The Sky,0.058417,5
5,2aa3b8c9f60070025940183cdd44602086d7b535,Old Soul Song - Bright Eyes,0.05379,6
6,2aa3b8c9f60070025940183cdd44602086d7b535,Baby (Album Version) - Devendra Banhart,0.052309,7
7,2aa3b8c9f60070025940183cdd44602086d7b535,Gold Mine Gutted - Bright Eyes,0.050725,8
8,2aa3b8c9f60070025940183cdd44602086d7b535,Arc Of Time (time Code) (Album Version) - Brig...,0.050725,9
9,2aa3b8c9f60070025940183cdd44602086d7b535,Fisher Of Men - M Ward,0.050725,10


# Comparing both models performance

We now formally compare the popularity and the personalized models using precision-recall curves. 

## Use the above precision recall calculator class to calculate the evaluation measures

In [25]:
start = time.time()

#Define what percentage of users to use for precision recall calculation
user_sample = 0.20

#Instantiate the precision_recall_calculator class
pr = Evaluation.precision_recall_calculator(test_data, train_data, pm, is_model)

#Call method to calculate precision and recall values
(pm_avg_precision_list, pm_avg_recall_list, ism_avg_precision_list, ism_avg_recall_list) = pr.calculate_measures(user_sample)

end = time.time()
print(pr.calculate_measures(user_sample)) # Output the precision and recall of both models for each cutoff point 
print(end - start) # Output the total time to run output recommendations


Length of user_test_and_training:319
Length of user sample:63
Getting recommendations for user:eb1ad31d040406c6428c5f4420b0bc709b1c5350
No. of unique songs for the user: 5
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :72
Getting recommendations for user:1a39cf33853fd23d2242cf7b13cc8eb445befdd7
No. of unique songs for the user: 1
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :1
Getting recommendations for user:0a004c08b700e4edb74b44c2dbceca4280760a9a
No. of unique songs for the user: 4
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :23
Getting recommendations for user:2341121b6d6d2020303f02053dad60586e41034d
No. of unique songs for the user: 2
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :81
Getting recommendations for user:1b704d4cddabea8258bd93497fcb73eab32fa592
No. of unique songs for the user: 34
no. of unique songs in the training 

Non zero values in cooccurence_matrix :173
Getting recommendations for user:cc55e399781439435a046bcb1a4c78dc2c7ec0f3
No. of unique songs for the user: 3
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :9
Getting recommendations for user:a1380d458c15706b9d5282304db81a5a78352e96
No. of unique songs for the user: 3
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :9
Getting recommendations for user:956999576244ad42d6d41faac8505fbef0a4ccc1
No. of unique songs for the user: 4
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :16
Getting recommendations for user:53ba380d234fd6022818340983570354ee207f6b
No. of unique songs for the user: 1
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :1
Getting recommendations for user:42c9fba813dd9ca8ca3673277a0f923b6845a7a6
No. of unique songs for the user: 8
no. of unique songs in the training set: 1605
Non zero val

Non zero values in cooccurence_matrix :21
Getting recommendations for user:e3937c7c32f5b68422808a854a4a7a824ee448a5
No. of unique songs for the user: 32
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :1233
Getting recommendations for user:497f5a58ffeaa953d619e95ca5b8736e74b99127
No. of unique songs for the user: 5
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :162
Getting recommendations for user:37029a65b9925fb3c1964774fcab695b82955f76
No. of unique songs for the user: 11
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :177
Getting recommendations for user:a520488fcf049bbb5cd847cfa4f884c740692780
No. of unique songs for the user: 8
no. of unique songs in the training set: 1605
Non zero values in cooccurence_matrix :64
Getting recommendations for user:2aa3b8c9f60070025940183cdd44602086d7b535
No. of unique songs for the user: 9
no. of unique songs in the training set: 1605
Non 

Non zero values in cooccurence_matrix :103
([0.031746031746031744, 0.031746031746031744, 0.031746031746031744, 0.027777777777777776, 0.025396825396825393, 0.023809523809523808, 0.020408163265306117, 0.017857142857142856, 0.015873015873015876, 0.014285714285714284], [0.01020408163265306, 0.011854083282654708, 0.02931440074297217, 0.03195990338847482, 0.039896411324982756, 0.04307101449958593, 0.04307101449958593, 0.04307101449958593, 0.04307101449958593, 0.04307101449958593], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
162.716238022
