# Importing all needed libraries

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import Recommenders
from file_functions import load_dataset

# Downloading datasets

**For the first time, the function can take a long time, since it needs to download two large enough datasets.**

We check that the data files are downloaded, if not downloaded, then and saved to a file, if downloaded, then just read from the file.

We are using https://static.turi.com/datasets/millionsong/10000.txt set, as it is a subset of a million set of songs. Million songs set holds more than 600 GB of data, while its subset takes up far less. Also, to get more information about songs, we are using https://static.turi.com/datasets/millionsong/song_data.csv. With this set, we have a song name, artist name, and album name — all other things we don't need, as we don't make a deep analysis of the song.

In [2]:
songs = load_dataset('./data', 'https://static.turi.com/datasets/millionsong/10000.txt', 'https://static.turi.com/datasets/millionsong/song_data.csv', 'song.csv')

# Data analysis and simple data preprocessing

## Basic analysis

In [3]:
songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,danceability,valence,tempo,duration_ms,acousticness,instrumentalness,liveness,speechiness,loudness,energy
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,0.458,0.657,-12.02,0.0346,0.933,0.878,0.0842,0.914,102.45,361867
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,0.617,0.717,-7.858,0.153,0.00564,0.0,0.408,0.49,103.992,311867
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,0.705,0.309,-12.596,0.0363,0.476,3.8e-05,0.114,0.426,122.039,201653
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,0.465,0.919,-4.025,0.0408,1.8e-05,2.1e-05,0.262,0.537,135.997,235293
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007,0.444,0.889,-3.756,0.105,0.273,0.000278,0.193,0.633,169.907,231307


We drop all None values from the dataset.

We output the number of empty values before deleting them because, after that, they will be removed from the dataset, so that we will receive zeros.

In [4]:
print(songs.isnull().sum())
songs.dropna(inplace=True)

user_id             0
song_id             0
listen_count        0
title               0
release             0
artist_name         0
year                0
danceability        0
valence             0
tempo               0
duration_ms         0
acousticness        0
instrumentalness    0
liveness            0
speechiness         0
loudness            0
energy              0
dtype: int64


In [5]:
songs.describe()

Unnamed: 0,listen_count,year,danceability,valence,tempo,duration_ms,acousticness,instrumentalness,liveness,speechiness,loudness,energy
count,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0
mean,2.064103,1753.777778,0.574808,0.67721,-7.597098,0.076218,0.215911,0.156351,0.204659,0.499094,123.423034,249660.846154
std,2.310675,661.083986,0.155861,0.213139,3.581928,0.091867,0.285248,0.306041,0.180266,0.243177,24.718143,65798.537484
min,1.0,0.0,0.162,0.0636,-29.512,0.0225,7e-06,0.0,0.0293,0.0366,67.604,65667.0
25%,1.0,1996.0,0.47075,0.5355,-9.3055,0.0346,0.00555,3e-06,0.0902,0.316,104.02875,209110.0
50%,1.0,2003.0,0.572,0.711,-7.06,0.0441,0.05705,0.000782,0.132,0.5015,122.8965,241129.5
75%,2.0,2007.0,0.69175,0.86,-5.20475,0.074025,0.347,0.0546,0.28575,0.68375,136.251,285883.25
max,18.0,2010.0,0.949,0.99,-1.629,0.784,0.982,0.972,0.991,0.978,208.571,600293.0


In [6]:
songs.columns

Index(['user_id', 'song_id', 'listen_count', 'title', 'release', 'artist_name',
       'year', 'danceability', 'valence', 'tempo', 'duration_ms',
       'acousticness', 'instrumentalness', 'liveness', 'speechiness',
       'loudness', 'energy'],
      dtype='object')

# Basic preprocess of data

We are adding a couple of new columns to our datasheet, as we want to save the past columns so that we can list the song names in the end.

This couple of columns is encoded columns of user id song id, album name and artist_name. We change user_id to our encoded user_id, as it is coded in start dataset, so that it wouldn't give us any information.

In [7]:
le = LabelEncoder()
songs['user_id'] = le.fit_transform(songs['user_id'])
songs['year'] = pd.to_numeric(songs['year'])
songs['song_id'] = le.fit_transform(songs['song_id'])
songs['encoded_artist_name'] = le.fit_transform(songs['artist_name'])
songs['encoded_release'] = le.fit_transform(songs['release'])
songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,danceability,valence,tempo,duration_ms,acousticness,instrumentalness,liveness,speechiness,loudness,energy,encoded_artist_name,encoded_release
0,8,11,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,0.458,0.657,-12.02,0.0346,0.933,0.878,0.0842,0.914,102.45,361867,78,49
1,8,25,1,Stronger,Graduation,Kanye West,2007,0.617,0.717,-7.858,0.153,0.00564,0.0,0.408,0.49,103.992,311867,48,58
2,8,26,1,Constellations,In Between Dreams,Jack Johnson,2005,0.705,0.309,-12.596,0.0363,0.476,3.8e-05,0.114,0.426,122.039,201653,34,73
3,8,36,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,0.465,0.919,-4.025,0.0408,1.8e-05,2.1e-05,0.262,0.537,135.997,235293,28,147
4,8,37,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007,0.444,0.889,-3.756,0.105,0.273,0.000278,0.193,0.633,169.907,231307,30,13


## Replace zero's from dataset

In [8]:
song_df = songs
song_df.replace(0, 0.00000001, inplace=True)

## Create a subset of the dataset

In [9]:
#Merge song title and artist_name columns to make a merged column
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

## Showing the most popular songs in the dataset

In [10]:
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
88,High Life - Daft Punk,2,0.854701
92,Human After All - Daft Punk,2,0.854701
149,Right Back - Sublime,2,0.854701
195,The Real Slim Shady - Eminem,2,0.854701
0,16 Candles - The Crests,1,0.427350
...,...,...,...
225,Yellow - Coldplay,1,0.427350
226,You And Me Jesus - Jake Hess,1,0.427350
227,You Get What You Give - New Radicals,1,0.427350
228,You Know What You Are? - Nine Inch Nails,1,0.427350


## Count number of unique users in the dataset

Count number of unique users in the dataset

In [11]:
users = song_df['user_id'].unique()
len(users)

12

## Quiz 1. Count the number of unique songs in the dataset

In [12]:
###Fill in the code here
songs = song_df['song'].unique()
len(songs)

230

## Create a song recommender

In [13]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
train_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,danceability,valence,tempo,duration_ms,acousticness,instrumentalness,liveness,speechiness,loudness,energy,encoded_artist_name,encoded_release,song
75,11.0,38.0,2,Come As You Are,Nirvana,Nirvana,1e-08,0.5,0.824,-5.846,0.0388,0.00016,0.00161,0.0916,0.539,120.125,218920,75.0,99.0,Come As You Are - Nirvana
33,8.0,215.0,1,City Love,Any Given Thursday,John Mayer,1e-08,0.651,0.548,-6.036,0.0304,0.00576,5.58e-05,0.145,0.411,67.604,240200,38.0,14.0,City Love - John Mayer
123,7.0,108.0,2,The Prime Time Of Your Life,Human After All,Daft Punk,2005.0,0.503,0.507,-7.243,0.181,4.5e-05,0.514,0.288,0.0987,128.606,263240,19.0,69.0,The Prime Time Of Your Life - Daft Punk
63,1.0,65.0,4,Missing You,Love To Love,John Waite,1984.0,0.552,0.552,-9.736,0.0364,0.052,1e-08,0.046,0.593,208.571,269760,39.0,84.0,Missing You - John Waite
209,1e-08,133.0,5,The Bachelor and the Bride,Her Majesty The Decemberists,The Decemberists,2003.0,0.385,0.717,-7.906,0.0364,0.0556,7.86e-06,0.134,0.505,164.505,252987,98.0,65.0,The Bachelor and the Bride - The Decemberists


## Simple popularity-based recommender class (Can be used as a black box)

### Create an instance of popularity based recommender class

In [14]:
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'user_id', 'song')

### Use the popularity model to make some predictions

In [15]:
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
71,1.0,High Life - Daft Punk,2,1.0
75,1.0,Human After All - Daft Punk,2,2.0
119,1.0,Right Back - Sublime,2,3.0
154,1.0,The Real Slim Shady - Eminem,2,4.0
0,1.0,16 Candles - The Crests,1,5.0
1,1.0,83 - John Mayer,1,6.0
2,1.0,Against The Peruvian Monster - Man Man,1,7.0
3,1.0,All My Friends - LCD Soundsystem,1,8.0
4,1.0,All That We Perceive - Thievery Corporation,1,9.0
5,1.0,All The Things That Go To Make Heaven And Eart...,1,10.0


### Quiz 2: Use the popularity based model to make predictions for the following user id (Note the difference in recommendations from the first user id).

In [16]:
###Fill in the code here
user_id = users[8]
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
71,5.0,High Life - Daft Punk,2,1.0
75,5.0,Human After All - Daft Punk,2,2.0
119,5.0,Right Back - Sublime,2,3.0
154,5.0,The Real Slim Shady - Eminem,2,4.0
0,5.0,16 Candles - The Crests,1,5.0
1,5.0,83 - John Mayer,1,6.0
2,5.0,Against The Peruvian Monster - Man Man,1,7.0
3,5.0,All My Friends - LCD Soundsystem,1,8.0
4,5.0,All That We Perceive - Thievery Corporation,1,9.0
5,5.0,All The Things That Go To Make Heaven And Eart...,1,10.0


## Build a song recommender with personalization

We now create an item similarity based collaborative filtering model that allows us to make personalized recommendations to each user.

### Create an instance of item similarity based recommender class

In [17]:
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'song')

### Use the personalized model to make some song recommendations

In [18]:
#Print the songs for the user in training data
user_id = users[5]
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: 1.0:
------------------------------------------------------------------------------------
Missing You - John Waite
Ya Nada Queda - Kudai
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
Forgive Me - Leona Lewis
Somebody To Love - Justin Bieber
My Dad's Gone Crazy - Eminem / Hailie Jade
Without Me - Eminem
16 Candles - The Crests
The Real Slim Shady - Eminem
Push It - Salt-N-Pepa
Just Lose It - Eminem
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 11
no. of unique songs in the training set: 183
Non zero values in cooccurence_matrix :154


Unnamed: 0,user_id,song,score,rank
0,1.0,Nightvision - Daft Punk,0.045455,1
1,1.0,Riverside - Sidney Samson,0.045455,2
2,1.0,Fresh - Daft Punk,0.045455,3
3,1.0,Human After All - Daft Punk,0.045455,4
4,1.0,Too Long - Daft Punk,0.045455,5
5,1.0,Short Circuit - Daft Punk,0.045455,6
6,1.0,Digital Love - Daft Punk,0.045455,7
7,1.0,Technologic - Daft Punk,0.045455,8
8,1.0,Electric Feel - MGMT,0.045455,9
9,1.0,Criminal - Eminem,0.045455,10


### Quiz 3. Use the personalized model to make recommendations for the following user id. (Note the difference in recommendations from the first user id.)

In [19]:
user_id = users[7]
#Fill in the code here
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(user_id)


------------------------------------------------------------------------------------
Training data songs for the user userid: 6.0:
------------------------------------------------------------------------------------
Yellow - Coldplay
Trouble - Coldplay
The Scientist - Coldplay
In My Place - Coldplay
Lost! - Coldplay
Strawberry Swing - Coldplay
Swallowed In The Sea - Coldplay
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 7
no. of unique songs in the training set: 183
Non zero values in cooccurence_matrix :49


Unnamed: 0,user_id,song,score,rank
0,6.0,Stadium Love - Metric,0.0,1
1,6.0,Crystal Blue Persuasion - Tommy James And The ...,0.0,2
2,6.0,Nightvision - Daft Punk,0.0,3
3,6.0,Baby - the bird and the bee,0.0,4
4,6.0,Just Lose It - Eminem,0.0,5
5,6.0,Head Rolls Off - Frightened Rabbit,0.0,6
6,6.0,Angel On My Shoulder (EDX Radio Edit) - Kaskade,0.0,7
7,6.0,The Old Saloon - The Lonely Island,0.0,8
8,6.0,Caring Is Creepy - The Shins,0.0,9
9,6.0,Neon - John Mayer,0.0,10


### We can also apply the model to find similar songs to any song in the dataset

In [20]:
is_model.get_similar_items(['U Smile - Justin Bieber'])

no. of unique songs in the training set: 183
Non zero values in cooccurence_matrix :0


Unnamed: 0,user_id,song,score,rank
0,,Stadium Love - Metric,0.0,1
1,,Crystal Blue Persuasion - Tommy James And The ...,0.0,2
2,,Nightvision - Daft Punk,0.0,3
3,,Baby - the bird and the bee,0.0,4
4,,Just Lose It - Eminem,0.0,5
5,,Head Rolls Off - Frightened Rabbit,0.0,6
6,,Angel On My Shoulder (EDX Radio Edit) - Kaskade,0.0,7
7,,The Old Saloon - The Lonely Island,0.0,8
8,,Caring Is Creepy - The Shins,0.0,9
9,,Neon - John Mayer,0.0,10


### Quiz 4. Use the personalized recommender model to get similar songs for the following song.

In [21]:
song = 'Yellow - Coldplay'
###Fill in the code here
is_model.get_similar_items([song])

no. of unique songs in the training set: 183
Non zero values in cooccurence_matrix :7


Unnamed: 0,user_id,song,score,rank
0,,Swallowed In The Sea - Coldplay,1.0,1
1,,Strawberry Swing - Coldplay,1.0,2
2,,Lost! - Coldplay,1.0,3
3,,In My Place - Coldplay,1.0,4
4,,The Scientist - Coldplay,1.0,5
5,,Trouble - Coldplay,1.0,6
6,,Stadium Love - Metric,0.0,7
7,,Crystal Blue Persuasion - Tommy James And The ...,0.0,8
8,,Nightvision - Daft Punk,0.0,9
9,,Baby - the bird and the bee,0.0,10


# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

## Class to calculate precision and recall (This can be used as a black box)

In [22]:
#Evaluation.precision_recall_calculator

## Use the above precision recall calculator class to calculate the evaluation measures

In [23]:
import time
import Evaluation
start = time.time()

#Define what percentage of users to use for precision recall calculation
user_sample = 0.05

#Instantiate the precision_recall_calculator class
pr = Evaluation.precision_recall_calculator(test_data, train_data, pm, is_model)

#Call method to calculate precision and recall values
(pm_avg_precision_list, pm_avg_recall_list, ism_avg_precision_list, ism_avg_recall_list) = pr.calculate_measures(user_sample)

end = time.time()
print(end - start)

Length of user_test_and_training:8
Length of user sample:0


ZeroDivisionError: float division by zero

## Code to plot precision recall curve

In [None]:
import pylab as pl

#Method to generate precision and recall curve
def plot_precision_recall(m1_precision_list, m1_recall_list, m1_label):
    pl.clf()    
    pl.plot(m1_recall_list, m1_precision_list, label=m1_label)
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim([0.0, 0.10])
    pl.xlim([0.0, 0.10])
    pl.title('Precision-Recall curve')
    #pl.legend(loc="upper right")
    pl.legend(loc=9, bbox_to_anchor=(0.5, -0.2))
    pl.show()


In [None]:
print("Plotting precision recall curves.")

plot_precision_recall(pm_avg_precision_list, pm_avg_recall_list, "popularity_model")
plot_precision_recall(ism_avg_precision_list, ism_avg_recall_list, "item_similarity_model")
