# Importing all needed libraries

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import Recommenders
from file_functions import load_dataset

# Downloading datasets

**The first run can take a long time, because the function has to download two fairly large datasets.**

We first check whether the data files have already been downloaded. If they have not, they are downloaded and saved to a file; if they have, they are simply read from that file.

We use the https://static.turi.com/datasets/millionsong/10000.txt set, which is a subset of the Million Song Dataset. The full dataset holds more than 600 GB of data, while this subset takes up far less. To get more information about the songs, we also use https://static.turi.com/datasets/millionsong/song_data.csv. From this set we get the song name, artist name, and album name — we don't need anything else, as we don't perform a deep analysis of the songs.

In [2]:
# Build the working frame from the listening-history file and the song-metadata file.
# load_dataset comes from the local file_functions module; it is expected to download
# the two URLs into ./data on first use and read the saved 'song.csv' afterwards.
songs = load_dataset(
    './data',
    'https://static.turi.com/datasets/millionsong/10000.txt',
    'https://static.turi.com/datasets/millionsong/song_data.csv',
    'song.csv',
)

# Data analysis and simple data preprocessing

## Basic analysis

In [3]:
# Peek at the first rows to check which columns were loaded and how values are formatted.
songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,0.458,0.657,-12.02,0.0346,0.933,0.878,0.0842,0.914,102.45,361867
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,0.617,0.717,-7.858,0.153,0.00564,0.0,0.408,0.49,103.992,311867
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,0.705,0.309,-12.596,0.0363,0.476,3.8e-05,0.114,0.426,122.039,201653
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,0.465,0.919,-4.025,0.0408,1.8e-05,2.1e-05,0.262,0.537,135.997,235293
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007,0.444,0.889,-3.756,0.105,0.273,0.000278,0.193,0.633,169.907,231307


We drop all rows that contain missing (None/NaN) values from the dataset.

We print the number of missing values per column before dropping them: once the rows are removed from the dataset, the same check would only report zeros.

In [4]:
# Report the number of missing values per column first; after the rows are
# dropped the same call would only show zeros.
print(songs.isnull().sum())

# Rebind to the cleaned frame instead of mutating with inplace=True: the result
# is the same, but the statement stays chainable and does not silently modify
# the object that load_dataset returned.
songs = songs.dropna()

user_id             0
song_id             0
listen_count        0
title               0
release             0
artist_name         0
year                0
danceability        0
energy              0
loudness            0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
dtype: int64


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
# NOTE(review): the saved output lists only the float audio features; listen_count,
# year and duration_ms are absent, which suggests they are not numeric dtype at
# this point — confirm, and convert them if they are needed as numbers.
songs.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.584333,0.6759,-6.971167,0.06365,0.255849,0.072793,0.257853,0.557133,120.127333
std,0.127129,0.214958,2.849295,0.064232,0.304903,0.232663,0.236601,0.205369,27.517803
min,0.3,0.245,-12.596,0.0259,1.8e-05,0.0,0.058,0.155,81.047
25%,0.487,0.529,-8.30375,0.033425,0.0199,2.3e-05,0.1035,0.42625,96.76325
50%,0.598,0.715,-6.35,0.0387,0.0836,0.00061,0.137,0.553,116.059
75%,0.69675,0.84025,-4.766,0.05305,0.44975,0.009323,0.3385,0.669,142.01725
max,0.804,0.959,-2.383,0.306,0.933,0.947,0.886,0.951,169.907


In [6]:
# Show the column labels of the frame before new columns are added.
songs.columns

Index(['user_id', 'song_id', 'listen_count', 'title', 'release', 'artist_name',
       'year', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')

# Basic preprocess of data

We add a couple of new columns to our dataset instead of overwriting the existing ones, because we want to keep the original columns so that we can list the song names at the end.

The new columns are the encoded album name (`encoded_release`) and the encoded artist name (`encoded_artist_name`). `user_id` and `song_id` are replaced by their encoded values: they are already opaque identifiers in the source dataset, so the original values would not give us any information.

In [23]:
# Keep one fitted encoder per source column. A single shared LabelEncoder is
# refitted by every fit_transform call, so only the last column's mapping would
# survive and the integer codes of the other columns could not be decoded again
# (e.g. with encoders['song_id'].inverse_transform(...)).
encoders = {
    column: LabelEncoder()
    for column in ('user_id', 'song_id', 'artist_name', 'release')
}

# user_id and song_id are overwritten by their integer codes.
# NOTE: re-running this cell refits on the already-encoded values, after which
# the original ids can no longer be decoded.
songs['user_id'] = encoders['user_id'].fit_transform(songs['user_id'])
songs['song_id'] = encoders['song_id'].fit_transform(songs['song_id'])

# NOTE(review): the displayed rows include year == 0, which looks like a
# placeholder for an unknown year — confirm before using `year` as a feature.
songs['year'] = pd.to_numeric(songs['year'])

# artist_name and release stay readable; their codes go into new columns.
songs['encoded_artist_name'] = encoders['artist_name'].fit_transform(songs['artist_name'])
songs['encoded_release'] = encoders['release'].fit_transform(songs['release'])

# Backward compatibility: `le` previously ended up fitted on `release`.
le = encoders['release']

songs

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,encoded_artist_name,encoded_release
0,0,0,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,0.458,0.657,-12.02,0.0346,0.933,0.878,0.0842,0.914,102.45,361867,13,7
1,0,1,1,Stronger,Graduation,Kanye West,2007,0.617,0.717,-7.858,0.153,0.00564,0.0,0.408,0.49,103.992,311867,11,9
2,0,2,1,Constellations,In Between Dreams,Jack Johnson,2005,0.705,0.309,-12.596,0.0363,0.476,3.8e-05,0.114,0.426,122.039,201653,6,10
3,0,3,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,0.465,0.919,-4.025,0.0408,1.8e-05,2.1e-05,0.262,0.537,135.997,235293,3,23
4,0,4,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007,0.444,0.889,-3.756,0.105,0.273,0.000278,0.193,0.633,169.907,231307,4,2
5,0,5,1,Paper Gangsta,The Fame Monster,Lady GaGa,2008,0.756,0.841,-5.051,0.0755,0.0496,1e-06,0.238,0.569,99.008,263360,12,21
6,0,6,1,Stacked Actors,There Is Nothing Left To Lose,Foo Fighters,1999,0.502,0.934,-5.145,0.05,0.00203,0.00102,0.234,0.67,134.898,256440,3,23
7,0,7,1,Love Shack,Original Hits - Rock,The B-52's,1989,0.715,0.817,-6.227,0.0532,0.0325,2e-06,0.851,0.871,133.858,321573,15,17
8,0,8,1,Clarity,As/Is: Cleveland/Cincinnati_ OH - 8/03-8/04/04,John Mayer,0,0.666,0.713,-3.551,0.0384,0.172,0.032,0.177,0.491,94.71,271427,9,4
9,0,9,1,The Old Saloon,Incredibad,The Lonely Island,2009,0.592,0.959,-2.383,0.306,0.769,3e-05,0.701,0.859,96.015,65667,16,11


## Count number of unique users in the dataset

### Create an instance of popularity based recommender class

In [None]:
# Instantiate the popularity-based recommender from the local Recommenders module.
# NOTE(review): `train_data` is not defined in any visible cell, and the `songs`
# frame shown earlier has no 'song' column — confirm both exist before this runs.
pm = Recommenders.popularity_recommender_py()
# Arguments: the training frame, then (presumably) the user-id column name and the
# item column name — verify against popularity_recommender_py.create.
pm.create(train_data, 'user_id', 'song')

### Use the popularity model to make some predictions

In [None]:
# Pick the user at position 5 of `users` and ask the popularity model for recommendations.
# NOTE(review): `users` is not defined in any visible cell — confirm where it comes from.
user_id = users[5]
pm.recommend(user_id)

### Quiz 2: Use the popularity based model to make predictions for the following user id (Note the difference in recommendations from the first user id).

In [None]:
# Quiz 2 answer: same popularity model, queried for the user at position 8 of `users`.
user_id = users[8]
pm.recommend(user_id)


## Build a song recommender with personalization

We now create an item similarity based collaborative filtering model that allows us to make personalized recommendations to each user. 

## Class for an item similarity based personalized recommender system (Can be used as a black box)

In [None]:
#Recommenders.item_similarity_recommender_py

### Create an instance of item similarity based recommender class

In [None]:
# Instantiate the item-similarity recommender from the local Recommenders module.
# NOTE(review): relies on `train_data` and a 'song' column, neither of which is
# created in any visible cell — confirm they exist before this runs.
is_model = Recommenders.item_similarity_recommender_py()
# Same argument order as the popularity model above: training frame, then
# (presumably) the user-id and item column names.
is_model.create(train_data, 'user_id', 'song')

### Use the personalized model to make some song recommendations

In [None]:
# User whose training history and personalized recommendations are shown.
user_id = users[5]
user_items = is_model.get_user_items(user_id)

# Horizontal rules that frame the two printed sections.
history_rule = "------------------------------------------------------------------------------------"
progress_rule = "----------------------------------------------------------------------"

# Section 1: the items the model returns for this user from the training data.
print(history_rule)
print("Training data songs for the user userid: %s:" % user_id)
print(history_rule)

for user_item in user_items:
    print(user_item)

# Section 2: banner printed before the recommender runs.
print(progress_rule)
print("Recommendation process going on:")
print(progress_rule)

# Last expression of the cell: the personalized recommendations are rendered as output.
is_model.recommend(user_id)

### Quiz 3. Use the personalized model to make recommendations for the following user id. (Note the difference in recommendations from the first user id.)

In [None]:
# Quiz 3 answer: repeat the personalized walkthrough for the user at position 7.
user_id = users[7]
user_items = is_model.get_user_items(user_id)

# Horizontal rules that frame the two printed sections.
history_rule = "------------------------------------------------------------------------------------"
progress_rule = "----------------------------------------------------------------------"

# Section 1: the items the model returns for this user from the training data.
print(history_rule)
print("Training data songs for the user userid: %s:" % user_id)
print(history_rule)

for user_item in user_items:
    print(user_item)

# Section 2: banner printed before the recommender runs.
print(progress_rule)
print("Recommendation process going on:")
print(progress_rule)

# Last expression of the cell: the personalized recommendations are rendered as output.
is_model.recommend(user_id)


### We can also apply the model to find similar songs to any song in the dataset

In [None]:
# Ask the item-similarity model for songs similar to a given one.
# The song is passed inside a list and appears to be identified by a
# "<title> - <artist>" string — verify the format against the 'song' column.
is_model.get_similar_items(['U Smile - Justin Bieber'])

### Quiz 4. Use the personalized recommender model to get similar songs for the following song.

In [None]:
# Song to look up; the string appears to follow the "<title> - <artist>" format.
song = 'Yellow - Coldplay'
# Quiz 4 answer: the model takes a list of songs, so wrap the single name in a list.
is_model.get_similar_items([song])

# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

## Class to calculate precision and recall (This can be used as a black box)

In [None]:
#Evaluation.precision_recall_calculator

## Use the above precision recall calculator class to calculate the evaluation measures

In [None]:
import time
import Evaluation

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump when the system
# clock is adjusted, which would distort the reported duration.
start = time.perf_counter()

# Share of users to use for the precision/recall calculation
# (0.05 presumably means 5 % — verify against calculate_measures).
user_sample = 0.05

# Instantiate the calculator with the test split, the training split and the
# two models being compared (popularity model first, item-similarity second).
pr = Evaluation.precision_recall_calculator(test_data, train_data, pm, is_model)

# Compute the averaged precision and recall lists for both models.
(pm_avg_precision_list, pm_avg_recall_list,
 ism_avg_precision_list, ism_avg_recall_list) = pr.calculate_measures(user_sample)

end = time.perf_counter()

# Elapsed time of the evaluation, in seconds.
print(end - start)

## Code to plot precision recall curve

In [None]:
import pylab as pl

#Method to generate precision and recall curve
def plot_precision_recall(m1_precision_list, m1_recall_list, m1_label,
                          xlim=(0.0, 0.10), ylim=(0.0, 0.10)):
    """Draw one precision-recall curve on a freshly cleared figure and show it.

    Args:
        m1_precision_list: Precision values, plotted on the y-axis.
        m1_recall_list: Recall values, plotted on the x-axis.
        m1_label: Legend label for the curve.
        xlim: (low, high) range of the recall axis. Defaults to (0.0, 0.10),
            the window that used to be hard-coded.
        ylim: (low, high) range of the precision axis. Defaults to (0.0, 0.10),
            the window that used to be hard-coded.

    Returns:
        None. The figure is displayed as a side effect.
    """
    # Start from an empty figure, so every call produces its own plot.
    pl.clf()
    pl.plot(m1_recall_list, m1_precision_list, label=m1_label)
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim(list(ylim))
    pl.xlim(list(xlim))
    pl.title('Precision-Recall curve')
    # loc=9 is 'upper center'; anchoring that point at y = -0.2 in axes
    # coordinates places the legend below the plot area.
    pl.legend(loc=9, bbox_to_anchor=(0.5, -0.2))
    pl.show()


In [None]:
print("Plotting precision recall curves.")

# One figure per model: (precision values, recall values, legend label).
for precision_values, recall_values, model_label in (
    (pm_avg_precision_list, pm_avg_recall_list, "popularity_model"),
    (ism_avg_precision_list, ism_avg_recall_list, "item_similarity_model"),
):
    plot_precision_recall(precision_values, recall_values, model_label)
