# Building a song recommender


# Fire up Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors
from collections import Counter

%matplotlib inline

# Load music data

In [2]:
song_data = pd.read_csv('song_data.csv')

# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [3]:
song_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


In [4]:
song_data.shape

(1116609, 6)

## Verifying missing values

In [5]:
song_data.isnull().sum()

user_id         0
song_id         0
listen_count    0
title           0
artist          0
song            0
dtype: int64

## Showing the most popular songs in the dataset

In [6]:
c = Counter(song_data['song'].values)

In [7]:
c.most_common(10)

[('Sehr kosmisch - Harmonia', 5970),
 ('Undo - Bj\xc3\xb6rk', 5281),
 ("You're The One - Dwight Yoakam", 4806),
 ('Dog Days Are Over (Radio Edit) - Florence + The Machine', 4536),
 ('Revelry - Kings Of Leon', 4339),
 ('Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) - Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner',
  3949),
 ('Secrets - OneRepublic', 3916),
 ('Tive Sim - Cartola', 3185),
 ('Fireflies - Charttraxx Karaoke', 3171),
 ('Hey_ Soul Sister - Train', 3132)]

## Count number of unique users in the dataset

In [8]:
users = song_data['user_id'].unique()

In [9]:
len(users)

66346

## Count number of unique items (songs) in the dataset

In [10]:
songs = song_data['song'].unique()

In [11]:
len(songs)

9952

# Create a song recommender system

### First, build user-item matrix

In [12]:
song = song_data.drop_duplicates(['user_id', 'song', 'title', 'artist'])
song = song.reset_index(drop=True)

In [13]:
print song_data.shape, song.shape

(1116609, 6) (1115985, 6)


### Due to memory limitations, we will just use 100k rows of data

In [14]:
song  = song.iloc[0:100000,]

In [15]:
song.shape

(100000, 6)

In [16]:
num_users   = len(song['user_id'].unique())
num_songs   = len(song['song_id'].unique())
num_artists = len(song['artist'].unique())
print 'We have %s unique users %s unique songs and %s unique artists' % (num_users, num_songs, num_artists)

We have 5905 unique users 9890 unique songs and 3359 unique artists


### Let's create user-item matrix by pivoting the table

In [17]:
song_pivot = song.pivot(index='user_id', columns='song_id', values = 'listen_count')

In [18]:
song_pivot.shape

(5905, 9890)

In [19]:
song_pivot = song_pivot.fillna(0)

## Implement a kNN method to recommend songs

In [20]:
knn = NearestNeighbors(n_neighbors=20, algorithm='brute', metric='cosine')

In [21]:
recsys_model = knn.fit(song_pivot)

### Let's recommend to a target user

In [22]:
song_pivot = song_pivot.reset_index(drop=True)

In [23]:
user_index = recsys_model.kneighbors(song_pivot.iloc[1:])[1][0]

In [24]:
user_index

array([   1, 2690,  600, 1835, 5785,  345, 1437, 4711, 5784,  556,  817,
       4384, 4910, 4213, 3656, 2907, 2060,  211, 1279, 1864])

In [25]:
all_users = song['user_id'].unique()

In [26]:
target = user_index[0]

In [27]:
neighbors = user_index[1:]

In [28]:
relevant_users = all_users[neighbors]

In [29]:
target_user_data = song[song['user_id'].isin(relevant_users)]

In [30]:
recommendation = pd.DataFrame(target_user_data.groupby(['song'])['listen_count'].sum())

In [31]:
recommendation = pd.DataFrame({'count':recommendation['listen_count'],'song':recommendation.index.tolist()})

In [32]:
recommendation = recommendation.sort_values('count', ascending=False)

In [33]:
recommendation = recommendation.reset_index(drop=True)

### The top 10 recommended songs for the target user

In [34]:
print recommendation[0:10]

   count                                               song
0     43                            Rio - Another Sunny Day
1     40                       Strani Amori - Laura Pausini
2     39  Sinisten tähtien alla - J. Karjalainen & Musta...
3     30                              Nothing - Ryan Leslie
4     29                       Ain't Misbehavin - Sam Cooke
5     27               Frisch und g'sund - Die Mooskirchner
6     24             Just Dance - Lady GaGa / Colby O'Donis
7     23                       Représente - Alliance Ethnik
8     21  All I Do Is Win (feat. T-Pain_ Ludacris_ Snoop...
9     20                     Fireflies - Charttraxx Karaoke


# Quizz questions

### Answer 1

In [35]:
kayneWest = song_data.loc[song_data['artist'] == 'Kanye West']
numUsers = kayneWest['user_id'].unique()
len(numUsers)

2522

In [36]:
fooFighters = song_data.loc[song_data['artist'] == 'Foo Fighters']
numUsers = fooFighters['user_id'].unique()
len(numUsers)

2055

In [37]:
taylorSwift = song_data.loc[song_data['artist'] == 'Taylor Swift']
numUsers = taylorSwift['user_id'].unique()
len(numUsers)

3246

In [38]:
ladyGaGa = song_data.loc[song_data['artist'] == 'Lady GaGa']
numUsers = ladyGaGa['user_id'].unique()
len(numUsers)

2928

### Answer 2

In [39]:
#song.head()
mostLessPopular = pd.DataFrame(song.groupby(['artist'])['listen_count'].sum())
mostLessPopular= mostLessPopular.sort_values(['listen_count', 'artist'], ascending=[True, False])
mostLessPopular.head()
#mostLessPopular.tail()

Unnamed: 0_level_0,listen_count
artist,Unnamed: 1_level_1
When The Empire Falls,1
Umphrey's McGee,1
Time Requiem,1
The Distillers,1
Spin Doctors,1


### Answer 3