In [1]:
# import the usual
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import hashlib 
import datetime
import surprise
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
from surprise import SVD
from sklearn import preprocessing
%matplotlib inline
pd.set_option('display.max_columns', 500)

Using TensorFlow backend.


### LastFm Data Loading

In [2]:
%%time
df = pd.read_csv("../../../Dataset/lastfm/userid-timestamp-artid-artname-traid-traname.tsv", sep="\t", error_bad_lines=False, header = None)

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


CPU times: user 31.1 s, sys: 4.4 s, total: 35.5 s
Wall time: 36.6 s


In [3]:
df.columns = ['userid', 'timestamp', 'artistid', 'artist', '1', 'song']

In [4]:
df.head()

Unnamed: 0,userid,timestamp,artistid,artist,1,song
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [5]:
df['time'] = pd.to_datetime(df['timestamp'])
df.drop(['timestamp'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,userid,artistid,artist,1,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57+00:00
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10+00:00
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15),2009-05-04 13:52:04+00:00
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15),2009-05-04 13:42:52+00:00
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15),2009-05-04 13:42:11+00:00


### Removing artist name and song id (possibly) from the user_songs_df dataframe

In [7]:
user_songs_df = df.drop(['artist'], axis=1, inplace=False)
user_songs_df.columns = ['userid', 'artistid', 'songid', 'song', 'time']
user_songs_df = user_songs_df.drop(['songid'], axis=1, inplace=False)

In [8]:
user_songs_df.head()

Unnamed: 0,userid,artistid,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57+00:00
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10+00:00
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc2 (Live_2009_4_15),2009-05-04 13:52:04+00:00
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Hibari (Live_2009_4_15),2009-05-04 13:42:52+00:00
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc1 (Live_2009_4_15),2009-05-04 13:42:11+00:00


In [9]:
print('Number of songs : ' + str(user_songs_df['song'].nunique()))

Number of songs : 1083471


In [10]:
def get_unique_count(column):
    return len(set(column))

### Filtering songs heard by at least 10 users

In [11]:
# df_songs_heard_10_users = user_songs_df.groupby("song").filter(lambda x: get_unique_count(x['userid'])>10)

In [12]:
# df_songs_heard_10_users.head()

In [13]:
# print('Number of songs heard by at least 10 users : ' + str(df_songs_heard_10_users['song'].nunique()))
# print('Number of users in this dataframe : ' + str(df_songs_heard_10_users['userid'].nunique()))

### Filtering songs heard by at least 100 users

In [14]:
df_songs_heard_100_users = user_songs_df.groupby("song").filter(lambda x: get_unique_count(x['userid'])>100)

In [15]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00


In [16]:
print('Number of songs heard by at least 100 users : ' + str(df_songs_heard_100_users['song'].nunique()))
print('Number of users in this dataframe : ' + str(df_songs_heard_100_users['userid'].nunique()))

Number of songs heard by at least 100 users : 4034
Number of users in this dataframe : 990


### Creation of timeslots

In [17]:
df_songs_heard_100_users['hour'] = df_songs_heard_100_users['time'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15


In [19]:
#function to return slot number
def slot(hour):
    if 0 <= hour and hour <= 5:
        return 1
    elif 6 <= hour and hour <= 11:
        return 2
    elif 12 <= hour and hour <= 17:
        return 3
    else:
        return 4

In [20]:
%%time
df_songs_heard_100_users['slot'] = df_songs_heard_100_users['hour'].apply( lambda x : slot(x) ) 

CPU times: user 1.8 s, sys: 87.3 ms, total: 1.89 s
Wall time: 1.72 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour,slot
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14,3
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13,3
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13,3
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15,3


In [22]:
df_songs_heard_100_users.shape

(3922101, 6)

#### Creating song ids

In [23]:
df_songs_heard_100_users['songid'] = df_songs_heard_100_users.groupby(['song']).ngroup().add(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
df_songs_heard_100_users['userid'] = df_songs_heard_100_users.groupby(['userid']).ngroup().add(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour,slot,songid
22,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3,619
26,1,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14,3,2896
28,1,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13,3,1099
30,1,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13,3,3565
39,1,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15,3,3061


In [26]:
df_songs_heard_100_users[df_songs_heard_100_users['song'] == 'Clouds']

Unnamed: 0,userid,artistid,song,time,hour,slot,songid
22,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3,619
212,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-28 13:18:42+00:00,13,3,619
264,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-26 18:37:07+00:00,18,4,619
280,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-26 17:07:08+00:00,17,3,619
1067,1,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-11 15:53:03+00:00,15,3,619
...,...,...,...,...,...,...,...
18990426,985,1dcc8968-f2cd-441c-beda-6270f70f2863,Clouds,2006-08-06 10:32:53+00:00,10,2,619
19012117,987,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-31 02:35:11+00:00,2,1,619
19012489,987,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-30 00:31:13+00:00,0,1,619
19012885,987,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-27 17:27:21+00:00,17,3,619


In [27]:
# Removing artistid, time and hour columns
input_dataset = df_songs_heard_100_users.drop(['artistid', 'song', 'time','hour'], axis=1, inplace=False)

In [28]:
input_dataset = input_dataset[['userid', 'songid', 'slot']]

In [29]:
input_dataset.head()

Unnamed: 0,userid,songid,slot
22,1,619,3
26,1,2896,3
28,1,1099,3
30,1,3565,3
39,1,3061,3


In [30]:
# input_dataset.head().plot()

In [34]:
input_dataset = input_dataset.groupby(["userid","songid"]).size().reset_index(name="count")

In [35]:
input_dataset.head()

Unnamed: 0,userid,songid,count
0,1,4,1
1,1,10,24
2,1,26,1
3,1,27,5
4,1,30,4


In [36]:
def get_rating(x):
    x['rating'] = pd.cut(x['count'], 5, labels=[1,2,3,4,5], right=True)
    return x

In [37]:
input_dataset = input_dataset.groupby("userid").apply(lambda x : get_rating(x))

In [38]:
input_dataset.head()

Unnamed: 0,userid,songid,count,rating
0,1,4,1,1
1,1,10,24,2
2,1,26,1,1
3,1,27,5,1
4,1,30,4,1


In [39]:
input_dataset.drop(['count'], axis=1, inplace=True)

In [40]:
input_dataset['rating'] = input_dataset['rating'].astype('int')

In [43]:
n_users = input_dataset['userid'].max()
n_songs = len(input_dataset.songid.unique())

In [60]:
input_dataset.head()

Unnamed: 0,userid,songid,rating
0,1,4,1
1,1,10,2
2,1,26,1
3,1,27,1
4,1,30,1


In [95]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the movielens-100k dataset
# data = Dataset.load_builtin('ml-100k')

# define a cross-validation iterator
kf = KFold(n_splits=10)
reader = surprise.reader.Reader(sep=',',rating_scale=(0, 5))
algo = SVD()
# input_dataset = surprise.Dataset.load_from_df(input_dataset[['userid', 'songid', 'rating']], reader)
for trainset, testset in kf.split(input_dataset):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.4684
RMSE: 0.4537
RMSE: 0.4690
RMSE: 0.4662
RMSE: 0.4637
RMSE: 0.4545
RMSE: 0.4662
RMSE: 0.4679
RMSE: 0.4715
RMSE: 0.4653
