In [48]:
# import the usual
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import hashlib 
import datetime
import surprise
from surprise import SVD
%matplotlib inline
pd.set_option('display.max_columns', 500)

### LastFm Data Loading

In [2]:
%%time
df = pd.read_csv("../../Dataset/lastfm/userid-timestamp-artid-artname-traid-traname.tsv", sep="\t", error_bad_lines=False, header = None)

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


In [3]:
df.columns = ['userid', 'timestamp', 'artistid', 'artist', '1', 'song']

In [4]:
df.head()

Unnamed: 0,userid,timestamp,artistid,artist,1,song
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [5]:
df['time'] = pd.to_datetime(df['timestamp'])
df.drop(['timestamp'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,userid,artistid,artist,1,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57+00:00
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10+00:00
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15),2009-05-04 13:52:04+00:00
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15),2009-05-04 13:42:52+00:00
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15),2009-05-04 13:42:11+00:00


### Removing artist name and song id (possibly) from the user_songs_df dataframe

In [7]:
user_songs_df = df.drop(['artist'], axis=1, inplace=False)
user_songs_df.columns = ['userid', 'artistid', 'songid', 'song', 'time']
user_songs_df = user_songs_df.drop(['songid'], axis=1, inplace=False)

In [8]:
user_songs_df.head()

Unnamed: 0,userid,artistid,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57+00:00
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10+00:00
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc2 (Live_2009_4_15),2009-05-04 13:52:04+00:00
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Hibari (Live_2009_4_15),2009-05-04 13:42:52+00:00
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc1 (Live_2009_4_15),2009-05-04 13:42:11+00:00


In [9]:
print('Number of songs : ' + str(user_songs_df['song'].nunique()))

Number of songs : 1083471


In [10]:
def get_unique_count(column):
    return len(set(column))

### Filtering songs heard by at least 10 users

In [11]:
df_songs_heard_10_users = user_songs_df.groupby("song").filter(lambda x: get_unique_count(x['userid'])>10)

In [12]:
df_songs_heard_10_users.head()

Unnamed: 0,userid,artistid,song,time
17,user_000001,463a94f1-2713-40b1-9c88-dcc9c0170cae,Elysian Fields,2009-05-03 15:10:18+00:00
18,user_000001,ad0811ea-e213-451d-b22f-fa1a7f9e0226,Planetary Deadlock,2009-05-03 15:04:31+00:00
20,user_000001,6f3d4a7b-45b2-4c08-9306-8d271e92cb4f,Deadly Species,2009-05-03 14:50:51+00:00
21,user_000001,463a94f1-2713-40b1-9c88-dcc9c0170cae,Cold Fusion,2009-05-03 14:46:29+00:00
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00


In [15]:
print('Number of songs heard by at least 10 users : ' + str(df_songs_heard_10_users['song'].nunique()))
print('Number of users in this dataframe : ' + str(df_songs_heard_10_users['userid'].nunique()))

Number of songs heard by at least 10 users : 72545
Number of users in this dataframe : 992


### Filtering songs heard by at least 100 users

In [16]:
df_songs_heard_100_users = user_songs_df.groupby("song").filter(lambda x: get_unique_count(x['userid'])>100)

In [17]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00


In [18]:
print('Number of songs heard by at least 100 users : ' + str(df_songs_heard_100_users['song'].nunique()))
print('Number of users in this dataframe : ' + str(df_songs_heard_100_users['userid'].nunique()))

Number of songs heard by at least 100 users : 4034
Number of users in this dataframe : 990


### Creation of timeslots

In [19]:
df_songs_heard_100_users['hour'] = df_songs_heard_100_users['time'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15


In [21]:
#function to return slot number
def slot(hour):
    if 0 <= hour and hour <= 5:
        return 1
    elif 6 <= hour and hour <= 11:
        return 2
    elif 12 <= hour and hour <= 17:
        return 3
    else:
        return 4

In [22]:
%%time
df_songs_heard_100_users['slot'] = df_songs_heard_100_users['hour'].apply( lambda x : slot(x) ) 

CPU times: user 1.9 s, sys: 76 ms, total: 1.98 s
Wall time: 1.6 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour,slot
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14,3
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13,3
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13,3
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15,3


In [24]:
df_songs_heard_100_users.shape

(3922101, 6)

#### Creating song ids

In [286]:
df_songs_heard_100_users['songid'] = df_songs_heard_100_users.groupby(['song']).ngroup().add(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [287]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour,slot,songid
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3,619
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40+00:00,14,3,2896
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25+00:00,13,3,1099
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46+00:00,13,3,3565
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46+00:00,15,3,3061


In [289]:
df_songs_heard_100_users[df_songs_heard_100_users['song'] == 'Clouds']

Unnamed: 0,userid,artistid,song,time,hour,slot,songid
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20+00:00,14,3,619
212,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-28 13:18:42+00:00,13,3,619
264,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-26 18:37:07+00:00,18,4,619
280,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-26 17:07:08+00:00,17,3,619
1067,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-04-11 15:53:03+00:00,15,3,619
...,...,...,...,...,...,...,...
18990426,user_000995,1dcc8968-f2cd-441c-beda-6270f70f2863,Clouds,2006-08-06 10:32:53+00:00,10,2,619
19012117,user_000997,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-31 02:35:11+00:00,2,1,619
19012489,user_000997,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-30 00:31:13+00:00,0,1,619
19012885,user_000997,24ed5b09-02b1-47fe-bd83-6fa5270039b0,Clouds,2007-01-27 17:27:21+00:00,17,3,619


In [290]:
# Removing artistid, time and hour columns
input_dataset = df_songs_heard_100_users.drop(['artistid', 'song', 'time','hour'], axis=1, inplace=False)

In [292]:
input_dataset = input_dataset[['userid', 'songid', 'slot']]

In [293]:
input_dataset.head()

Unnamed: 0,userid,songid,slot
22,user_000001,619,3
26,user_000001,2896,3
28,user_000001,1099,3
30,user_000001,3565,3
39,user_000001,3061,3


### Generating training and testing data

In [294]:
#Group by users and get the size of each group
user_record_counts = input_dataset.groupby("userid").apply(lambda x: x.shape[0])

In [295]:
#Divide dataset into train and test based on users and the number of the songs listened
train_set_keys = []
test_set_keys = []
sum_val = 0
total = input_dataset.shape[0]
for key, value in user_record_counts.items():
    sum_val += value
    fraction = sum_val/total
    if fraction <= 0.8:
        train_set_keys.append(key)        
    else:
        test_set_keys.append(key)

In [296]:
print("Number of users in train set : "+str(len(train_set_keys)))
print("Number of users in test set : "+str(len(test_set_keys)))

Number of users in train set : 792
Number of users in test set : 198


In [298]:
train_set = input_dataset.groupby("userid").filter(lambda x: x['userid'].iloc[0] in train_set_keys)
test_set = input_dataset.groupby("userid").filter(lambda x: x['userid'].iloc[0] in test_set_keys)

In [299]:
print("Number of users in train set : "+str(train_set['userid'].nunique()))
print("Number of users in test set : "+str(test_set['userid'].nunique()))

Number of users in train set : 792
Number of users in test set : 198


In [300]:
print('Fraction of train set : ' + str(train_set.shape[0]/total))

Fraction of train set : 0.7984888711432979


In [306]:
train_set.head()

Unnamed: 0,userid,songid,slot
22,user_000001,619,3
26,user_000001,2896,3
28,user_000001,1099,3
30,user_000001,3565,3
39,user_000001,3061,3


### Creating train dataframes based on slots

In [321]:
grouped_train_set = train_set.groupby('slot')
grouped_test_set = test_set.groupby('slot')

In [322]:
train_first_slot_df = grouped_train_set.get_group(1)
train_second_slot_df = grouped_train_set.get_group(2)
train_third_slot_df = grouped_train_set.get_group(3)
train_fourth_slot_df = grouped_train_set.get_group(4)

test_first_slot_df = grouped_test_set.get_group(1)
test_second_slot_df = grouped_test_set.get_group(2)
test_third_slot_df = grouped_test_set.get_group(3)
test_fourth_slot_df = grouped_test_set.get_group(4)

In [323]:
# train_first_slot_df

In [324]:
print('First train slot shape : ' + str(train_first_slot_df.shape))
print('Second train slot shape : ' + str(train_second_slot_df.shape))
print('Third train slot shape : ' + str(train_third_slot_df.shape))
print('Fourth train slot shape : ' + str(train_fourth_slot_df.shape))
print('')
print("Number of users in train_first_slot_df : " + str(train_first_slot_df['userid'].nunique()))
print("Number of users in train_second_slot_df : " + str(train_second_slot_df['userid'].nunique()))
print("Number of users in train_third_slot_df : " + str(train_third_slot_df['userid'].nunique()))
print("Number of users in train_fourth_slot_df : " + str(train_fourth_slot_df['userid'].nunique()))
print('')
print("Number of songs in train_first_slot_df : " + str(train_first_slot_df['songid'].nunique()))
print("Number of songs in train_second_slot_df : " + str(train_second_slot_df['songid'].nunique()))
print("Number of songs in train_third_slot_df : " + str(train_third_slot_df['songid'].nunique()))
print("Number of songs in train_fourth_slot_df : " + str(train_fourth_slot_df['songid'].nunique()))
print('')
print('First test slot shape : ' + str(test_first_slot_df.shape))
print('Second test slot shape : ' + str(test_second_slot_df.shape))
print('Third test slot shape : ' + str(test_third_slot_df.shape))
print('Fourth test slot shape : ' + str(test_fourth_slot_df.shape))
print('')
print("Number of users in test_first_slot_df : " + str(test_first_slot_df['userid'].nunique()))
print("Number of users in test_second_slot_df : " + str(test_second_slot_df['userid'].nunique()))
print("Number of users in test_third_slot_df : " + str(test_third_slot_df['userid'].nunique()))
print("Number of users in test_fourth_slot_df : " + str(test_fourth_slot_df['userid'].nunique()))
print('')
print("Number of songs in test_first_slot_df : " + str(test_first_slot_df['songid'].nunique()))
print("Number of songs in test_second_slot_df : " + str(test_second_slot_df['songid'].nunique()))
print("Number of songs in test_third_slot_df : " + str(test_third_slot_df['songid'].nunique()))
print("Number of songs in test_fourth_slot_df : " + str(test_fourth_slot_df['songid'].nunique()))


First train slot shape : (596866, 3)
Second train slot shape : (544262, 3)
Third train slot shape : (943222, 3)
Fourth train slot shape : (1047404, 3)

Number of users in train_first_slot_df : 737
Number of users in train_second_slot_df : 762
Number of users in train_third_slot_df : 784
Number of users in train_fourth_slot_df : 780

Number of songs in train_first_slot_df : 4034
Number of songs in train_second_slot_df : 4034
Number of songs in train_third_slot_df : 4034
Number of songs in train_fourth_slot_df : 4034

First test slot shape : (173856, 3)
Second test slot shape : (146390, 3)
Third test slot shape : (221677, 3)
Fourth test slot shape : (248424, 3)

Number of users in test_first_slot_df : 185
Number of users in test_second_slot_df : 188
Number of users in test_third_slot_df : 191
Number of users in test_fourth_slot_df : 193

Number of songs in test_first_slot_df : 4034
Number of songs in test_second_slot_df : 4034
Number of songs in test_third_slot_df : 4034
Number of songs 

In [325]:
# Get user-song-count dataframe for each slot
train_user_song_count_df_first = train_first_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
train_user_song_count_df_second = train_second_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
train_user_song_count_df_third = train_third_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
train_user_song_count_df_fourth = train_fourth_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")

test_user_song_count_df_first = test_first_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
test_user_song_count_df_second = test_second_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
test_user_song_count_df_third = test_third_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
test_user_song_count_df_fourth = test_fourth_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")

## *** Use these dataframes ***

In [None]:
train_user_song_count_df_first
train_user_song_count_df_second
train_user_song_count_df_third
train_user_song_count_df_fourth

test_user_song_count_df_first
test_user_song_count_df_second
test_user_song_count_df_third
test_user_song_count_df_fourth

# --------IGNORE THIS!!!--------

### First slot train and test dataset

In [326]:
train_user_song_count_df_first.head()

Unnamed: 0,userid,songid,count
0,user_000001,10,2
1,user_000001,27,4
2,user_000001,43,1
3,user_000001,91,1
4,user_000001,137,2


In [327]:
test_user_song_count_df_first.head()

Unnamed: 0,userid,songid,count
0,user_000802,2,1
1,user_000802,5,2
2,user_000802,14,2
3,user_000802,51,2
4,user_000802,64,1


In [328]:
# Get the max count for train and test set
train_max_count = train_user_song_count_df_first['count'].max()
test_max_count = test_user_song_count_df_first['count'].max()

In [329]:
print('train_max_count : ' + str(train_max_count))
print('test_max_count : ' + str(test_max_count))

train_max_count : 811
test_max_count : 431


In [330]:
test_user_song_count_df_first[test_user_song_count_df_first['userid'] == 'user_000802']

Unnamed: 0,userid,songid,count
0,user_000802,2,1
1,user_000802,5,2
2,user_000802,14,2
3,user_000802,51,2
4,user_000802,64,1
...,...,...,...
273,user_000802,3900,2
274,user_000802,3912,2
275,user_000802,3981,3
276,user_000802,4030,2


In [331]:
train_user_song_count_df_first.max()

userid    user_000801
songid           4034
count             811
dtype: object

## Surprise IGNORE THIS!

In [95]:
train_user_song_count_df_first.head()

Unnamed: 0,userid,song,count
0,user_000001,15 Step,2
1,user_000001,49 Percent,4
2,user_000001,A Moment Of Clarity,1
3,user_000001,Alice,1
4,user_000001,Always,2


In [109]:
train_user_song_count_df_first['songid'] = train_user_song_count_df_first.groupby(['song']).ngroup().add(1)

In [112]:
train_user_song_count_df_first[train_user_song_count_df_first['song'] == '15 Step']

Unnamed: 0,userid,song,count,songid
0,user_000001,15 Step,2,10
1508,user_000004,15 Step,9,10
4818,user_000019,15 Step,1,10
5437,user_000021,15 Step,2,10
7630,user_000026,15 Step,9,10
...,...,...,...,...
183273,user_000771,15 Step,3,10
185864,user_000779,15 Step,22,10
189561,user_000789,15 Step,4,10
191886,user_000791,15 Step,5,10


In [105]:
# train_user_song_count_df_first.drop(['songid'], axis=1, inplace=True)

In [115]:
training_data = train_user_song_count_df_first.drop(['song'], axis=1, inplace=False)

In [116]:
training_data.head()

Unnamed: 0,userid,count,songid
0,user_000001,2,10
1,user_000001,4,27
2,user_000001,1,43
3,user_000001,1,91
4,user_000001,2,137


In [117]:
training_data = training_data[['userid', 'songid', 'count']]

In [118]:
training_data.head()

Unnamed: 0,userid,songid,count
0,user_000001,10,2
1,user_000001,27,4
2,user_000001,43,1
3,user_000001,91,1
4,user_000001,137,2


In [119]:
test_user_song_count_df_first['songid'] = test_user_song_count_df_first.groupby(['song']).ngroup().add(1)

In [120]:
test_user_song_count_df_first[test_user_song_count_df_first['song'] == '15 Step']

Unnamed: 0,userid,song,count,songid
501,user_000804,15 Step,1,10
1751,user_000811,15 Step,20,10
2706,user_000820,15 Step,1,10
3933,user_000826,15 Step,1,10
4685,user_000829,15 Step,11,10
6480,user_000833,15 Step,1,10
7217,user_000835,15 Step,1,10
8263,user_000837,15 Step,9,10
10208,user_000839,15 Step,1,10
13244,user_000857,15 Step,1,10


In [121]:
testing_data = test_user_song_count_df_first.drop(['song'], axis=1, inplace=False)

In [122]:
testing_data.head()

Unnamed: 0,userid,count,songid
0,user_000802,1,2
1,user_000802,2,5
2,user_000802,2,14
3,user_000802,2,51
4,user_000802,1,64


In [123]:
testing_data = testing_data[['userid', 'songid', 'count']]

In [124]:
testing_data.head()

Unnamed: 0,userid,songid,count
0,user_000802,2,1
1,user_000802,5,2
2,user_000802,14,2
3,user_000802,51,2
4,user_000802,64,1


In [167]:
testing_data['count'] = testing_data['count'].apply(lambda x: x/testing_data['count'].max())

In [168]:
testing_data.head()

Unnamed: 0,userid,songid,count
0,user_000802,2,0.00232
1,user_000802,5,0.00464
2,user_000802,14,0.00464
3,user_000802,51,0.00464
4,user_000802,64,0.00232


In [170]:
training_data['count'] = training_data['count'].apply(lambda x: x/training_data['count'].max())

In [171]:
training_data.head()

Unnamed: 0,userid,songid,count
0,user_000001,10,0.002466
1,user_000001,27,0.004932
2,user_000001,43,0.001233
3,user_000001,91,0.001233
4,user_000001,137,0.002466


In [172]:
training_data['count'].max()

1.0

In [179]:
# Use the famous SVD algorithm.
algo = SVD()
reader = surprise.reader.Reader(sep=',',rating_scale=(0, 1))

In [180]:
train_data = surprise.Dataset.load_from_df(training_data[['userid', 'songid', 'count']], reader)

In [181]:
#Use entire dataset as training data
trainset = train_data.build_full_trainset()

In [182]:
# Training
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a568de550>

In [183]:
test_data = testing_data

In [186]:
# predict using test data
results = []
output = pd.DataFrame(columns = ['userid', 'songid','score'])
for index, row in test_data.iterrows():
    prediction = algo.predict(row['userid'], row['songid'], verbose=False)
    predicted_rating = prediction.est
    output = output.append(pd.Series([str(row['userid']), str(row['songid']), predicted_rating], index=output.columns), ignore_index=True)

In [243]:
output.head()

Unnamed: 0,userid,songid,score
0,user_000802,2,0.000308
1,user_000802,5,0.004344
2,user_000802,14,0.005623
3,user_000802,51,0.003817
4,user_000802,64,0.002666


In [226]:
output.head()

Unnamed: 0,userid,songid,score
0,user_000802,2,0.000308
1,user_000802,5,0.004344
2,user_000802,14,0.005623
3,user_000802,51,0.003817
4,user_000802,64,0.002666


In [63]:
test_data.shape

(51240, 3)

In [None]:
df.nlargest(5, ['Age']) 

In [283]:
output_songs = set(np.array(output[output['userid']=='user_000802'].nlargest(30, ['score'])['songid']))
testing_songs = set(np.array(testing_data[testing_data['userid']=='user_000802'].nlargest(20, ['count'])['songid'].astype('int')))

In [284]:
a = map(str, list(testing_songs))

In [285]:
len(set(list(a)) & set(list(output_songs)))

1

In [259]:
list(a)

[]

In [245]:
list(output_songs)

['2173', '2338', '2607', '239', '779', '1680', '231', '2984', '1276', '3712']

In [239]:
testing_data[testing_data['userid'] == 'user_000802'].nlargest(50, ['count']) 

Unnamed: 0,userid,songid,count
60,user_000802,888,0.074246
211,user_000802,2974,0.064965
114,user_000802,1640,0.060325
196,user_000802,2813,0.060325
205,user_000802,2888,0.060325
51,user_000802,779,0.055684
178,user_000802,2670,0.055684
6,user_000802,81,0.053364
30,user_000802,480,0.048724
9,user_000802,163,0.044084


In [197]:
set([1,2,3,4]) & set([1,2,5,6])

{1, 2}