In [1]:
# import the usual
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import hashlib 
import datetime
# import surprise
# from surprise import SVD
%matplotlib inline
pd.set_option('display.max_columns', 500)

### LastFm Data Loading

In [2]:
%%time
df = pd.read_csv(r"F:\Data_Repository\lastfm\userid-timestamp-artid-artname-traid-traname.tsv", sep="\t", error_bad_lines=False, header = None)

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


Wall time: 28 s


In [3]:
# Label the six header-less columns of the raw frame, in file order.
raw_column_names = ['userid', 'timestamp', 'artistid', 'artist', '1', 'song']
df.columns = raw_column_names

In [4]:
df.head()

Unnamed: 0,userid,timestamp,artistid,artist,1,song
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [5]:
# Swap the raw 'timestamp' strings for a parsed datetime column named 'time':
# pop() removes 'timestamp' from df in place and hands back its values, and the parsed
# result is appended as the last column.
df['time'] = pd.to_datetime(df.pop('timestamp'))

In [6]:
df.head()

Unnamed: 0,userid,artistid,artist,1,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15),2009-05-04 13:52:04
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15),2009-05-04 13:42:52
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15),2009-05-04 13:42:11


### Dropping the artist name and track id columns to build the `user_songs_df` dataframe

In [7]:
# Keep only userid, artistid, song and time: both the artist name and the column
# labelled '1' are removed, and df itself is left untouched.
user_songs_df = df.drop(columns=['artist', '1'])

In [8]:
user_songs_df.head()

Unnamed: 0,userid,artistid,song,time
0,user_000001,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57
1,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10
2,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc2 (Live_2009_4_15),2009-05-04 13:52:04
3,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Hibari (Live_2009_4_15),2009-05-04 13:42:52
4,user_000001,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,Mc1 (Live_2009_4_15),2009-05-04 13:42:11


In [9]:
# Report how many distinct song titles appear in the listening log.
print('Number of songs : ', user_songs_df['song'].nunique(), sep='')

Number of songs : 1083471


In [10]:
def get_unique_count(column):
    """Return how many distinct values the iterable ``column`` contains."""
    distinct_values = {value for value in column}
    return len(distinct_values)

### Filtering songs heard by more than 100 users

In [11]:
# Keep only the plays of songs that were listened to by more than 100 distinct users
# (the comparison is strict: a song needs at least 101 listeners to be kept).
# Counting listeners once per song and then masking with isin() avoids evaluating a
# Python lambda for every song title. Rows without a song title never match and are
# dropped, as before. nunique() ignores missing userids.
listeners_per_song = user_songs_df.groupby("song")["userid"].nunique()
popular_songs = listeners_per_song.index[listeners_per_song > 100]
# .copy() makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df_songs_heard_100_users = user_songs_df[user_songs_df["song"].isin(popular_songs)].copy()

In [12]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46


In [13]:
# Report the size of the filtered data. The label says "more than 100" because the
# filter that builds df_songs_heard_100_users uses a strict `> 100` comparison.
print(f"Number of songs heard by more than 100 users : {df_songs_heard_100_users['song'].nunique()}")
print(f"Number of users in this dataframe : {df_songs_heard_100_users['userid'].nunique()}")

Number of songs heard by at least 100 users : 4034
Number of users in this dataframe : 990


### Creation of timeslots

In [14]:
# Extract the hour of day from each play's timestamp into a new 'hour' column.
# .assign() returns a new frame with the column appended instead of writing into the
# existing one, so pandas has no slice to warn about (no SettingWithCopyWarning).
df_songs_heard_100_users = df_songs_heard_100_users.assign(
    hour=df_songs_heard_100_users['time'].dt.hour
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20,14
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40,14
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25,13
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46,13
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46,15


In [16]:
# Map an hour of the day to one of four six-hour time slots.
def slot(hour):
    """Return the time-slot number for ``hour``.

    Hours 0-5 give 1, 6-11 give 2, 12-17 give 3, and anything else gives 4.
    """
    slot_bounds = ((0, 5), (6, 11), (12, 17))
    for slot_number, (first_hour, last_hour) in enumerate(slot_bounds, start=1):
        if first_hour <= hour <= last_hour:
            return slot_number
    # Everything outside the three ranges above lands in the last slot.
    return 4

In [17]:
%%time
df_songs_heard_100_users['slot'] = df_songs_heard_100_users['hour'].apply( lambda x : slot(x) ) 

Wall time: 1.61 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_songs_heard_100_users.head()

Unnamed: 0,userid,artistid,song,time,hour,slot
22,user_000001,45bdb5be-ec03-484f-b58d-d22afc944b24,Clouds,2009-05-03 14:39:20,14,3
26,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Seven,2009-05-03 14:07:40,14,3
28,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Five,2009-05-03 13:56:25,13,3
30,user_000001,ce559a88-58ba-4d8a-8456-9177412d609c,Three,2009-05-03 13:40:46,13,3
39,user_000001,3d05eb8b-1644-4143-9a61-b28e33c4d85f,Something In The Way,2009-05-02 15:19:46,15,3


In [19]:
df_songs_heard_100_users.shape

(3922101, 6)

#### Creating song ids

In [20]:
# Give every distinct song title an integer id starting at 1 (ngroup() is 0-based,
# hence the +1). .assign() returns a new frame with the 'songid' column appended rather
# than writing into the existing one, which avoids SettingWithCopyWarning.
df_songs_heard_100_users = df_songs_heard_100_users.assign(
    songid=df_songs_heard_100_users.groupby(['song']).ngroup().add(1)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
# Build the model input by discarding the artistid, time and hour columns.
# The song title is kept alongside the remaining columns.
columns_to_discard = ['artistid', 'time', 'hour']
input_dataset = df_songs_heard_100_users.drop(columns=columns_to_discard)

In [22]:
# Narrow the input to the three columns used downstream, in this order.
input_dataset = input_dataset.loc[:, ['userid', 'song', 'slot']]

In [23]:
input_dataset.head()

Unnamed: 0,userid,song,slot
22,user_000001,Clouds,3
26,user_000001,Seven,3
28,user_000001,Five,3
30,user_000001,Three,3
39,user_000001,Something In The Way,3


In [24]:
# The whole input is used as the training set; no test split is made here
# (the test-set filter in the original cell was commented out).
train_set = input_dataset.copy(deep=True)

In [25]:
# Report how many distinct users the training set contains.
print("Number of users in train set : ", train_set['userid'].nunique(), sep='')

Number of users in train set : 990


In [26]:
train_set.head()

Unnamed: 0,userid,song,slot
22,user_000001,Clouds,3
26,user_000001,Seven,3
28,user_000001,Five,3
30,user_000001,Three,3
39,user_000001,Something In The Way,3


### Creating train dataframes based on slots

In [27]:
# Partition the training plays by time slot; the individual groups are pulled out later.
grouped_train_set = train_set.groupby(by='slot')

In [28]:
# Pull the plays of each time slot (1 to 4) out of the grouped training set,
# one dataframe per slot.
(
    train_first_slot_df,
    train_second_slot_df,
    train_third_slot_df,
    train_fourth_slot_df,
) = (grouped_train_set.get_group(slot_number) for slot_number in (1, 2, 3, 4))

In [29]:
# train_first_slot_df

In [30]:
# Summarise the four per-slot training frames: shape, distinct users, distinct songs.
train_slot_frames = (
    train_first_slot_df,
    train_second_slot_df,
    train_third_slot_df,
    train_fourth_slot_df,
)

shape_labels = (
    'First train slot shape : ',
    'Second train slot shape : ',
    'Third train slot shape : ',
    'Fourth train slot shape : ',
)
for shape_label, slot_frame in zip(shape_labels, train_slot_frames):
    print(shape_label, slot_frame.shape, sep='')
print('')

user_labels = (
    "Number of users in train_first_slot_df : ",
    "Number of users in train_second_slot_df : ",
    "Number of users in train_third_slot_df : ",
    "Number of users in train_fourth_slot_df : ",
)
for user_label, slot_frame in zip(user_labels, train_slot_frames):
    print(user_label, slot_frame['userid'].nunique(), sep='')
print('')

song_labels = (
    "Number of songs in train_first_slot_df : ",
    "Number of songs in train_second_slot_df : ",
    "Number of songs in train_third_slot_df : ",
    "Number of songs in train_fourth_slot_df : ",
)
for song_label, slot_frame in zip(song_labels, train_slot_frames):
    print(song_label, slot_frame['song'].nunique(), sep='')
# The equivalent test-set report was commented out in the original cell and is omitted.


First train slot shape : (770722, 3)
Second train slot shape : (690652, 3)
Third train slot shape : (1164899, 3)
Fourth train slot shape : (1295828, 3)

Number of users in train_first_slot_df : 922
Number of users in train_second_slot_df : 950
Number of users in train_third_slot_df : 975
Number of users in train_fourth_slot_df : 973

Number of songs in train_first_slot_df : 4034
Number of songs in train_second_slot_df : 4034
Number of songs in train_third_slot_df : 4034
Number of songs in train_fourth_slot_df : 4034


In [31]:
# Get user-song-count dataframe for each slot
train_user_song_count_df_first = train_first_slot_df.groupby(["userid","song"]).size().reset_index(name="count")
train_user_song_count_df_second = train_second_slot_df.groupby(["userid","song"]).size().reset_index(name="count")
train_user_song_count_df_third = train_third_slot_df.groupby(["userid","song"]).size().reset_index(name="count")
train_user_song_count_df_fourth = train_fourth_slot_df.groupby(["userid","song"]).size().reset_index(name="count")

# test_user_song_count_df_first = test_first_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
# test_user_song_count_df_second = test_second_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
# test_user_song_count_df_third = test_third_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")
# test_user_song_count_df_fourth = test_fourth_slot_df.groupby(["userid","songid"]).size().reset_index(name="count")

In [32]:
train_user_song_count_df_first.head()

Unnamed: 0,userid,song,count
0,user_000001,15 Step,2
1,user_000001,49 Percent,4
2,user_000001,A Moment Of Clarity,1
3,user_000001,Alice,1
4,user_000001,Always,2


In [33]:
# Show the (rows, columns) shape of each per-slot count dataframe, slot 1 to 4.
tuple(
    slot_counts_df.shape
    for slot_counts_df in (
        train_user_song_count_df_first,
        train_user_song_count_df_second,
        train_user_song_count_df_third,
        train_user_song_count_df_fourth,
    )
)

((247852, 3), (258462, 3), (363822, 3), (379568, 3))

## **Use these dataframes** to export as TSV

In [34]:
# Marker cell listing the four per-slot (userid, song, count) dataframes that are
# written to TSV files further down.
# Only the value of the last expression in a cell is displayed, so the first three
# lines have no visible effect and the output shown is the fourth-slot frame.
train_user_song_count_df_first
train_user_song_count_df_second
train_user_song_count_df_third
train_user_song_count_df_fourth

# Test-set counterparts (currently disabled):
# test_user_song_count_df_first
# test_user_song_count_df_second
# test_user_song_count_df_third
# test_user_song_count_df_fourth

Unnamed: 0,userid,song,count
0,user_000001,15 Step,1
1,user_000001,Animals,1
2,user_000001,Around The World,1
3,user_000001,Blue Moon,1
4,user_000001,Bodysnatchers,1
5,user_000001,Brazil,1
6,user_000001,Breathe,1
7,user_000001,City Lights,2
8,user_000001,Clouds,1
9,user_000001,Dreaming,1


In [35]:
# Write each per-slot count dataframe to its own tab-separated file,
# without the index and without a header row.
slot_exports = (
    (train_user_song_count_df_first, r"F:\Data_Repository\lastfm\df_slot1.tsv"),
    (train_user_song_count_df_second, r"F:\Data_Repository\lastfm\df_slot2.tsv"),
    (train_user_song_count_df_third, r"F:\Data_Repository\lastfm\df_slot3.tsv"),
    (train_user_song_count_df_fourth, r"F:\Data_Repository\lastfm\df_slot4.tsv"),
)
for slot_counts_df, tsv_path in slot_exports:
    slot_counts_df.to_csv(tsv_path, sep='\t', index=False, header=False)