In [3]:
import pandas as pd
import numpy as np
import os
import shutil

In [9]:
# Load the raw listening events from a tab-separated file (one row per
# user_id / track_id / timestamp event), then show the frame's shape and
# its first rows.
df_interactions = pd.read_csv('../data/userid_trackid_timestamp.tsv', sep='\t')
print(df_interactions.shape)
df_interactions.head()

(252984396, 3)


Unnamed: 0,user_id,track_id,timestamp
0,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:42:38
1,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:38:53
2,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:35:08
3,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:31:23
4,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:27:38


In [10]:
# Restrict the data to a single calendar year: derive a yearly period from
# each timestamp and keep only the rows that fall in 2018.
# NOTE: the helper 'period' column stays in the frame after filtering.
df_interactions['period'] = pd.to_datetime(df_interactions['timestamp']).dt.to_period('Y')
df_interactions = df_interactions[df_interactions['period'] == '2018']
print(df_interactions.shape)
df_interactions.head()

(17285524, 4)


Unnamed: 0,user_id,track_id,timestamp,period
134635346,11095,VKvF5oSWmxCDL97f,2018-11-27 07:38:29,2018
134635347,11095,VKvF5oSWmxCDL97f,2018-11-21 07:48:46,2018
134635348,11095,8keAhBJumHlc8qe9,2018-11-15 22:53:21,2018
134635349,11095,dreywAZScIcoCASF,2018-11-15 05:10:39,2018
134635350,11095,lJKIbZNzpS6IsNgh,2018-11-15 05:06:06,2018


In [11]:
# binarization: we convert the interactions to binary implicit feedback with a threshold of 2 on the interaction counts (reducing false-positive interactions)
def filter_interactions(listening_history: pd.DataFrame, min_interactions: int = 2, verbose: bool = True):
    """Keep only the events of (user_id, track_id) pairs seen often enough.

    Args:
        listening_history: Event-level frame with at least the columns
            'user_id' and 'track_id'.
        min_interactions: Minimum number of rows a (user_id, track_id) pair
            must have for its rows to be kept.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        A new DataFrame holding every row of the qualifying pairs. Rows are
        grouped by pair in the order produced by ``value_counts`` (most
        frequent pair first), 'user_id' and 'track_id' are the leading
        columns, and the index is a fresh 0..n-1 range (the input index is
        discarded).
    """
    pair_keys = ['user_id', 'track_id']

    # Number of events per (user, track) pair, indexed by the pair itself.
    pair_counts = listening_history.value_counts(subset=pair_keys)
    frequent_pairs = pair_counts[pair_counts >= min_interactions].index

    # Look the surviving pairs up through a (user, track) MultiIndex, then turn
    # the key levels back into ordinary columns.
    return (
        listening_history
        .set_index(pair_keys)
        .loc[frequent_pairs]
        .reset_index()
    )

def delete_duplicates(listening_history: pd.DataFrame):
    """Collapse repeated (user_id, track_id) rows to their first occurrence.

    Prints how many rows were removed.

    Args:
        listening_history: Frame with at least the columns 'user_id' and
            'track_id'.

    Returns:
        A new DataFrame with one row per (user_id, track_id) pair; the
        surviving rows keep their original index labels and order.
    """
    rows_before = len(listening_history)
    deduplicated = listening_history.drop_duplicates(
        subset=['user_id', 'track_id'],
        keep='first',
    )
    removed = rows_before - len(deduplicated)
    print(f"Deleted {removed} duplicates")
    return deduplicated

# Step 1: keep only (user, track) pairs that occur at least twice.
# Step 2: collapse each remaining pair to a single row (its first occurrence),
# leaving one binary interaction per pair.
df_interactions = filter_interactions(df_interactions, min_interactions=2)
df_interactions = delete_duplicates(df_interactions)
print(df_interactions.shape)
df_interactions.head()

Deleted 11936035 duplicates
(2423168, 4)


Unnamed: 0,user_id,track_id,timestamp,period
0,48507,GJBMg2XCb4yuAS4b,2018-08-01 16:41:53,2018
60316,48507,5yz4FuSP9DGYYCPC,2018-09-12 11:45:34,2018
84817,48507,7ELJ2sVO3n3mZHVM,2018-12-21 12:40:09,2018
109056,48507,kg8DbqKcerjETz9V,2018-06-22 19:47:07,2018
126451,5186,IF0X3GfxmyXpWlGl,2018-04-23 23:35:29,2018


In [16]:
# Sparsity of the user-item interaction matrix: the fraction of all possible
# (user, track) cells that hold no interaction, i.e. 1 - density.
n_users = df_interactions['user_id'].nunique()
n_items = df_interactions['track_id'].nunique()
# The row count equals the number of filled cells only if every row is a
# distinct (user, track) pair, which is what delete_duplicates produces.
n_interactions = df_interactions.shape[0]
sparsity = 1 - n_interactions / (n_users * n_items)
sparsity

0.9976734527042088

In [17]:
# Number of distinct track_id values in the filtered interactions.
n_items

54142

In [18]:
# Number of distinct user_id values in the filtered interactions.
n_users

19237

In [19]:
# Export the filtered interactions as a tab-separated file; index=False keeps
# the DataFrame's row index out of the output. All current columns are
# written as-is.
df_interactions.to_csv('../data/userid_trackid_timestamp_onion.tsv', sep='\t', index=False)