In [10]:
import pandas as pd
import numpy as np
import os
import shutil

In [11]:
PERIOD_YEAR = '2019'

In [12]:
df_interactions = pd.read_csv('../data/userid_trackid_timestamp.tsv', sep='\t')
print(df_interactions.shape)
df_interactions.head()

(252984396, 3)


Unnamed: 0,user_id,track_id,timestamp
0,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:42:38
1,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:38:53
2,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:35:08
3,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:31:23
4,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:27:38


In [13]:
df_interactions['year'] = pd.to_datetime(df_interactions['timestamp']).dt.to_period('Y')
df_interactions['month'] = pd.to_datetime(df_interactions['timestamp']).dt.to_period('M')
df_interactions = df_interactions[df_interactions['year'] == PERIOD_YEAR]
print(df_interactions.shape)
df_interactions.head()

(16798748, 5)


Unnamed: 0,user_id,track_id,timestamp,year,month
134640783,5767,ETjDC3M3BNzL3vbX,2019-12-21 11:08:59,2019,2019-12
134640784,5767,Tj7tHraP3IJiu8lS,2019-12-15 17:32:53,2019,2019-12
134640785,5767,9AT7OkPyGi0PwkLh,2019-12-11 19:15:56,2019,2019-12
134640786,5767,VllNteG3w18TMKYP,2019-12-10 18:58:50,2019,2019-12
134640787,5767,IwT5KH3YVXEx3lY5,2019-12-07 14:29:01,2019,2019-12


In [14]:
# binarization: we convert the interactions to binary implicit feedback with a threshold of 2 on the interaction counts (reducing false-positive interactions)
def filter_interactions_binary(listening_history: pd.DataFrame, min_interactions: int = 2, verbose: bool = True):
    lhs_count = listening_history.value_counts(subset=['user_id', 'track_id'])
    lhs_count = lhs_count[lhs_count >= min_interactions]
    listening_history = listening_history.set_index(['user_id', 'track_id']).loc[lhs_count.index]
    listening_history = listening_history.reset_index()
    return listening_history

# filter interactions with a minimum time interval between them (alternative to binarization)
def filter_interactions_interval(listening_history: pd.DataFrame, min_interaction_interval=30):
    listening_history = listening_history.sort_values(by=['user_id', 'timestamp'])
    listening_history['timestamp'] = pd.to_datetime(listening_history['timestamp'])
    listening_history['time_diff'] = listening_history.groupby('user_id')['timestamp'].diff().dt.total_seconds().fillna(0)
    listening_history = listening_history[listening_history['time_diff'] >= min_interaction_interval]
    listening_history = listening_history.drop(columns=['time_diff'])
    return listening_history

def delete_duplicates(listening_history: pd.DataFrame):
    initial_shape = listening_history.shape
    listening_history = listening_history.drop_duplicates(subset=['user_id', 'track_id'], keep='first')
    #print(f"Deleted {initial_shape[0] - listening_history.shape[0]} duplicates")
    return listening_history

#df_interactions = filter_interactions_binary(df_interactions, min_interactions=2)
df_interactions = filter_interactions_interval(df_interactions, min_interaction_interval=30)
df_interactions = delete_duplicates(df_interactions)
print(df_interactions.shape)
df_interactions.head()

(5203080, 5)


Unnamed: 0,user_id,track_id,timestamp,year,month
206044628,0,7uwg0JpB7Dos5kOU,2019-01-01 00:33:59,2019,2019-01
206044782,0,ifgfWxyGQX4aZFDb,2019-01-01 00:50:50,2019,2019-01
206044550,0,jMg6CVPGjHDIjPl9,2019-01-01 01:15:17,2019,2019-01
206044781,0,RWXPvXZRrB0lzHkE,2019-01-01 01:49:18,2019,2019-01
206044549,0,7pP62aDPmC7o93y1,2019-01-01 02:19:57,2019,2019-01


In [15]:
# sparsity of dataset
n_users = df_interactions['user_id'].nunique()
n_items = df_interactions['track_id'].nunique()
n_interactions = df_interactions.shape[0]
sparsity = 1 - n_interactions / (n_users * n_items)
sparsity

0.9945170663050219

In [16]:
n_items

56188

In [17]:
n_users

16889

In [18]:
# export
df_interactions.to_csv(f'../data/userid_trackid_timestamp_{PERIOD_YEAR}.tsv', sep='\t', index=False)