In [2]:
import pandas as pd
import numpy as np
import os
import shutil

In [3]:
# Calendar year of interactions to keep. It is compared against the yearly
# period derived from each timestamp and is embedded in the export file name.
PERIOD_YEAR = '2014'

In [6]:
# Load the full interaction log from the tab-separated dump, then show its
# shape and first rows as a sanity check.
df_interactions = pd.read_csv('../data/userid_trackid_timestamp.tsv', sep='\t')
print(df_interactions.shape)
df_interactions.head()

(50000000, 3)


Unnamed: 0,user_id,track_id,timestamp
0,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:42:38
1,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:38:53
2,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:35:08
3,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:31:23
4,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:27:38


In [9]:
# Restrict the interaction log to PERIOD_YEAR.
# The timestamp strings are parsed once and both period columns are derived
# from that single parsed series; parsing the same column twice is pure
# overhead on a frame of this size and yields exactly the same values.
parsed_timestamps = pd.to_datetime(df_interactions['timestamp'])
df_interactions['year'] = parsed_timestamps.dt.to_period('Y')
df_interactions['month'] = parsed_timestamps.dt.to_period('M')
# The parsed series is only a temporary; drop it so it does not keep a
# full-length datetime column alive in the kernel.
del parsed_timestamps
# Comparing the yearly Period column with the PERIOD_YEAR string keeps only
# the rows that fall in that year.
df_interactions = df_interactions[df_interactions['year'] == PERIOD_YEAR]
print(df_interactions.shape)
df_interactions.head()

(4440666, 5)


Unnamed: 0,user_id,track_id,timestamp,year,month
49534431,17978,3YCjX6VUYmJHiDHs,2014-04-29 15:31:44,2014,2014-04
49591397,24349,2rdKvEqyovlm32gY,2014-04-29 07:55:22,2014,2014-04
49534576,17978,3YCjX6VUYmJHiDHs,2014-04-29 03:49:35,2014,2014-04
49534577,17978,3YCjX6VUYmJHiDHs,2014-04-29 03:36:12,2014,2014-04
49591745,24349,g2CV9mJiWs9bJUsx,2014-04-28 22:03:49,2014,2014-04


In [10]:
# Binarization: convert the interactions to binary implicit feedback. A (user, track) pair counts as a positive interaction only if it occurs at least 2 times, which reduces false-positive interactions.
def filter_interactions(listening_history: pd.DataFrame, min_interactions: int = 2, verbose: bool = True):
    """Keep only the rows whose (user_id, track_id) pair occurs often enough.

    The number of rows per (user_id, track_id) pair is counted, and every row
    belonging to a pair with at least ``min_interactions`` rows is retained.

    Args:
        listening_history: Frame containing at least the columns 'user_id'
            and 'track_id'. It is not modified.
        min_interactions: Minimum number of rows a pair must have to be kept.
        verbose: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A new DataFrame with a default integer index. 'user_id' and
        'track_id' are the leading columns, and rows are grouped by pair in
        the order of the pair counts.
    """
    pair_columns = ['user_id', 'track_id']
    # Series indexed by (user_id, track_id) holding the row count of each pair.
    pair_counts = listening_history.value_counts(subset=pair_columns)
    frequent_pairs = pair_counts[pair_counts >= min_interactions].index
    # Select all rows of the surviving pairs through a temporary pair index,
    # then turn the pair index back into ordinary columns.
    kept_rows = listening_history.set_index(pair_columns).loc[frequent_pairs]
    return kept_rows.reset_index()

def delete_duplicates(listening_history: pd.DataFrame):
    """Reduce the history to a single row per (user_id, track_id) pair.

    For each pair the first row in the frame's current order is kept and the
    remaining rows are dropped. The number of dropped rows is printed.

    Args:
        listening_history: Frame containing at least the columns 'user_id'
            and 'track_id'. It is not modified.

    Returns:
        A new DataFrame with one row per pair; the kept rows retain their
        original index labels.
    """
    rows_before = len(listening_history)
    deduplicated = listening_history.drop_duplicates(
        subset=['user_id', 'track_id'],
        keep='first',
    )
    rows_removed = rows_before - len(deduplicated)
    print(f"Deleted {rows_removed} duplicates")
    return deduplicated

# Binarize the interactions: keep the pairs that occur at least twice, then
# collapse every remaining pair to a single row.
# NOTE: this cell overwrites df_interactions and is therefore not idempotent.
# Re-running it on its own output filters a frame in which every pair occurs
# exactly once, so no pair reaches the threshold; re-run from the load cell.
df_interactions = (
    df_interactions
    .pipe(filter_interactions, min_interactions=2)
    .pipe(delete_duplicates)
)
print(df_interactions.shape)
df_interactions.head()

Deleted 2687295 duplicates
(673432, 5)


Unnamed: 0,user_id,track_id,timestamp,year,month
0,31441,x4hv0ALTsGICpzry,2014-04-09 10:58:14,2014,2014-04
2996,27941,InE2v5bxFNZ6Mow0,2014-03-23 12:07:38,2014,2014-03
3831,33179,FdlV9QADnS2vD3jE,2014-04-14 15:35:43,2014,2014-04
4654,44706,XIFUDCiXv38XEWiX,2014-03-10 07:17:18,2014,2014-03
5383,68443,0zYSX03Sqv8YHEwv,2014-04-03 18:24:29,2014,2014-04


In [11]:
# Sparsity of the user-item matrix: the share of (user, track) cells without
# an interaction. The row count equals the number of filled cells only if each
# (user_id, track_id) pair appears at most once in df_interactions.
n_interactions = len(df_interactions)
n_users, n_items = (
    df_interactions[column].nunique() for column in ('user_id', 'track_id')
)
sparsity = 1 - n_interactions / (n_users * n_items)
sparsity

0.9988016329560717

In [12]:
# Number of distinct track_id values in the filtered interactions.
n_items

33148

In [13]:
# Number of distinct user_id values in the filtered interactions.
n_users

16953

In [14]:
# Export the filtered interactions for PERIOD_YEAR as a tab-separated file.
# The DataFrame index is omitted; all columns currently in df_interactions
# are written.
df_interactions.to_csv(f'../data/userid_trackid_timestamp_{PERIOD_YEAR}.tsv', sep='\t', index=False)