In [1]:
import pandas as pd
import numpy as np
import os
import shutil

In [2]:
# Load the full interaction log from a tab-separated file.
df_interactions = pd.read_csv('../data/userid_trackid_timestamp.tsv', sep='\t')
# Report the frame's dimensions, then preview its first rows as the cell output.
print(df_interactions.shape)
df_interactions.head()

(252984396, 3)


Unnamed: 0,user_id,track_id,timestamp
0,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:42:38
1,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:38:53
2,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:35:08
3,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:31:23
4,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:27:38


In [3]:
# Load the table stored in id_emma.tsv (tab-separated) and display it.
df_emma = pd.read_csv('../data/id_emma.tsv', sep='\t')
df_emma

Unnamed: 0,id,wond,tran,tend,nost,peac,joya,ener,sadn,tens
0,XEbyzfafxVpCCLK2,1.594081,-0.667002,-0.150606,-1.513499,-0.816678,0.428617,1.198807,-0.446335,0.612913
1,wpldgYOXl8u6XbLE,1.553820,2.420615,1.212955,-0.130002,0.431165,0.099829,1.764385,-0.196834,-1.268899
2,nltzxmV9aWW7F7z8,0.461760,-0.140861,-0.240865,-0.581115,-0.609828,1.255161,2.990274,-0.503040,-1.268899
3,IIfvFDMGH4mskejp,-0.641098,-0.064888,-0.945242,-1.094666,-0.689187,1.339656,0.696600,-0.628664,-0.671147
4,kXNEvIQ4JLPHMu3H,-0.432962,-0.497407,0.074756,-0.770869,0.790146,0.250230,-0.262315,-0.432901,-0.504749
...,...,...,...,...,...,...,...,...,...,...
504,ZR4UokUqcWBT4ERU,0.586800,2.189592,-0.253315,3.456587,-0.663044,1.923693,2.139593,-0.903227,-0.105909
505,zsDz17XVFUFYh4y7,0.857097,0.131091,2.164516,1.233984,2.145191,-0.777977,-1.442610,-0.055070,-1.054949
506,ZSnDQ7mhvjJLx9IG,-1.160694,-1.318940,-0.307150,-0.417539,-0.161656,-0.601481,-0.900951,-0.395868,-0.305295
507,ZVjUg4rlqeq2zk7C,-0.127027,-0.082542,-0.214522,-0.492390,0.360165,-0.382865,-0.398755,-0.390467,-0.751960


In [4]:
# Keep only the interactions whose track_id appears in df_emma's `id` column.
# .copy() makes the subset an independent frame instead of a slice of df_interactions.
df_emma_interactions = df_interactions[df_interactions['track_id'].isin(df_emma.id)].copy()
print(df_emma_interactions.shape)
df_emma_interactions.head()

(3600728, 3)


Unnamed: 0,user_id,track_id,timestamp
1648,52740,rFJl0J6qKPImZWOQ,2013-03-03 19:41:52
1718,52740,rFJl0J6qKPImZWOQ,2013-02-03 15:55:45
2141,52740,rFJl0J6qKPImZWOQ,2013-02-01 21:45:34
2148,52740,rFJl0J6qKPImZWOQ,2013-01-31 21:59:41
2155,52740,rFJl0J6qKPImZWOQ,2013-01-31 19:15:30


In [5]:
# Number of distinct track_ids present in the subset.
df_emma_interactions['track_id'].nunique()

283

In [6]:
def filter_interactions_binary(listening_history: pd.DataFrame, min_interactions: int = 2, verbose: bool = True):
    """Keep only rows whose (user_id, track_id) pair occurs often enough.

    Args:
        listening_history: Frame with at least `user_id` and `track_id` columns.
        min_interactions: Minimum number of rows a (user_id, track_id) pair
            must have for its rows to be kept.
        verbose: Accepted for call compatibility; currently not used.

    Returns:
        A new frame holding every row of the qualifying pairs, with `user_id`
        and `track_id` as the leading columns and a fresh default index. Rows
        are ordered by the pair order of the count table, not by input order.
    """
    pair_keys = ['user_id', 'track_id']

    # How many rows each (user, track) pair has.
    pair_counts = listening_history.value_counts(subset=pair_keys)
    frequent_pairs = pair_counts[pair_counts >= min_interactions].index

    # Select the rows of the qualifying pairs through a pair-keyed index,
    # then turn the keys back into ordinary columns.
    by_pair = listening_history.set_index(pair_keys)
    return by_pair.loc[frequent_pairs].reset_index()

def filter_interactions_interval(listening_history: pd.DataFrame, min_interaction_interval=30):
    """Drop plays that follow the same user's previous play too closely.

    Rows are ordered by user and time, and a row is kept when at least
    `min_interaction_interval` seconds separate it from that user's preceding
    row. A user's first row has no preceding row and is always kept
    (previously its missing gap was treated as 0 seconds, which silently
    discarded every user's first play for any positive threshold).

    Args:
        listening_history: Frame with `user_id` and `timestamp` columns;
            `timestamp` may be text or datetime. The frame is not modified.
        min_interaction_interval: Minimum gap, in seconds, to the user's
            previous row.

    Returns:
        A new frame sorted by (`user_id`, `timestamp`) with `timestamp`
        converted to datetime, containing only the retained rows. The original
        index labels are preserved.
    """
    # Parse before sorting so the order is chronological regardless of how the
    # timestamps are formatted as text; assign() leaves the caller's frame intact.
    history = listening_history.assign(
        timestamp=pd.to_datetime(listening_history['timestamp'])
    )
    history = history.sort_values(by=['user_id', 'timestamp'])

    # Seconds since the same user's previous row (NaN where there is none).
    gap_seconds = history.groupby('user_id')['timestamp'].diff().dt.total_seconds()

    # After sorting, the first occurrence of a user_id is that user's first play.
    is_first_play = ~history['user_id'].duplicated()

    # NaN gaps compare False, so only genuine first plays bypass the threshold.
    return history[is_first_play | (gap_seconds >= min_interaction_interval)]

def delete_duplicates(listening_history: pd.DataFrame):
    """Reduce every (user_id, track_id) pair to its first row.

    Args:
        listening_history: Frame with `user_id` and `track_id` columns.

    Returns:
        A new frame containing, for each pair, only the first row in input
        order. Original index labels are kept; the input is not modified.
    """
    # True for every row whose pair has already appeared earlier in the frame.
    is_repeat = listening_history.duplicated(subset=['user_id', 'track_id'], keep='first')
    return listening_history[~is_repeat]

def k_core_filtering(df, user_col='user_id', item_col='track_id', k_user=5, k_item=5, max_iterations=50):
    """Iteratively drop rows of users and items with too few interactions.

    In each pass, row counts per user and per item are taken on the current
    frame, and only rows whose user has at least `k_user` rows AND whose item
    has at least `k_item` rows are retained. Removing rows can push other
    users/items below their threshold, so passes repeat until a pass removes
    nothing or `max_iterations` passes have run.

    Args:
        df: Interaction frame; it is not modified.
        user_col: Name of the user column.
        item_col: Name of the item column.
        k_user: Minimum number of rows a user must have.
        k_item: Minimum number of rows an item must have.
        max_iterations: Upper bound on the number of filtering passes.

    Returns:
        A new frame with the surviving rows (original index labels preserved).
    """
    filtered_df = df.copy()

    for _ in range(max_iterations):
        # Count interactions for each user and item on the current frame.
        user_counts = filtered_df[user_col].value_counts()
        item_counts = filtered_df[item_col].value_counts()

        users_to_keep = user_counts[user_counts >= k_user].index
        items_to_keep = item_counts[item_counts >= k_item].index

        # Build each membership mask once and reuse it.
        keep_rows = (
            filtered_df[user_col].isin(users_to_keep)
            & filtered_df[item_col].isin(items_to_keep)
        )

        # Converged: this pass would not remove any row.
        if keep_rows.all():
            break

        filtered_df = filtered_df[keep_rows]

    return filtered_df

In [7]:
#df_interactions_filtered = filter_interactions_binary(df_interactions, min_interactions=2)
##df_interactions_filtered = filter_interactions_interval(df_interactions, min_interaction_interval=30)
#df_interactions_filtered = delete_duplicates(df_interactions_filtered)
#df_interactions_filtered = k_core_filtering(df_interactions_filtered, user_col='user_id', item_col='track_id', k_user=10, k_item=5)

#print(df_interactions_filtered.shape)
#df_interactions_filtered['track_id'].nunique()
#df_interactions_filtered['user_id'].nunique()
#df_interactions_filtered.head()

In [None]:
# Active pipeline: keep (user, track) pairs that occur at least twice, then
# reduce each surviving pair to a single row. The interval and k-core steps
# are currently disabled.
#df_emma_interactions_filtered = filter_interactions_interval(df_emma_interactions, min_interaction_interval=30)
df_emma_interactions_filtered = filter_interactions_binary(df_emma_interactions, min_interactions=2)
df_emma_interactions_filtered = delete_duplicates(df_emma_interactions_filtered)
#df_emma_interactions_filtered = k_core_filtering(df_emma_interactions_filtered, k_user=5, k_item=5)

# Report the resulting dimensions, then preview the first rows.
print(df_emma_interactions_filtered.shape)
df_emma_interactions_filtered.head()

(238875, 3)


Unnamed: 0,user_id,track_id,timestamp
15977,68154,jmq9rFLdkgwkaqyV,2013-10-12 17:41:39
22649,14342,A6LUkXjb0VE7a4iB,2011-06-28 21:13:58
25458,32136,VAfPQbtXvfEQaFr0,2009-09-03 13:40:31
26930,8062,7kVURZXnP9Qtb7Ib,2007-09-16 21:16:43
29563,34447,HVsxu5jJtOctRg0x,2013-05-28 09:34:31


In [13]:
def get_periods_with_min_plays(df_filtered, df_original, min_plays=5, freq='M'):
    """Summarise, per time period, the tracks that reach a minimum play count.

    NOTE: both `df_filtered` and `df_original` are modified in place: a
    `period` column (their `timestamp` converted to a pandas period of
    frequency `freq`) is added to, or overwritten on, each of them.

    Args:
        df_filtered: Frame with `timestamp` and `track_id` columns; its rows
            are counted per (period, track).
        df_original: Frame with a `timestamp` column; its rows are counted per
            period.
        min_plays: Minimum number of `df_filtered` rows a track needs within a
            period to be counted for that period.
        freq: Period frequency passed to `dt.to_period`.

    Returns:
        A frame indexed by period, sorted by `tracks_count` descending, with:
        `min_plays` (smallest play count among the qualifying tracks),
        `tracks_count` (number of qualifying tracks), `original_plays` (rows of
        `df_original` in that period) and `fraction`
        (`tracks_count / original_plays`). Periods without any qualifying
        track do not appear.
    """
    # Tag every row of both frames with the period its timestamp falls in.
    for frame in (df_filtered, df_original):
        frame['period'] = pd.to_datetime(frame['timestamp']).dt.to_period(freq)

    # Plays of each track within each period, restricted to tracks that reach
    # the threshold.
    plays_per_track = (
        df_filtered
        .groupby(['period', 'track_id'])
        .size()
        .reset_index(name='plays')
    )
    qualifying = plays_per_track.loc[plays_per_track['plays'] >= min_plays]

    summary = qualifying.groupby('period').agg(
        min_plays=('plays', 'min'),
        tracks_count=('track_id', 'count'),
    )

    # Aligned on the period index, so only periods already in the summary are filled.
    summary['original_plays'] = df_original.groupby('period').size()
    summary['fraction'] = summary['tracks_count'] / summary['original_plays']

    return summary.sort_values('tracks_count', ascending=False)

# Get qualifying periods
freq = 'M'  # Change this to 'Y' for years, 'W' for weeks, 'D' for days, etc.
min_plays = 5
# NOTE: this call adds a 'period' column to both frames passed in
# (df_emma_interactions_filtered and the full df_interactions).
qualifying_periods = get_periods_with_min_plays(df_emma_interactions_filtered, df_interactions, min_plays=min_plays, freq=freq)
# Show the 30 periods with the most qualifying tracks.
qualifying_periods.head(30)

Unnamed: 0_level_0,min_plays,tracks_count,original_plays,fraction
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-05,5,113,5168591,2.2e-05
2013-10,5,107,3169804,3.4e-05
2013-11,5,103,3063248,3.4e-05
2013-07,5,103,2671264,3.9e-05
2014-02,5,102,2841251,3.6e-05
2012-04,5,101,3874059,2.6e-05
2012-12,5,101,2959242,3.4e-05
2013-09,5,100,2994049,3.3e-05
2012-10,5,100,3259786,3.1e-05
2014-01,5,99,3131099,3.2e-05


In [10]:
# 2019-05	5	191	1443149	0.000132