In [1]:
import pandas as pd
import numpy as np
import os
import tqdm
import shutil

In [2]:
# --- Run configuration ---------------------------------------------------
# When True, the later cells work on a subset of the interactions and write
# to a separate sample folder instead of the full dataset folder.
sample = True

# Folder that holds the raw dataset files.
data_dir = 'data/m4a/'

# Feature files to export, grouped by feature type.
features = dict(
    audio_embeddings=['id_musicnn.tsv'],
    textual_embeddings=['id_bert.tsv'],
    # visual_embeddings=['id_resnet.tsv'],  # currently disabled
)

# NOTE(review): not referenced by the cells visible here -- confirm whether
# it is still needed.
features_files = ['id_musicnn.tsv']

In [6]:
# Load the user-track interactions. Sample mode reads only the first 100,000
# rows and writes its outputs to a separate folder; the full run reads the
# whole file and writes into the dataset folder.
if sample:
    output_dir = 'data/sample/'
    n_rows = 100000
else:
    output_dir = 'data/m4a/'
    n_rows = None  # None makes read_csv load the entire file

# to_csv does not create missing folders, so make sure the target exists
# before anything is written (a fresh checkout has no data/sample/).
os.makedirs(output_dir, exist_ok=True)

df_interactions = pd.read_csv(
    os.path.join(data_dir, 'userid_trackid_timestamp.tsv'),
    sep='\t',
    nrows=n_rows,
)

# Every interaction row gets the same constant rating.
df_interactions['rating'] = 5
# Distinct track ids present in the loaded interactions.
sample_items = df_interactions['track_id'].unique()

# Map users and items to integer codes.
df_interactions['user_id_int'] = df_interactions['user_id'].astype('category').cat.codes
df_interactions['track_id_int'] = df_interactions['track_id'].astype('category').cat.codes

# Headerless TSV with columns: user code, item code, rating, timestamp.
df_interactions[['user_id_int', 'track_id_int', 'rating', 'timestamp']].to_csv(
    os.path.join(output_dir, 'interactions.tsv'),
    index=False,
    sep='\t',
    header=False,
)

df_interactions.head()

Unnamed: 0,user_id,track_id,timestamp,rating,user_id_int,track_id_int
0,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:42:38,5,66,10157
1,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:38:53,5,66,10157
2,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:35:08,5,66,10157
3,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:31:23,5,66,10157
4,51549,iJTBIGHPjgJcT4Bt,2013-01-27 21:27:38,5,66,10157


In [7]:
# Lookup table indexed by the original track_id, giving its integer code
# (one row per distinct track_id / track_id_int pair).
item_id_map = (
    df_interactions[['track_id', 'track_id_int']]
    .drop_duplicates()
    .set_index('track_id')
)
item_id_map.head()

Unnamed: 0_level_0,track_id_int
track_id,Unnamed: 1_level_1
iJTBIGHPjgJcT4Bt,10157
LCItxaUrpHk6QYuy,4877
VXVSlV3nA5jgYOW1,7250
DQ9EMmQndbcKKbBo,3158
d6hSPGsvbBx2mcPR,8982


In [8]:
# Export every feature file as one .npy file per item, named after the item's
# integer code: <root>/<feature_type>/<feature_name>/<track_id_int>.npy
for feature_type, feature_files in features.items():
    for file in feature_files:
        print('Processing', file)
        # e.g. 'id_musicnn.tsv' -> 'musicnn'
        feature_name = file.split('.')[0].split('_')[1]

        df = pd.read_csv(os.path.join(data_dir, file), index_col=0, sep='\t')
        # Inner join on the track id index: keeps only items that occur in the
        # interactions and attaches their integer code as 'track_id_int'.
        df = df.merge(item_id_map, left_index=True, right_index=True)
        if sample:
            # Reorder to the interaction order; items without features become
            # all-NaN rows and are removed again by dropna.
            df = df.reindex(sample_items).dropna()
            folder_path = os.path.join('data/sample/', feature_type, feature_name)
        else:
            folder_path = os.path.join(data_dir, feature_type, feature_name)

        # Start from an empty folder so stale files of a previous run cannot survive.
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        os.makedirs(folder_path, exist_ok=True)

        # The integer code is only used for the file name. It must NOT be part
        # of the saved vector (previously it leaked in as an extra last feature).
        item_ids = df['track_id_int'].astype(int)
        embeddings = df.drop(columns='track_id_int').to_numpy()
        for item_id, embedding in tqdm.tqdm(zip(item_ids, embeddings), total=len(df)):
            np.save(os.path.join(folder_path, str(item_id) + '.npy'), embedding)

Processing id_musicnn.tsv


14270it [00:31, 457.17it/s]


Processing id_bert.tsv


14270it [00:29, 488.36it/s]
