In [20]:
import pandas as pd
import numpy as np
import os
import tqdm
import shutil

In [21]:
# Which interaction dataset to convert: 'onion' or 'emma'.
dataset = 'onion'

# Folder holding the raw interaction and feature .tsv files.
input_data_dir = '../data/'

# Feature files to export, grouped by embedding family. The family name is
# reused as the sub-folder name under `output_dir` by the export loop.
features = dict(
    emotion_embeddings=['id_emma.tsv', 'id_gems.tsv'],
    audio_embeddings=['id_maest.tsv', 'id_jukebox.tsv', 'id_musicnn.tsv'],
    textual_embeddings=['id_bert.tsv', 'id_lyrics_word2vec.tsv'],
    visual_embeddings=['id_resnet.tsv', 'id_vgg19.tsv', 'id_incp.tsv'],
)

# Everything produced by this notebook is written below this folder.
output_dir = f'../dataset/{dataset}/'
os.makedirs(output_dir, exist_ok=True)

In [22]:
# Load the user/track interaction log for the selected dataset.
# The path is built from `input_data_dir` so the configuration cell is the
# single place where the input location is defined.
interactions_path = os.path.join(input_data_dir, f'userid_trackid_timestamp_{dataset}.tsv')
df_interactions = pd.read_csv(interactions_path, sep='\t')

# Every interaction gets the same constant rating of 5, and users/tracks are
# mapped to integer codes via pandas categoricals.
df_interactions = df_interactions.assign(
    rating=5,
    user_id_int=lambda d: d['user_id'].astype('category').cat.codes,
    track_id_int=lambda d: d['track_id'].astype('category').cat.codes,
)

# Distinct track ids present in the interaction log.
sample_items = df_interactions['track_id'].unique()

print(df_interactions.shape)
df_interactions.head()

(661036, 8)


Unnamed: 0,user_id,track_id,timestamp,year,month,rating,user_id_int,track_id_int
0,105881,ccuPZ7Z9obk3Efg8,2014-03-05 11:44:07,2014,2014-03,5,32040,20397
1,34658,Kq5zEzdS2atiI6yA,2014-03-29 10:10:01,2014,2014-03,5,16051,11091
2,40093,CWJYGGxMa4ppYUvD,2014-03-13 03:04:50,2014,2014-03,5,18002,6718
3,40093,rYKZ2u8TGNLDuFFx,2014-03-17 19:53:20,2014,2014-03,5,18002,28419
4,4899,LtbprkKTPfQwCdwI,2014-03-19 23:24:40,2014,2014-03,5,2822,11655


In [23]:
# Track ids available in the reference feature file: the second entry of the
# emotion embeddings list ('id_gems.tsv' in the current configuration).
# os.path.join is used (as in the export loop) so the path is valid whether or
# not `input_data_dir` ends with a path separator.
reference_feature_file = features['emotion_embeddings'][1]
df = pd.read_csv(os.path.join(input_data_dir, reference_feature_file), sep='\t')
feat_track_ids = df['id'].values
feat_track_ids

array(['04q3VppIQEET5rzy', '07xF9Q0K1t3ist7K', '0JbYcELIqpMPvAHk', ...,
       'zzyyPUs7hC9Nz2e1', 'zzz0n04uuTUA7fNh', 'zzzj3LYaZtYtbzSr'],
      dtype=object)

In [24]:
# Keep only interactions whose track appears in the reference feature file.
# NOTE(review): user_id_int / track_id_int were assigned before this filter,
# so the surviving integer ids may contain gaps - confirm that downstream
# consumers do not require contiguous ids.
has_features = df_interactions['track_id'].isin(feat_track_ids)
df_interactions = df_interactions[has_features]
df_interactions.shape

(661024, 8)

In [25]:
# Write the interactions as a header-less TSV, using the integer user/track
# codes rather than the raw string ids.
interactions_out = os.path.join(output_dir, 'interactions.tsv')
export_columns = ['user_id_int', 'track_id_int', 'rating', 'timestamp']
df_interactions[export_columns].to_csv(interactions_out, sep='\t', header=False, index=False)

In [26]:
# Lookup table from the raw track id (index) to its integer code, one row per
# distinct (track_id, track_id_int) pair found in the kept interactions.
item_id_map = (
    df_interactions[['track_id', 'track_id_int']]
    .drop_duplicates()
    .set_index('track_id')
)
item_id_map.head()

Unnamed: 0_level_0,track_id_int
track_id,Unnamed: 1_level_1
ccuPZ7Z9obk3Efg8,20397
Kq5zEzdS2atiI6yA,11091
CWJYGGxMa4ppYUvD,6718
rYKZ2u8TGNLDuFFx,28419
LtbprkKTPfQwCdwI,11655


In [27]:
# The interaction frame is no longer needed once interactions.tsv and
# item_id_map exist; drop the name so the frame can be garbage-collected.
del df_interactions

In [28]:
# Export every configured feature file as one .npy vector per track, stored at
# <output_dir>/<feature_type>/<feature_name>/<track_id_int>.npy
for feature_type in features:

    for file in features[feature_type]:
        # Files whose name contains 'emma' are skipped for the onion dataset.
        if dataset == 'onion' and 'emma' in file:
            continue

        print('Processing', file)
        df = pd.read_csv(os.path.join(input_data_dir, file), index_col=0, sep='\t')
        # Default (inner) join on the track id index: only tracks present in
        # both the feature file and item_id_map are kept, and track_id_int is
        # appended as the last column.
        df = df.merge(item_id_map, left_index=True, right_index=True)
        # NOTE(review): only the token after the first '_' is used, so
        # 'id_lyrics_word2vec.tsv' is exported under 'lyrics' - confirm that
        # this folder name is the intended one.
        feature_name = file.split('.')[0].split('_')[1]

        folder_path = os.path.join(output_dir, feature_type, feature_name)

        # Start from an empty folder so vectors from a previous run do not linger.
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        os.makedirs(folder_path, exist_ok=True)

        # Convert the frame to one array up front instead of using
        # df.iterrows(), which builds a pandas Series for every row. The rows
        # of df.to_numpy() are the same arrays iterrows would expose through
        # row.values, so the saved vectors are unchanged.
        item_ids = df['track_id_int'].to_numpy()
        vectors = df.to_numpy()[:, :-1]  # drop the trailing track_id_int column

        for item_id, vector in tqdm.tqdm(zip(item_ids, vectors), total=len(df)):
            np.save(os.path.join(folder_path, str(int(item_id)) + '.npy'), vector)

Processing id_gems.tsv


33057it [03:01, 182.54it/s]


Processing id_maest.tsv


33057it [03:02, 181.08it/s]


Processing id_jukebox.tsv


33056it [05:08, 107.25it/s]


Processing id_musicnn.tsv


33056it [03:05, 178.29it/s]


Processing id_bert.tsv


33057it [03:07, 176.54it/s]


Processing id_lyrics_word2vec.tsv


33057it [03:05, 178.28it/s]


Processing id_resnet.tsv


29887it [04:07, 120.99it/s]


Processing id_vgg19.tsv


29887it [05:53, 84.58it/s]


Processing id_incp.tsv


29887it [04:12, 118.26it/s]
