In [46]:
import numpy as np
import pickle
import os
from tqdm import tqdm
import json
import discogs_client
import csv
import pandas as pd
from util.util_data import *

In [47]:
def save_obj_curr_folder(obj, name):
    """Pickle ``obj`` into the file ``name``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialised with ``pickle.HIGHEST_PROTOCOL``.
    """
    with open(name, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj_curr_folder(name):
    """Return the object unpickled from the file ``name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(name, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [48]:

## TXT format

def save_list_as_txt(obj, path):
    """Write every element of the iterable ``obj`` to ``path``, one per line.

    Each element is converted with ``str()`` and followed by a newline.
    ``str(item)`` is used instead of ``"%s\\n" % item`` because the ``%``
    operator treats a tuple on its right-hand side as the list of format
    arguments: a 2-tuple would raise ``TypeError`` and a 1-tuple would be
    written without its parentheses.

    Note that iterating a dict yields only its keys, so passing a dict writes
    the keys and drops the values.
    """
    with open(path, 'w') as f:
        f.writelines(str(item) + "\n" for item in obj)

def load_txt_to_list(path):
    """Return the lines of the text file ``path`` as a list of strings.

    Only the trailing newline character(s) are removed from each line; other
    whitespace is kept.
    """
    with open(path, 'r') as text_file:
        return [raw_line.rstrip('\n') for raw_line in text_file]



## CSV format

def save_list_as_csv(obj, path, _column_names):
    """Write ``obj`` to ``path`` as CSV with ``_column_names`` as the header.

    ``obj`` is handed to ``pd.DataFrame`` together with the column names; the
    DataFrame index is not written to the file.
    """
    pd.DataFrame(obj, columns=_column_names).to_csv(path, index=False)
    
def load_csv_to_list(path, _delim=','):
    """Read the delimited file ``path`` and return its rows as a list of lists.

    Every row, including a header row if the file has one, is returned as a
    list of strings.

    The file is opened with ``newline=''`` as the ``csv`` module requires;
    without it, newlines embedded in quoted fields are translated by the text
    layer before the reader sees them.
    """
    with open(path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=_delim)
        csv_list = list(csv_reader)
    return csv_list

def load_csv_to_df(path, out_format='tuple', _delim=','):
    """Read a delimited file with pandas and return its content in a plain container.

    Despite the name, no DataFrame is returned.

    Args:
        path: file to read with ``pd.read_csv``.
        out_format: ``'tuple'`` returns a list with one tuple per row;
            ``'dict'`` returns the values view of ``DataFrame.to_dict()``,
            i.e. one ``{row_index: value}`` dict per column.
        _delim: field delimiter passed to ``pd.read_csv``.

    Raises:
        ValueError: if ``out_format`` is neither ``'tuple'`` nor ``'dict'``
            (instead of silently returning ``None``).
    """
    if out_format not in ('tuple', 'dict'):
        raise ValueError(
            "out_format must be 'tuple' or 'dict', got %r" % (out_format,)
        )
    curr_df = pd.read_csv(path, delimiter=_delim)
    if out_format == 'tuple':
        return [tuple(x) for x in curr_df.values]
    return curr_df.to_dict().values()


    
## JSON format
    
def save_dict_as_json(obj, path):
    """Serialise ``obj`` as JSON into the text file ``path`` (overwriting it).

    ``obj`` is passed straight to ``json.dump``, so any JSON-serialisable
    value works, not only dicts.
    """
    with open(path, mode='w') as json_file:
        json.dump(obj, fp=json_file)

def load_json_to_dict(path, encoding="latin-1"):
    """Parse the JSON file ``path`` and return the decoded object.

    Args:
        path: file to read.
        encoding: text encoding used to decode the file. The default stays
            ``"latin-1"`` for backward compatibility. latin-1 never fails to
            decode, but non-ASCII characters stored as UTF-8 come back as
            mojibake; pass ``encoding="utf-8"`` for such files.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    # The context manager closes the handle even when reading or parsing fails.
    with open(path, encoding=encoding) as f:
        return json.load(f)

# (1) MSD with Last.fm metadata

In [3]:
# Machine-specific absolute path; not referenced by any other cell visible in this notebook.
mel_base_dir = '/media/iu/MSD/mel128/'
# NOTE(review): this load is commented out, yet a later cell calls len() on
# lastfm_full_track_to_tags_txt_dict. Unless `from util.util_data import *` provides that
# name (TODO confirm), that cell raises NameError on a fresh kernel.
# lastfm_full_track_to_tags_txt_dict = load_obj_curr_folder('data_lastfm/lastfm_full_track_to_tags_txt_dict.p')
# Tab-separated MSD <-> Spotify id mapping file parsed by the cells below.
msd_to_spotify_file_path = './data_lastfm/msd_to_spotify.tsv'

In [4]:
# Load the pickled MSD lookup tables and the train/test split lists.
# load_obj_curr_folder opens each file inside a `with` block, so the file handles are
# closed again (a bare `pickle.load(open(...))` leaves them open).
# NOTE: the paths are absolute and machine-specific; pickle files must come from a trusted source.
msd_id_to_tag_vector = load_obj_curr_folder('/media/iu/MSD/MSD_mel_split/MSD_split/msd_id_to_tag_vector.cP')
MSD_id_to_7D_id = load_obj_curr_folder('/media/iu/MSD/MSD_mel_split/MSD_split/MSD_id_to_7D_id.pkl')
sevenD_id_to_path = load_obj_curr_folder('/media/iu/MSD/MSD_mel_split/MSD_split/7D_id_to_path.pkl')
filtered_list_train = load_obj_curr_folder('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_train.cP')
filtered_list_test = load_obj_curr_folder('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_test.cP')

In [4]:
# Folder scanned for MPD slice files; the loops below only process names matching
# `mpd.slice.*.json`. NOTE: absolute, machine-specific path.
mpd_data_path = '/media/irene/dataset/mpd.v1/data'
# Every directory entry, in the arbitrary order os.listdir returns; the loops sort it before use.
mpd_filenames = os.listdir(mpd_data_path)

In [6]:
# Sizes of the two lookup tables.
# NOTE(review): lastfm_full_track_to_tags_txt_dict is not assigned in any cell visible here
# (its load is commented out in the paths cell) — confirm where it comes from.
len(lastfm_full_track_to_tags_txt_dict), len(MSD_id_to_7D_id)

(505216, 1000000)

In [7]:
# Build msd_id -> spotify_id from the TSV. The first mapping seen for an msd_id is kept;
# every later row with an already-known msd_id is counted and recorded as a duplicate.
msd_id_to_spotify_id_dict = dict()
duplicates_msd = 0
duplicates_msd_list = []
with open(msd_to_spotify_file_path) as file:
    # The first line is the column header; print it so the column positions used
    # below (1 = msd_id, 3 = spotify_id) can be checked against the file.
    info = file.readline().split('\t')
    print(info)
    for curr_line in file:
        info = curr_line.split('\t')
        # A row with fewer than four columns raises IndexError here rather than being hidden.
        msd_id, spotify_id = info[1], info[3]
        # Explicit membership test instead of a bare `except:` used as control flow,
        # which would also swallow unrelated errors and KeyboardInterrupt.
        if msd_id in msd_id_to_spotify_id_dict:
            duplicates_msd += 1
            duplicates_msd_list.append((msd_id, spotify_id))
        else:
            msd_id_to_spotify_id_dict[msd_id] = spotify_id
        

['dzr_id', 'msd_id', 'msd_origin', 'spotify_id', 'spotify_origin\n']


In [5]:
# Build spotify_id -> msd_id from the TSV. The first mapping seen for a spotify_id is kept;
# every later row with an already-known spotify_id is counted and recorded as a duplicate.
spotify_id_to_msd_id_dict = dict()
duplicates_spo = 0
duplicates_spo_list = []
# Starts at 1 to account for the header line; incremented once per data row below.
line_cnt = 1
with open(msd_to_spotify_file_path) as file:
    # The first line is the column header; print it so the column positions used
    # below (1 = msd_id, 3 = spotify_id) can be checked against the file.
    info = file.readline().split('\t')
    print(info)
    for curr_line in file:
        line_cnt += 1
        info = curr_line.split('\t')
        # A row with fewer than four columns raises IndexError here rather than being hidden.
        msd_id, spotify_id = info[1], info[3]
        # Explicit membership test instead of a bare `except:` used as control flow,
        # which would also swallow unrelated errors and KeyboardInterrupt.
        if spotify_id in spotify_id_to_msd_id_dict:
            duplicates_spo += 1
            duplicates_spo_list.append((spotify_id, msd_id))
        else:
            spotify_id_to_msd_id_dict[spotify_id] = msd_id

['dzr_id', 'msd_id', 'msd_origin', 'spotify_id', 'spotify_origin\n']


In [10]:
# Lines read from the TSV: the counter starts at 1 for the header and grows by one per data row.
line_cnt

410272

In [8]:
# Number of distinct msd_ids and distinct spotify_ids kept from the TSV.
len(msd_id_to_spotify_id_dict), len(spotify_id_to_msd_id_dict)

(396745, 362889)

In [None]:
# Collect the Spotify track ids occurring in the MPD slice files, both overall and
# restricted to playlists that have a 'description' key.
track_ids_list = []          # one entry per track occurrence (repeats kept)
track_ids_set = set()        # unique track ids
track_ids_desc_list = []     # occurrences inside playlists that have a description
track_ids_desc_set = set()   # unique track ids inside playlists that have a description

for filename in tqdm(sorted(mpd_filenames)):
    # Skip directory entries that are not MPD slice files.
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue

    # load_json_to_dict decodes the file as latin-1.
    mpd_slice = load_json_to_dict(os.sep.join((mpd_data_path, filename)))

    for playlist in mpd_slice['playlists']:
        # Evaluated once per playlist: it does not depend on the individual track.
        has_description = 'description' in playlist
        for _track in playlist['tracks']:
            # The id is the last ':'-separated component of the track URI.
            curr_tid = _track['track_uri'].split(':')[-1]
            track_ids_list.append(curr_tid)
            track_ids_set.add(curr_tid)

            if has_description:
                track_ids_desc_list.append(curr_tid)
                track_ids_desc_set.add(curr_tid)

            

In [13]:
# (track occurrences, unique tracks, occurrences in described playlists, unique tracks in described playlists)
len(track_ids_list), len(track_ids_set), len(track_ids_desc_list), len(track_ids_desc_set)

(66346428, 2262292, 1375515, 240690)

In [29]:
# track_ids_list = []
# track_ids_desc_list = []

In [14]:
# Turn the unique-id sets into lists; element order follows set iteration order.
mpd_unique_track_ids_list = list(track_ids_set)
mpd_desc_unique_track_ids_list = list(track_ids_desc_set)

In [15]:
# Should equal the sizes of the two sets the lists were built from.
len(mpd_unique_track_ids_list), len(mpd_desc_unique_track_ids_list)

(2262292, 240690)

In [21]:
# Persist both unique-id lists.
# NOTE: json.dump is used, so each file holds a single JSON array even though the
# file names end in .txt.
for _out_path, _id_list in (
    ('./mpd_handling/mpd_unique_track_ids_list.txt', mpd_unique_track_ids_list),
    ('./mpd_handling/mpd_desc_unique_track_ids_list.txt', mpd_desc_unique_track_ids_list),
):
    with open(_out_path, 'w') as f:
        json.dump(_id_list, f)


In [16]:
# Number of unique track ids that occur in playlists with a description.
len(mpd_desc_unique_track_ids_list)

240690

In [17]:
# Key lists of the two TSV mappings (iterating a dict yields its keys in insertion order).
msd_ids_from_tsv = list(msd_id_to_spotify_id_dict)
spotify_ids_from_tsv = list(spotify_id_to_msd_id_dict)

In [18]:
# Same counts as the sizes of the two mapping dicts.
len(msd_ids_from_tsv), len(spotify_ids_from_tsv)

(396745, 362889)

In [19]:
# Peek at one MSD id to see its format.
msd_ids_from_tsv[0]

'TRXGJMT12903CC4793'

In [20]:
# Peek at one MPD track id; it must have the same format as the TSV spotify ids for the lookups below to match.
mpd_desc_unique_track_ids_list[0]

'1wiG5lgoZ6h73NtDIkFcEn'

In [21]:
# Peek at one spotify id taken from the TSV.
spotify_ids_from_tsv[0]

'6XenAoRSGmBLX3IK5vT2WE'

In [22]:
# Number of distinct spotify ids available for matching.
len(spotify_id_to_msd_id_dict)

362889

In [23]:
# [spotify_id, msd_id] pairs for every described-playlist track id that has an MSD match.
# An explicit membership test replaces the bare `except:`, which would have hidden any
# error raised inside the loop, not only a missing key.
matching_spotify_ids_desc = [
    [_sid, spotify_id_to_msd_id_dict[_sid]]
    for _sid in mpd_desc_unique_track_ids_list
    if _sid in spotify_id_to_msd_id_dict
]

In [24]:
# Number of described-playlist track ids found in the spotify -> MSD mapping.
len(matching_spotify_ids_desc)

7963

In [25]:
# Number of unique track ids over all processed MPD slices.
len(mpd_unique_track_ids_list)

2262292

In [None]:
# [spotify_id, msd_id] pairs for every unique MPD track id that has an MSD match.
# An explicit membership test replaces the bare `except:`, which would have hidden any
# error raised inside the loop, not only a missing key.
matching_spotify_ids = [
    [_sid, spotify_id_to_msd_id_dict[_sid]]
    for _sid in mpd_unique_track_ids_list
    if _sid in spotify_id_to_msd_id_dict
]

In [None]:
# Number of unique MPD track ids found in the spotify -> MSD mapping.
len(matching_spotify_ids)

In [None]:
'''
Summary of the id counts obtained above:
- Spotify ids in the full MPD (1m): 2262292
- Spotify ids in the matching tsv file: 362889 (after removing duplicates from 410272)
- Spotify ids in MPD with description (18760): 240690
- Spotify ids appearing in BOTH the tsv (362889) and MPD with description (240690): 7963
- Spotify ids appearing in BOTH the tsv (362889) and the full MPD (2262292): 70391
'''

In [14]:
# For every MPD playlist, count how many of its tracks have an MSD match and keep the
# playlist in each bucket whose minimum matched fraction it reaches.
pl_with_tracks_on_msd_10 = []
pl_with_tracks_on_msd_30 = []
pl_with_tracks_on_msd_50 = []
pl_with_tracks_on_msd_70 = []
num_tracks_list = []  # one [matched, total] pair per playlist

# (minimum matched fraction, destination list). The buckets are cumulative: a playlist
# that reaches 0.7 is also stored in the 0.5, 0.3 and 0.1 buckets.
_msd_buckets = (
    (0.1, pl_with_tracks_on_msd_10),
    (0.3, pl_with_tracks_on_msd_30),
    (0.5, pl_with_tracks_on_msd_50),
    (0.7, pl_with_tracks_on_msd_70),
)

for filename in tqdm(sorted(mpd_filenames)):
    # Skip directory entries that are not MPD slice files.
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue

    # load_json_to_dict decodes the file as latin-1.
    mpd_slice = load_json_to_dict(os.sep.join((mpd_data_path, filename)))

    for playlist in mpd_slice['playlists']:
        curr_tracks = playlist['tracks']
        _filtered_tracks = []
        for _track in curr_tracks:
            # The id is the last ':'-separated component of the track URI.
            curr_tid = _track['track_uri'].split(':')[-1]
            # Membership test instead of a bare `except: pass`, which would hide any error.
            if curr_tid in spotify_id_to_msd_id_dict:
                _filtered_tracks.append(spotify_id_to_msd_id_dict[curr_tid])
        num_tracks_list.append([len(_filtered_tracks), len(curr_tracks)])
        # NOTE: a playlist without tracks satisfies every threshold (0 >= ratio * 0).
        for _min_ratio, _bucket in _msd_buckets:
            if len(_filtered_tracks) >= _min_ratio * len(curr_tracks):
                _bucket.append(playlist)

100%|██████████| 1000/1000 [04:58<00:00,  3.35it/s]


In [16]:
# NOTE(review): not referenced by any other cell visible in this notebook — presumably a leftover; confirm before removing.
pl_with_tracks_on_msd = []


In [17]:
# Playlists reaching at least 10% / 30% / 50% / 70% MSD-matched tracks.
len(pl_with_tracks_on_msd_10),len(pl_with_tracks_on_msd_30),len(pl_with_tracks_on_msd_50),len(pl_with_tracks_on_msd_70)

(143183, 3470, 264, 15)

In [18]:
# One entry per processed playlist.
len(num_tracks_list)

1000000

In [13]:
# Each entry is a [matched, total] list, so every line of the file is the text form of such a list.
save_list_as_txt(num_tracks_list, './num_tracks_list_msd.txt')

In [19]:
# Write each coverage bucket (a list of playlist dicts) to its own JSON file.
for _bucket, _bucket_path in (
    (pl_with_tracks_on_msd_10, './data_pl_dzrmeta/pl_with_tracks_on_msd_10.json'),
    (pl_with_tracks_on_msd_30, './data_pl_dzrmeta/pl_with_tracks_on_msd_30.json'),
    (pl_with_tracks_on_msd_50, './data_pl_dzrmeta/pl_with_tracks_on_msd_50.json'),
    (pl_with_tracks_on_msd_70, './data_pl_dzrmeta/pl_with_tracks_on_msd_70.json'),
):
    save_dict_as_json(_bucket, _bucket_path)

In [None]:
# Rebind the bucket names to empty lists so the large playlist lists can be garbage-collected.
# After this cell the buckets are empty; rerun the bucketing cell to repopulate them.
pl_with_tracks_on_msd_10 = []
pl_with_tracks_on_msd_30 = []
pl_with_tracks_on_msd_50 = []
pl_with_tracks_on_msd_70 = []

In [None]:
# Drop the per-playlist [matched, total] pairs as well.
num_tracks_list = []

In [21]:
# The number of keys equals the length of the dict; no intermediate list is needed.
len(spotify_id_to_msd_id_dict)

362889

# (2) DZR metadata

In [31]:
# Unique track ids from described playlists, shown again for reference in this section.
len(mpd_desc_unique_track_ids_list)

240690

In [49]:
# Rows of the tag CSV as a list of tuples (load_csv_to_df defaults to out_format='tuple';
# despite its name it does not return a DataFrame).
tmp_list = load_csv_to_df('./data_pl_dzrmeta/reco_tags_mpd_extend.csv')

In [50]:
# First row. The cells below read index 1 as the Spotify id, index 2 as the tag value
# and the last index as the feature name.
tmp_list[0]

(517168622, '7phWuVdFOthZj7xHAVPRwE', 'jazz', 'genre')

In [51]:
# Derive three containers from the CSV rows (row[1] = Spotify id, row[-1] = feature name):
#   - the set of Spotify ids,
#   - the set of feature names,
#   - an empty per-track dict for every Spotify id, filled by later cells.
spotify_id_set = {_row[1] for _row in tmp_list}
dzr_feature_name_set = {_row[-1] for _row in tmp_list}
# A repeated id keeps the position of its first occurrence, so key order matches row order.
spotify_id_to_metadata_dict = {_row[1]: dict() for _row in tmp_list}

In [52]:
# Distinct feature names found in the last column of the CSV rows.
dzr_feature_name_set

{'audience',
 'audio_content',
 'belief',
 'category',
 'decade',
 'formation',
 'genre',
 'influence',
 'instrument',
 'lang',
 'location',
 'location:city',
 'location:continent',
 'location:region',
 'mood',
 'mood:activity',
 'mood:celebration',
 'mood:day',
 'mood:day_moment',
 'mood:emotion',
 'mood:moment',
 'mood:season',
 'mood:situation',
 'mood:weather',
 'movie:genre',
 'musical_form',
 'period',
 'record_type',
 'role',
 'singer_type'}

In [53]:
# Both containers were built from the same ids, so the two sizes should match.
len(spotify_id_to_metadata_dict), len(spotify_id_set)

(96773, 96773)

In [54]:
# Give every track an empty list per feature name; each track gets its own list objects.
for _s_id in spotify_id_set:
    _track_meta = spotify_id_to_metadata_dict[_s_id]
    for _feature in dzr_feature_name_set:
        _track_meta[_feature] = []

In [55]:
# Spot-check one track: right after initialisation every feature maps to an empty list.
spotify_id_to_metadata_dict[list(spotify_id_set)[0]]

{'audience': [],
 'audio_content': [],
 'belief': [],
 'category': [],
 'decade': [],
 'formation': [],
 'genre': [],
 'influence': [],
 'instrument': [],
 'lang': [],
 'location': [],
 'location:city': [],
 'location:continent': [],
 'location:region': [],
 'mood': [],
 'mood:activity': [],
 'mood:celebration': [],
 'mood:day': [],
 'mood:day_moment': [],
 'mood:emotion': [],
 'mood:moment': [],
 'mood:season': [],
 'mood:situation': [],
 'mood:weather': [],
 'movie:genre': [],
 'musical_form': [],
 'period': [],
 'record_type': [],
 'role': [],
 'singer_type': []}

In [56]:
# Example row: (517168622, '7phWuVdFOthZj7xHAVPRwE', 'jazz', 'genre')
# Append each row's tag value to the list of its (track, feature) pair.
# NOTE: rerunning this cell without re-initialising the lists appends every value again.
for _row in tmp_list:
    _track_id, _tag_value, _feature_name = _row[1], _row[2], _row[-1]
    spotify_id_to_metadata_dict[_track_id][_feature_name].append(_tag_value)

In [57]:
# Spot-check one specific track id after the tag values have been appended.
spotify_id_to_metadata_dict['7p3fdx7dopTkUmZH7Ucedb']

{'audience': [],
 'audio_content': [],
 'belief': [],
 'category': ['instrumental'],
 'decade': ['00s'],
 'formation': [],
 'genre': ['jazz', 'soul', 'blues', 'rock'],
 'influence': [],
 'instrument': [],
 'lang': ['en'],
 'location': ['us'],
 'location:city': ['new_orleans'],
 'location:continent': ['north_america'],
 'location:region': [],
 'mood': [],
 'mood:activity': [],
 'mood:celebration': [],
 'mood:day': [],
 'mood:day_moment': [],
 'mood:emotion': [],
 'mood:moment': [],
 'mood:season': [],
 'mood:situation': [],
 'mood:weather': [],
 'movie:genre': [],
 'musical_form': [],
 'period': [],
 'record_type': [],
 'role': [],
 'singer_type': []}

In [14]:
# Persist the per-track metadata (Spotify id -> {feature name -> list of tag values}) as JSON.
save_dict_as_json(spotify_id_to_metadata_dict, './data_pl_dzrmeta/spotify_id_to_metadata_dict.json')

In [42]:
# Unique track ids in described playlists vs. in all playlists.
len(mpd_desc_unique_track_ids_list), len(mpd_unique_track_ids_list)

(240690, 2262292)

In [15]:
# Number of tracks that have metadata.
len(spotify_id_to_metadata_dict)

96773

In [16]:
# Feature names, read from one track's metadata dict (the initialisation cell gives every track the same keys).
dzr_metadata_key_list = list(spotify_id_to_metadata_dict['7p3fdx7dopTkUmZH7Ucedb'].keys())

In [17]:
# Show the feature names on a single line.
print(dzr_metadata_key_list)

['mood:celebration', 'location:region', 'belief', 'musical_form', 'decade', 'role', 'period', 'instrument', 'mood:situation', 'influence', 'genre', 'audience', 'record_type', 'formation', 'mood:weather', 'category', 'movie:genre', 'mood:day_moment', 'lang', 'mood:activity', 'location:continent', 'singer_type', 'audio_content', 'location:city', 'location', 'mood', 'mood:moment', 'mood:day', 'mood:season', 'mood:emotion']


In [18]:
# For every MPD playlist, count how many of its tracks have DZR metadata and keep the
# playlist in each bucket whose minimum covered fraction it reaches.
# Only the 50% and 70% buckets are collected; the 10% and 30% buckets are disabled.
pl_with_tracks_on_dzrmeta_50 = []
pl_with_tracks_on_dzrmeta_70 = []
num_tracks_list_dzrmeta = []  # one [covered, total] pair per playlist

# (minimum covered fraction, destination list). The buckets are cumulative: a playlist
# that reaches 0.7 is also stored in the 0.5 bucket.
_dzrmeta_buckets = (
    (0.5, pl_with_tracks_on_dzrmeta_50),
    (0.7, pl_with_tracks_on_dzrmeta_70),
)

for filename in tqdm(sorted(mpd_filenames)):
    # Skip directory entries that are not MPD slice files.
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue

    # load_json_to_dict decodes the file as latin-1.
    mpd_slice = load_json_to_dict(os.sep.join((mpd_data_path, filename)))

    for playlist in mpd_slice['playlists']:
        curr_tracks = playlist['tracks']
        _filtered_tracks = []
        for _track in curr_tracks:
            # The id is the last ':'-separated component of the track URI.
            curr_tid = _track['track_uri'].split(':')[-1]
            # Membership test instead of a bare `except: pass`, which would hide any error.
            if curr_tid in spotify_id_to_metadata_dict:
                _filtered_tracks.append(curr_tid)
        num_tracks_list_dzrmeta.append([len(_filtered_tracks), len(curr_tracks)])
        # NOTE: a playlist without tracks satisfies every threshold (0 >= ratio * 0).
        for _min_ratio, _bucket in _dzrmeta_buckets:
            if len(_filtered_tracks) >= _min_ratio * len(curr_tracks):
                _bucket.append(playlist)



100%|██████████| 1000/1000 [05:01<00:00,  3.32it/s]


In [19]:
# One entry per processed playlist.
len(num_tracks_list_dzrmeta)

1000000

In [22]:
# The 10% and 30% buckets are not collected, so only the 50% and 70% bucket sizes are shown.
# len(pl_with_tracks_on_dzrmeta_10),len(pl_with_tracks_on_dzrmeta_30),

len(pl_with_tracks_on_dzrmeta_50),len(pl_with_tracks_on_dzrmeta_70)

(137247, 9680)

In [21]:
# Per-playlist [covered, total] pairs, one per line of the text file.
save_list_as_txt(num_tracks_list_dzrmeta, './num_tracks_list_dzrmeta.txt')
# The two coverage buckets (lists of playlist dicts) as JSON.
save_dict_as_json(pl_with_tracks_on_dzrmeta_50, './data_pl_dzrmeta/pl_with_tracks_on_dzrmeta_50.json')
save_dict_as_json(pl_with_tracks_on_dzrmeta_70, './data_pl_dzrmeta/pl_with_tracks_on_dzrmeta_70.json')

In [33]:
# Track ids with DZR metadata, taken from the playlists in the 50% bucket.
# One entry is appended per track occurrence, so an id that appears in several
# playlists is listed several times.
spotify_ids_in_mpd_and_recom = []
for _pl in pl_with_tracks_on_dzrmeta_50:
    for _s_id in _pl['tracks']:
        # The id is the last ':'-separated component of the track URI.
        curr_s_id = _s_id['track_uri'].split(':')[-1]
        # Membership test instead of a bare `except: pass`, which would hide any error.
        if curr_s_id in spotify_id_to_metadata_dict:
            spotify_ids_in_mpd_and_recom.append(curr_s_id)

In [34]:
# Number of collected track occurrences (ids can repeat).
len(spotify_ids_in_mpd_and_recom)

3606971

In [None]:
# categorizing context of playlist 

In [58]:
# Per-feature accumulators, keyed by feature name:
#   sets_on_dzr_metadata               -> set of distinct tag values seen
#   total_num_features_on_dzr_metadata -> total number of tag values seen
#   num_tracks_with_each_dzr_metadata  -> number of entries with at least one tag value
sets_on_dzr_metadata = {feature: set() for feature in dzr_feature_name_set}
# dict.fromkeys is safe for the counters because the shared initial value 0 is immutable.
total_num_features_on_dzr_metadata = dict.fromkeys(dzr_feature_name_set, 0)
num_tracks_with_each_dzr_metadata = dict.fromkeys(dzr_feature_name_set, 0)

In [59]:
# Accumulate the per-feature statistics over every entry of spotify_ids_in_mpd_and_recom.
# An id that occurs several times in that list is counted once per occurrence.
for _sid in spotify_ids_in_mpd_and_recom:
    _track_meta = spotify_id_to_metadata_dict[_sid]
    for feature in dzr_feature_name_set:
        _tag_values = _track_meta[feature]
        if not _tag_values:
            continue  # this track has no value for the feature
        sets_on_dzr_metadata[feature].update(_tag_values)
        total_num_features_on_dzr_metadata[feature] += len(_tag_values)
        num_tracks_with_each_dzr_metadata[feature] += 1

In [45]:
# sets_on_dzr_metadata is a dict (feature name -> set of tag values). save_list_as_txt
# iterates its argument, and iterating a dict yields only the keys, so this text file
# receives the feature names, one per line, and none of the tag values.
save_list_as_txt(sets_on_dzr_metadata, './data_pl_dzrmeta/sets_on_dzr_metadata.txt')
# Persist the collected tag values as well. Sets are not JSON-serialisable, so each one is
# stored as a sorted list; key=str keeps the sort working if a value is not a string.
save_dict_as_json(
    {feature: sorted(values, key=str) for feature, values in sets_on_dzr_metadata.items()},
    './data_pl_dzrmeta/sets_on_dzr_metadata.json',
)
save_dict_as_json(total_num_features_on_dzr_metadata, './data_pl_dzrmeta/total_num_features_on_dzr_metadata_50.json')
save_dict_as_json(num_tracks_with_each_dzr_metadata, './data_pl_dzrmeta/num_tracks_with_each_dzr_metadata_50.json')

In [43]:
# One track id per line; ids repeat because the list holds one entry per track occurrence.
save_list_as_txt(spotify_ids_in_mpd_and_recom, './data_pl_dzrmeta/spotify_ids_in_mpd_and_recom.txt')

In [44]:
# Number of entries written to the text file.
len(spotify_ids_in_mpd_and_recom)

3606971

In [37]:
# Distinct tag values collected per feature name.
sets_on_dzr_metadata

{'audience': {'mainstream', 'new_releases'},
 'audio_content': {'fast_pulse', 'slow_pulse', 'strong_pulse'},
 'belief': {'christian', 'jewish', 'political', 'religious'},
 'category': {'a_capella',
  'acoustic',
  'animal',
  'anthem',
  'audiobook',
  'classic',
  'cover',
  'crossover',
  'disney',
  'electric',
  'experimental',
  'hits',
  'indie',
  'instrumental',
  'kids',
  'modern',
  'nouvelle_scene',
  'oldschool',
  'poetry',
  'prayer',
  'safe_for_kids',
  'sound_effects',
  'soundtrack_movies',
  'soundtrack_tv',
  'speech',
  'spoken',
  'traditionnal',
  'underground'},
 'decade': {'00s',
  '10s',
  '20s',
  '30s',
  '40s',
  '50s',
  '60s',
  '70s',
  '80s',
  '90s'},
 'formation': {'choral', 'quartet', 'trio'},
 'genre': {'abstract_hip_hop',
  'acid_house',
  'acid_jazz',
  'afrobeat',
  'aggrotech',
  'alternative',
  'alternative_americana',
  'alternative_country',
  'alternative_emo',
  'alternative_metal',
  'alternative_rock',
  'ambeat',
  'americana',
  'anim

In [38]:
# Total number of tag values per feature, summed over all collected track occurrences.
total_num_features_on_dzr_metadata

{'audience': 88113,
 'audio_content': 87235,
 'belief': 205150,
 'category': 2946463,
 'decade': 3667608,
 'formation': 171,
 'genre': 7824564,
 'influence': 15815,
 'instrument': 2562252,
 'lang': 3399149,
 'location': 782896,
 'location:city': 116899,
 'location:continent': 752677,
 'location:region': 425660,
 'mood': 209363,
 'mood:activity': 633372,
 'mood:celebration': 119768,
 'mood:day': 63840,
 'mood:day_moment': 71506,
 'mood:emotion': 1762977,
 'mood:moment': 29188,
 'mood:season': 31285,
 'mood:situation': 327847,
 'mood:weather': 2813,
 'movie:genre': 6700,
 'musical_form': 10442,
 'period': 19329,
 'record_type': 125405,
 'role': 39231,
 'singer_type': 224656}

In [39]:
# Number of collected track occurrences that have at least one value for each feature.
num_tracks_with_each_dzr_metadata

{'audience': 88113,
 'audio_content': 85757,
 'belief': 115972,
 'category': 1745150,
 'decade': 3598600,
 'formation': 171,
 'genre': 3272354,
 'influence': 15815,
 'instrument': 2053880,
 'lang': 3399127,
 'location': 781254,
 'location:city': 116899,
 'location:continent': 752589,
 'location:region': 423608,
 'mood': 189547,
 'mood:activity': 507979,
 'mood:celebration': 119215,
 'mood:day': 12768,
 'mood:day_moment': 71506,
 'mood:emotion': 1001879,
 'mood:moment': 29188,
 'mood:season': 31285,
 'mood:situation': 322101,
 'mood:weather': 2813,
 'movie:genre': 6698,
 'musical_form': 9024,
 'period': 19322,
 'record_type': 125267,
 'role': 39231,
 'singer_type': 212325}