## Available feature resources for MPD / tracks in MPD

We focus on distribution (Variance / Entropy / Homogeniety)

    1) Playlist-level features (from MPD metadata)
        - num_albums
        - num_artists
        - num_tracks
        - num_followers
        - collaborative (or not)


    2) Track-level Audio Characteristics using Spotify API
        - valence
        - dancibility
        - acousticness
        - energy
        - speechiness
        - loudness
        - instrumentalness
        - tempo 
        - mode


    3) Track-level meatadata
        - Last.fm tags 
            - 50 tags presence
            - Full tags 
                - Word embeddings (GloVe)
                - Zero-shot embedding
        - Tagtraum genre annotation
        - Deezer metadata
            - mood, genre, location, etc.



## 1. Dataset preparation

#### From MPD, 

    1) Normalize text (title / description).

    2) Prepare subset that has textual descriptions.

    3) Get song metadata by matching with Last.fm.

    4) Get song metadata from Spotify API request.


In [9]:
from tqdm import tqdm
import html
import pickle
import re
import os
import spotipy
import spotipy.util as sp_util
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
import pandas as pd

In [2]:
from bs4 import BeautifulStoneSoup, BeautifulSoup

In [3]:
from util.util_data import *

## 1) Normalize title / description text

In [6]:
def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [7]:
mpd_data_path = '/media/irene/dataset/mpd.v1/data'

In [10]:
filenames = os.listdir(mpd_data_path)

In [11]:
len(filenames)

1000

In [12]:
filenames[:10]

['mpd.slice.103000-103999.json',
 'mpd.slice.213000-213999.json',
 'mpd.slice.401000-401999.json',
 'mpd.slice.598000-598999.json',
 'mpd.slice.423000-423999.json',
 'mpd.slice.799000-799999.json',
 'mpd.slice.808000-808999.json',
 'mpd.slice.728000-728999.json',
 'mpd.slice.210000-210999.json',
 'mpd.slice.797000-797999.json']

In [None]:
filenames = os.listdir(mpd_data_path)
pl_with_description = []
num_pl_with_both = 0
unique_title_clean_set = set()
unique_title_org_set = set()
unique_title_norm_set = set()
descriptions_list = [] 
unique_tracks_from_all_pl = set()

for filename in tqdm(sorted(filenames)):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        fullpath = os.sep.join((mpd_data_path, filename))
        f = open(fullpath, encoding="latin-1")
        js = f.read()
        f.close()
        mpd_slice = json.loads(js)
        for idx in range(len(mpd_slice['playlists'])):
            name = mpd_slice['playlists'][idx]['name']
            unique_title_org_set.add(name)

            n_name = normalize_name(name)
            unique_title_norm_set.add(n_name)

            b_name = BeautifulSoup(name).string
            unique_title_clean_set.add(b_name)
            
            if 'description' in list(mpd_slice['playlists'][idx].keys()):
                description = mpd_slice['playlists'][idx]['description']
                description = BeautifulSoup(description).string
                descriptions_list.append(description)
                pl_with_description.append([filename, idx])
                
            

In [None]:
with open('./unique_mpd_pl_title_clean_set.txt', 'w') as f:
    for _title in list(unique_title_clean_set):
        f.writelines(str(_title) + '\n') 


In [14]:
with open('./unique_mpd_pl_title_norm_set.txt', 'w') as f:
    for _title in list(unique_title_norm_set):
        f.writelines(_title + '\n') 


In [17]:
len(unique_title_clean_set), len(unique_title_org_set), len(unique_title_norm_set)

(91584, 92944, 17381)

In [20]:
len(descriptions_list)

18760

## 2) Prepare subset that has textual descriptions

In [8]:
list(mpd_data_all[0].keys())

['playlists', 'info']

In [9]:
mpd_data_all[0]['info']

{'generated_on': '2017-12-03 08:41:42.057563',
 'slice': '103000-103999',
 'version': 'v1'}

In [10]:
print(mpd_data_all[0]['playlists'][0].keys())

dict_keys(['num_edits', 'num_albums', 'num_artists', 'num_followers', 'collaborative', 'duration_ms', 'num_tracks', 'pid', 'modified_at', 'name', 'tracks'])


In [10]:
print(mpd_data_all[0]['playlists'][0]['tracks'][0].keys())

dict_keys(['duration_ms', 'track_uri', 'album_name', 'artist_name', 'track_name', 'artist_uri', 'pos', 'album_uri'])


In [29]:
len(mpd_data_all)

10

In [28]:
for i in range(len(mpd_data_all)):
    print(len(mpd_data_all[i]['playlists']))

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000


In [22]:
json_key_set = set()
for k in range(len(mpd_data_all)):
    for idx in range(len(mpd_data_all[k]['playlists'])):
        curr_list_info = mpd_data_all[k]['playlists'][idx]
        for _key in list(mpd_data_all[k]['playlists'][idx].keys()):
            json_key_set.add(_key)

print(json_key_set)

{'duration_ms', 'collaborative', 'modified_at', 'num_albums', 'name', 'num_edits', 'num_artists', 'description', 'num_tracks', 'pid', 'num_followers', 'tracks'}


In [25]:
pl_with_title = []
for k in range(len(mpd_data_all)):
    for idx in range(len(mpd_data_all[k]['playlists'])):
        curr_list_info = mpd_data_all[k]['playlists'][idx]
        if 'name' in list(mpd_data_all[k]['playlists'][idx].keys()):
            name = mpd_data_all[k]['playlists'][idx]['name']
            name = BeautifulSoup(name).string
            pl_with_title.append(mpd_data_all[k]['playlists'][idx])


In [26]:
len(pl_with_title)

10000

In [10]:
pl_with_description = []
for idx in range(len(mpd_data_all[0]['playlists'])):
    curr_list_info = mpd_data_all[0]['playlists'][idx]
    if 'description' in list(mpd_data_all[0]['playlists'][idx].keys()):
        description = mpd_data_all[0]['playlists'][idx]['description']
        description = BeautifulSoup(description).string
        print(description)
        pl_with_description.append(mpd_data_all[0]['playlists'][idx])


you are an ocean in which i am so willing to drown
a collection of some of my favorite k-r&b and k-hiphop songs
Give a little listen, see the world from my eyes...
Just for driving
Get lost in the vibes, but pay attention to the road yo, some freaky accidents been happening lately and you need to be safe and stuff. Wear seatbelts too, and don't text and drive.
elation.
Sunny day playlist. Jump in your convertible and jam
Because I have no life.
untraditional instumentals & beautiful vocals
Mulholland Drive
Grateful Dead and Jerry Garcia side projects
Sleep to me. Think to me. Behave to me.
never will delete this


In [13]:
filenames = os.listdir(mpd_data_path)

In [14]:
len(filenames)

1000

In [12]:
filenames = os.listdir(mpd_data_path)
pl_with_description = []
count = 0
for filename in tqdm(sorted(filenames)):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        fullpath = os.sep.join((mpd_data_path, filename))
        f = open(fullpath)
        js = f.read()
        f.close()
        mpd_slice = json.loads(js)
        for idx in range(len(mpd_slice['playlists'])):
            curr_list_info = mpd_slice['playlists'][idx]
            try:
                mpd_slice['playlists'][idx]['description'] = mpd_slice['playlists'][idx]['description'].replace('&#x27;', "'").replace('&#x2F;', '/')
                pl_with_description.append(mpd_slice['playlists'][idx])
                count += 1
            except:
                continue
                    
print(count)

100%|██████████| 1000/1000 [05:11<00:00,  3.21it/s]

18760





In [13]:
len(pl_with_description)

18760

In [14]:
pl_with_description[0]

{'collaborative': 'false',
 'description': 'chilllll out',
 'duration_ms': 27578241,
 'modified_at': 1495324800,
 'name': 'relax',
 'num_albums': 112,
 'num_artists': 97,
 'num_edits': 35,
 'num_followers': 1,
 'num_tracks': 124,
 'pid': 94,
 'tracks': [{'album_name': 'Chaos And The Calm',
   'album_uri': 'spotify:album:5BxvswQSGWrBbVCdx6mFGO',
   'artist_name': 'James Bay',
   'artist_uri': 'spotify:artist:4EzkuveR9pLvDVFNx6foYD',
   'duration_ms': 260533,
   'pos': 0,
   'track_name': 'Let It Go',
   'track_uri': 'spotify:track:13HVjjWUZFaWilh2QUJKsP'},
  {'album_name': 'All The Little Lights',
   'album_uri': 'spotify:album:0kjXSwSLVipUi4lSEeF0yl',
   'artist_name': 'Passenger',
   'artist_uri': 'spotify:artist:0gadJ2b9A4SKsB1RFkBb66',
   'duration_ms': 233973,
   'pos': 1,
   'track_name': 'All the Little Lights',
   'track_uri': 'spotify:track:4jpwHDjemKbmNy0pyWDHpr'},
  {'album_name': 'The Script',
   'album_uri': 'spotify:album:5gCo4z8XZ0T16nJmnw9wnG',
   'artist_name': 'The Scr

### -->   * 18760 playlists with textual description

In [15]:
with open('./mpd_pl_with_description.p', 'wb') as fp:
    pickle.dump(pl_with_description, fp)

In [206]:
all_pl_with_description = pickle.load(open('mpd_pl_with_description.p', 'rb'))

In [17]:
pl_with_description[100]['description']

"When you're looking for some smooth songs."

In [19]:
for i in range(50,60):
    print(BeautifulSoup(pl_with_description[i]['description']).string)

or for my adrenaline rush events
he's got a still small voice. you better be quiet.
#ObnoxiousName
some rock and alternative music that i really love at the age of 17 :)
I grew older. I saw it in a dream.
a compilation of music found throughout 2017
work out
Songs for the Revolution
Good playlist for parties, could be longer but it is not
Yea!


## 3) Get song metadata by matching with Last.fm.

In [27]:
mel_base_dir = '/media/iu/MSD/mel128/'
lastfm_full_track_to_tags_txt_dict = load_obj_curr_folder('data_lastfm/lastfm_full_track_to_tags_txt_dict.p')
msd_to_spotify_file_path = './msd_to_spotify.tsv'

In [22]:
len(lastfm_full_track_to_tags_txt_dict)

505216

In [23]:
list(lastfm_full_track_to_tags_txt_dict.keys())[:5]

['TRVQJRL128F930347F',
 'TRMZLME128F92F6024',
 'TRVNGYA128F425E207',
 'TRSBWUX128F14585AC',
 'TRSIFCN128F92E077D']

In [24]:
lastfm_full_track_to_tags_txt_dict['TRVQJRL128F930347F']

['french',
 'tomber',
 'francais',
 'toll',
 'misc',
 'givehimanotherchance',
 'beachtlich',
 'bedeutend',
 'bewundernswert',
 'brilliantly',
 'vorgemerkt',
 'beeindruckend',
 'justgreat',
 'pukka',
 'investigate',
 'bestof',
 'bluesy',
 'bluesrock',
 'awesome',
 'singersongwriter',
 'geralddepalmas',
 'chansonfrancaise',
 'wishlist',
 'friends']

In [25]:
msd_id_to_tag_vector = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/msd_id_to_tag_vector.cP', 'rb'))
MSD_id_to_7D_id = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/MSD_id_to_7D_id.pkl', 'rb'))
sevenD_id_to_path = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/7D_id_to_path.pkl', 'rb'))
filtered_list_train = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_train.cP', 'rb'))
filtered_list_test = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_test.cP', 'rb'))

In [27]:
lastfm_full_tracks = list(lastfm_full_track_to_tags_txt_dict.keys())

In [28]:
msd_id_to_file_path_dict = dict()
err_list = []

for idx in range(len(lastfm_full_tracks)):
    msd_id = lastfm_full_tracks[idx]
    try:
        msd_id_to_file_path_dict[msd_id] = sevenD_id_to_path[MSD_id_to_7D_id[msd_id]]
    except Exception:
        err_list.append(msd_id)
        pass

In [29]:
audio_filtered_track_file_path_list_sorted = load_obj_curr_folder("data_lastfm/audio_filtered_track_file_path_list_sorted_29sec.p") 

In [30]:
# For temporary use 
audio_filtered_track_file_path_list_sorted_dict = dict()
for _i in audio_filtered_track_file_path_list_sorted:
    audio_filtered_track_file_path_list_sorted_dict[_i] = ''

In [31]:
af_msd_id_to_file_path_dict = dict()
for k, v in tqdm(msd_id_to_file_path_dict.items()):
    if v in af_msd_id_to_file_path_dict:
        print(v)
    else:
        try:
            _ = audio_filtered_track_file_path_list_sorted_dict[v]
            af_msd_id_to_file_path_dict[v] = k
        except:
            continue


 32%|███▏      | 160542/504358 [00:00<00:00, 802639.96it/s]

3/2/321705.clip.mp3


100%|██████████| 504358/504358 [00:00<00:00, 777523.23it/s]

6/7/6715499.clip.mp3
1/0/104485.clip.mp3





In [32]:
len(af_msd_id_to_file_path_dict), len(msd_id_to_file_path_dict)

(406409, 504358)

In [34]:
pl_with_description[0]['tracks'][0]

{'album_name': 'Chaos And The Calm',
 'album_uri': 'spotify:album:5BxvswQSGWrBbVCdx6mFGO',
 'artist_name': 'James Bay',
 'artist_uri': 'spotify:artist:4EzkuveR9pLvDVFNx6foYD',
 'duration_ms': 260533,
 'pos': 0,
 'track_name': 'Let It Go',
 'track_uri': 'spotify:track:13HVjjWUZFaWilh2QUJKsP'}

In [196]:
msd_id_to_spotify_id_dict = dict()
duplicates_msd = 0
duplicates_msd_list = []
with open(msd_to_spotify_file_path) as file:
    curr_line = file.readline()
    info = curr_line.split('\t')
    print(info)
    curr_line = file.readline()
    while curr_line:
        info = curr_line.split('\t')
        try:
            _ = msd_id_to_spotify_id_dict[info[1]]
            duplicates_msd += 1
            duplicates_msd_list.append((info[1], info[3]))
            curr_line = file.readline()
        except:
            msd_id_to_spotify_id_dict[info[1]] = info[3]
            curr_line = file.readline()
            continue
        

['dzr_id', 'msd_id', 'msd_origin', 'spotify_id', 'spotify_origin\n']


In [197]:
spotify_id_to_msd_id_dict = dict()
duplicates_spo = 0
duplicates_spo_list = []

with open(msd_to_spotify_file_path) as file:
    curr_line = file.readline()
    info = curr_line.split('\t')
    print(info)
    curr_line = file.readline()
    while curr_line:
        info = curr_line.split('\t')
        try:
            _ = spotify_id_to_msd_id_dict[info[3]]
            duplicates_spo += 1
            duplicates_spo_list.append((info[3], info[1]))
            curr_line = file.readline()
            continue
        except:
            spotify_id_to_msd_id_dict[info[3]] = info[1]
            
            curr_line = file.readline()
            continue

['dzr_id', 'msd_id', 'msd_origin', 'spotify_id', 'spotify_origin\n']


In [198]:
duplicates_msd, duplicates_spo

(13526, 47382)

In [199]:
len(msd_id_to_spotify_id_dict), len(spotify_id_to_msd_id_dict)

(396745, 362889)

In [203]:
len(spotify_id_to_msd_id_dict.keys())

362889

In [201]:
save_obj_curr_folder(msd_id_to_spotify_id_dict, './data_lastfm/msd_id_to_spotify_id_dict.p')
save_obj_curr_folder(spotify_id_to_msd_id_dict, './data_lastfm/spotify_id_to_msd_id_dict.p')

In [202]:
msd_id_to_spotify_id_dict = load_obj_curr_folder('./data_lastfm/msd_id_to_spotify_id_dict.p')
spotify_id_to_msd_id_dict = load_obj_curr_folder('./data_lastfm/spotify_id_to_msd_id_dict.p')

In [111]:
duplicates_msd_list[50:60]

[('TRVMWPF12903CF09AF', '2HBeYwjvjb4pifQ9h1SRp1'),
 ('TRYSHDM128F425F3E4', '1PgkeXEWsfxnLWnWIpQV2j'),
 ('TRZYPMA128F425F194', '50x86u5QdSpm493LfaBbxm'),
 ('TREDWOT128F425F19C', '2SVurl0mF7cSZkvCxpO3kf'),
 ('TRJDPZR128F92E6DEF', '5YAejsQX8kFoLK8Mrm1EiG'),
 ('TRNGHYF128F93041CD', '5YAejsQX8kFoLK8Mrm1EiG'),
 ('TRJXIZL12903CC4D99', '3eY0KL2ovM1GFszbMWDsEl'),
 ('TRGOXRC128F42278AD', '5Rqs0ziSKyv738pMQgDCmY'),
 ('TRMTYMH128F4243C4D', '5gexaN536ozXKyEtzum45b'),
 ('TRUBUOM128F4243C64', '0Inm2j453nXbfoQqJVK8nC')]

In [36]:
len(msd_id_to_spotify_id_dict), len(spotify_id_to_msd_id_dict)

(396745, 362889)

In [56]:
client_credentials_manager = SpotifyClientCredentials(client_id='7b64a7f89d0247589b529a5ded859da4', client_secret='0699b27eabc74cc3ba74077edd221f2b')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [71]:
#  ('TRVTWQH128F92E65F8', '1X2dxHokN5tYxixBEsYXNh'),
#  ('TRVTWQH128F92E65F8', '2Kabx8h3lfKmOsGAnITwdf'),
#  ('TRVTWQH128F92E65F8', '4l7hHzGzb4X2QosbGA8Wu7'),
#  ('TRVTWQH128F92E65F8', '5EKkaq8plvnkZdF1HXXFis'),
#  ('TRVTWQH128F92E65F8', '6hHOhGDuBM9Vthe8DizLke'),

track_name = '4l7hHzGzb4X2QosbGA8Wu7'

In [72]:
info_for_a_track = sp.track(track_name)

In [73]:
info_for_a_track

{'album': {'album_type': 'compilation',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4F7Q5NV6h5TSwCainz8S5A'},
    'href': 'https://api.spotify.com/v1/artists/4F7Q5NV6h5TSwCainz8S5A',
    'id': '4F7Q5NV6h5TSwCainz8S5A',
    'name': 'Duke Ellington',
    'type': 'artist',
    'uri': 'spotify:artist:4F7Q5NV6h5TSwCainz8S5A'}],
  'available_markets': [],
  'external_urls': {'spotify': 'https://open.spotify.com/album/66XVEGxwEK4mbvsYgNvWXH'},
  'href': 'https://api.spotify.com/v1/albums/66XVEGxwEK4mbvsYgNvWXH',
  'id': '66XVEGxwEK4mbvsYgNvWXH',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/4db6cece42c0d5f24d5fc09314af21eb0c0d57ca',
    'width': 640},
   {'height': 300,
    'url': 'https://i.scdn.co/image/2108e69a3f63f80a5a6265d9a40f6aef60d2f497',
    'width': 300},
   {'height': 64,
    'url': 'https://i.scdn.co/image/1b4b45dee22e6b37c38b6997329617a29a742111',
    'width': 64}],
  'name': 'Duke Ellington and his Famous Orchestra',
  'releas

In [133]:
audio_feature_for_a_track = sp.audio_features(tracks=['2HBeYwjvjb4pifQ9h1SRp1', '0Inm2j453nXbfoQqJVK8nC', '75JFxkI2RXiU7L9VXzMkle'])

In [134]:
audio_analysis_for_a_track = sp.audio_analysis(track_name)

In [135]:
audio_feature_for_a_track

[{'acousticness': 0.454,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2HBeYwjvjb4pifQ9h1SRp1',
  'danceability': 0.595,
  'duration_ms': 241147,
  'energy': 0.771,
  'id': '2HBeYwjvjb4pifQ9h1SRp1',
  'instrumentalness': 1.24e-06,
  'key': 0,
  'liveness': 0.172,
  'loudness': -7.695,
  'mode': 0,
  'speechiness': 0.0421,
  'tempo': 94.963,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/2HBeYwjvjb4pifQ9h1SRp1',
  'type': 'audio_features',
  'uri': 'spotify:track:2HBeYwjvjb4pifQ9h1SRp1',
  'valence': 0.701},
 {'acousticness': 0.00227,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0Inm2j453nXbfoQqJVK8nC',
  'danceability': 0.511,
  'duration_ms': 288853,
  'energy': 0.835,
  'id': '0Inm2j453nXbfoQqJVK8nC',
  'instrumentalness': 0.522,
  'key': 8,
  'liveness': 0.132,
  'loudness': -6.617,
  'mode': 1,
  'speechiness': 0.0489,
  'tempo': 97.673,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/0Inm2j453nXbfo

In [74]:
msd_id_to_tag_vector = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/msd_id_to_tag_vector.cP', 'rb'))
MSD_id_to_7D_id = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/MSD_id_to_7D_id.pkl', 'rb'))
sevenD_id_to_path = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/7D_id_to_path.pkl', 'rb'))
filtered_list_train = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_train.cP', 'rb'))
filtered_list_test = pickle.load(open('/media/iu/MSD/MSD_mel_split/MSD_split/filtered_list_test.cP', 'rb'))

In [75]:
#  ('TRJDPZR128F92E6DEF', '5YAejsQX8kFoLK8Mrm1EiG'),
#  ('TRNGHYF128F93041CD', '5YAejsQX8kFoLK8Mrm1EiG'),

sevenD_id_to_path[MSD_id_to_7D_id['TRJDPZR128F92E6DEF']]

'3/7/3702324.clip.mp3'

In [76]:
sevenD_id_to_path[MSD_id_to_7D_id['TRNGHYF128F93041CD']]

'3/8/3857158.clip.mp3'

In [81]:
import IPython.display as ipd

In [85]:
msd_audio_base_dir = '/media/bach2/dataset/MSD/songs/'

In [90]:
ipd.Audio(os.path.join(msd_audio_base_dir, sevenD_id_to_path[MSD_id_to_7D_id['TRJDPZR128F92E6DEF']]))

In [91]:
ipd.Audio(os.path.join(msd_audio_base_dir, sevenD_id_to_path[MSD_id_to_7D_id['TRNGHYF128F93041CD']]))



## Dealing with duplicates 
1. merge into first appearing one
2. doesn't have to be considered since we're getting the audio features only.


In [100]:
len(pl_with_description)

18760

In [101]:
pl_with_description[0]

['mpd.slice.0-999.json', 94]

In [102]:
# create folders
filenames = os.listdir(mpd_data_path)

for filename in tqdm(sorted(filenames)):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        mkdir_path = os.sep.join(('/media/irene/dataset/mpd.v1/pl_info', filename.replace(".json", "")))
        os.mkdir(mkdir_path)

100%|██████████| 1000/1000 [00:00<00:00, 44875.66it/s]


## Last.fm tags

In [105]:
msd50_track_key_to_tag_key_binary_matrix = load_obj_curr_folder('data_lastfm/msd50_track_key_to_tag_key_binary_matrix.p')
msd50_track_key_to_tag_bin_dict = load_obj_curr_folder('data_lastfm/msd50_track_key_to_tag_bin_dict.p')
msd50_track_id_to_key_dict = load_obj_curr_folder('data_lastfm/msd50_track_id_to_key_dict.p')
msd50_track_key_to_id_dict = load_obj_curr_folder('data_lastfm/msd50_track_key_to_id_dict.p')

msd50_tag_ids_in_order = load_obj_curr_folder('data_lastfm/msd50_tag_ids_in_order.p')
msd50_tag_key_to_id_dict = load_obj_curr_folder('data_lastfm/msd50_tag_key_to_id_dict.p')
msd50_track_id_to_file_path_dict = load_obj_curr_folder('data_lastfm/msd50_track_id_to_file_path_dict.p')

In [148]:
lastfm_full_track_to_tags_txt_dict = load_obj_curr_folder('data_lastfm/lastfm_full_track_to_tags_txt_dict.p')

In [104]:
msd50_track_key_to_tag_key_binary_matrix.shape

(241889, 50)

In [149]:
len(lastfm_full_track_to_tags_txt_dict)

505216

In [150]:
list(lastfm_full_track_to_tags_txt_dict.keys())[:2]

['TRUKMOU12903CCE441', 'TRLVEHI128F92FA8FF']

In [None]:
err_matching_lastfm_tid_list = []

for filename in tqdm(sorted(filenames)):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        curr_pl_info_dir = os.sep.join(('/media/irene/dataset/mpd.v1/pl_info', filename.replace(".json", "")))
        curr_json_dir = os.sep.join((mpd_data_path, filename))
        
        f = open(curr_json_dir, encoding="latin-1")
        js = f.read()
        f.close()
        mpd_slice = json.loads(js)
        
        for idx in range(len(mpd_slice['playlists'])):
            _curr_playlist = dict()
            _curr_playlist['pid'] = mpd_slice['playlists'][idx]['pid']
            name = mpd_slice['playlists'][idx]['name']
            _curr_playlist['name'] = normalize_name(name)
            _curr_playlist['num_albums'] = mpd_slice['playlists'][idx]['num_albums']
            _curr_playlist['num_artists'] = mpd_slice['playlists'][idx]['num_artists']
            _curr_playlist['num_tracks'] = mpd_slice['playlists'][idx]['num_tracks']
            _curr_playlist['is_collaborative'] = mpd_slice['playlists'][idx]['collaborative']

            if 'description' in list(mpd_slice['playlists'][idx].keys()):
                description = mpd_slice['playlists'][idx]['description']
                _curr_playlist['description'] = BeautifulSoup(description).string
            else:
                _curr_playlist['description'] = ''
            
            curr_tracks = mpd_slice['playlists'][idx]['tracks']
            
            curr_track_info_list = []
            for _track in curr_tracks:
                _curr_track_info = dict()
                _curr_track_info['tid'] = _track['track_uri'].split(':')[-1]
                _curr_track_id = _curr_track_info['tid']
                _curr_track_info['info'] = sp.track(_curr_track_id)
                _curr_track_info['audio_feature'] = sp.audio_features(tracks=[_curr_track_id])
                
                try:
                    _curr_track_info['audio_path'] = sevenD_id_to_path[MSD_id_to_7D_id[spotify_id_to_msd_id_dict[_curr_track_id]]]
                    _curr_track_info['50_tag_bin'] = msd50_track_key_to_tag_bin_dict[msd50_track_id_to_key_dict[spotify_id_to_msd_id_dict[_curr_track_id]]]
                    _curr_track_info['50_tag_list'] = [msd50_tag_ids_in_order[_i] for _i in np.argwhere(_lastfm_50_tag_bin==1).squeeze()]
                    _curr_track_info['full_tag_list'] = lastfm_full_track_to_tags_txt_dict[spotify_id_to_msd_id_dict[_curr_track_id]]
                    
                except:
                    # print(_curr_track_id + ':' + _track['track_name'] + " can't be matched with Last.fm data.")
                    err_matching_lastfm_tid_list.append(_curr_track_id)
                    _curr_track_info['audio_path'] = ''
                    _curr_track_info['50_tag_bin'] = np.zeros((50,))
                    _curr_track_info['50_tag_list'] = []
                    _curr_track_info['full_tag_list'] = []
                
                curr_track_info_list.append(_curr_track_info)
            
            _curr_playlist['tracks_info'] = curr_track_info_list
            
            save_dict_to_json(_curr_playlist, os.path.join(curr_pl_info_dir, str(_curr_playlist['pid']) + '.p'))
            

In [159]:
len(spotify_id_to_msd_id_dict), len(MSD_id_to_7D_id)

(362889, 1000000)

In [146]:
_lastfm_50_tag_bin = msd50_track_key_to_tag_bin_dict[msd50_track_id_to_key_dict[spotify_id_to_msd_id_dict['75JFxkI2RXiU7L9VXzMkle']]]

_curr_tag_indices = np.argwhere(_lastfm_50_tag_bin==1).squeeze()
print([msd50_tag_ids_in_order[_i] for _i in _curr_tag_indices])

['rock', 'alternative']


In [151]:
_lastfm_full_tag_list = lastfm_full_track_to_tags_txt_dict[spotify_id_to_msd_id_dict['75JFxkI2RXiU7L9VXzMkle']]

print(_lastfm_full_tag_list)

['coldplay', 'rock', 'alternative', 'britpop', 'british', 'alternativerock', 'indie', 'piano', 'mellow', 'love', 'sad', 'beautiful', 'pop', 'melancholy', '00s', 'chillout', 'indierock', 'favorites', 'melancholic', 'thescientist', 'pianorock', 'relaxing', 'softrock', 'ballad', 'chill', 'emotional', 'malevocalists', 'favourites', 'awesome', 'uk', 'favorite', 'acoustic', 'romantic', 'slow', 'bittersweet', 'amazing', 'lovely', 'favoritesongs', 'scientist', 'relax', 'makesmecry', 'nostalgic', 'britpop', '2002', 'emo', 'calm', 'soft', 'easylistening', 'favouritesongs', 'sentimental', 'memories', 'poprock', 'britrock', 'perfect', 'classicrock', 'sadsongs', 'loveit', 'brilliant', 'wickerpark', 'favourite', 'lovesong', 'soundtrack', 'bestsongsever', 'arushofbloodtothehead', 'depressing', 'singersongwriter', 'genius', 'greatsong', 'lovesongs', 'quiet', 'moving', 'dreamy', 'great', 'sweet', 'epic', 'loveatfirstlisten', 'malevocalist', 'cool', 'greatlyrics', 'twilight', 'ambient', 'coldplay', 'cla

In [122]:
_lastfm_50_tag_bin

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## PL with descriptions

In [11]:
pl_with_description[1]

{'collaborative': 'false',
 'description': 'a collection of some of my favorite k-r&amp;b and k-hiphop songs',
 'duration_ms': 7965897,
 'modified_at': 1507334400,
 'name': 'rhythm & blues',
 'num_albums': 33,
 'num_artists': 22,
 'num_edits': 17,
 'num_followers': 1,
 'num_tracks': 36,
 'pid': 103051,
 'tracks': [{'album_name': 'Band of Dynamic Brothers',
   'album_uri': 'spotify:album:5svDNVLIXjAyPmNqF70yNX',
   'artist_name': 'Dynamic Duo',
   'artist_uri': 'spotify:artist:4nvFFLtv7ZqoTr83387uK4',
   'duration_ms': 224893,
   'pos': 0,
   'track_name': '죽일 놈 [Guilty]',
   'track_uri': 'spotify:track:6TuIgDsaBCFs4bYud1CwJU'},
  {'album_name': 'Luckynumbers',
   'album_uri': 'spotify:album:0rJq9y2Rk52JfSyrvhwJ4u',
   'artist_name': 'Dynamic Duo',
   'artist_uri': 'spotify:artist:4nvFFLtv7ZqoTr83387uK4',
   'duration_ms': 98773,
   'pos': 1,
   'track_name': '아침사랑 [Good Morning Love]',
   'track_uri': 'spotify:track:66Y7ccIYSU3cXjmTRkT0bC'},
  {'album_name': 'Luckynumbers',
   'album_u

In [99]:
audio_feature_for_a_track

[{'acousticness': 0.454,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2HBeYwjvjb4pifQ9h1SRp1',
  'danceability': 0.595,
  'duration_ms': 241147,
  'energy': 0.771,
  'id': '2HBeYwjvjb4pifQ9h1SRp1',
  'instrumentalness': 1.24e-06,
  'key': 0,
  'liveness': 0.172,
  'loudness': -7.695,
  'mode': 0,
  'speechiness': 0.0421,
  'tempo': 94.963,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/2HBeYwjvjb4pifQ9h1SRp1',
  'type': 'audio_features',
  'uri': 'spotify:track:2HBeYwjvjb4pifQ9h1SRp1',
  'valence': 0.701},
 {'acousticness': 0.00227,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0Inm2j453nXbfoQqJVK8nC',
  'danceability': 0.511,
  'duration_ms': 288853,
  'energy': 0.835,
  'id': '0Inm2j453nXbfoQqJVK8nC',
  'instrumentalness': 0.522,
  'key': 8,
  'liveness': 0.132,
  'loudness': -6.617,
  'mode': 1,
  'speechiness': 0.0489,
  'tempo': 97.673,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/0Inm2j453nXbfo

In [208]:
len(all_pl_with_description)

18760

In [209]:
all_pl_with_description[0]

{'collaborative': 'false',
 'description': 'chilllll out',
 'duration_ms': 27578241,
 'modified_at': 1495324800,
 'name': 'relax',
 'num_albums': 112,
 'num_artists': 97,
 'num_edits': 35,
 'num_followers': 1,
 'num_tracks': 124,
 'pid': 94,
 'tracks': [{'album_name': 'Chaos And The Calm',
   'album_uri': 'spotify:album:5BxvswQSGWrBbVCdx6mFGO',
   'artist_name': 'James Bay',
   'artist_uri': 'spotify:artist:4EzkuveR9pLvDVFNx6foYD',
   'duration_ms': 260533,
   'pos': 0,
   'track_name': 'Let It Go',
   'track_uri': 'spotify:track:13HVjjWUZFaWilh2QUJKsP'},
  {'album_name': 'All The Little Lights',
   'album_uri': 'spotify:album:0kjXSwSLVipUi4lSEeF0yl',
   'artist_name': 'Passenger',
   'artist_uri': 'spotify:artist:0gadJ2b9A4SKsB1RFkBb66',
   'duration_ms': 233973,
   'pos': 1,
   'track_name': 'All the Little Lights',
   'track_uri': 'spotify:track:4jpwHDjemKbmNy0pyWDHpr'},
  {'album_name': 'The Script',
   'album_uri': 'spotify:album:5gCo4z8XZ0T16nJmnw9wnG',
   'artist_name': 'The Scr

In [None]:
unique_track_ids_from_pl_with_desc = set()
for _pl in tqdm(all_pl_with_description):
    for _tr in _pl['tracks']:
        unique_track_ids_from_pl_with_desc.add(_tr['track_uri'].split(':')[-1])

In [213]:
len(unique_track_ids_from_pl_with_desc)

240690

In [212]:
unique_track_ids_from_pl_with_desc = list(unique_track_ids_from_pl_with_desc)

In [214]:
save_obj_curr_folder(unique_track_ids_from_pl_with_desc, './unique_track_ids_from_pl_with_desc.p')

## Loading saved Spotify features for a track

In [215]:
tmp = load_obj_curr_folder('/media/irene/dataset/mpd.v1/tr_info/4WR64ny1oaqUFh9X8SnCQz.p')

In [216]:
tmp.keys()

dict_keys(['audio_feature', 'info'])

In [218]:
tmp['audio_feature']

{'acousticness': 0.00696,
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4WR64ny1oaqUFh9X8SnCQz',
 'danceability': 0.512,
 'duration_ms': 168158,
 'energy': 0.814,
 'id': '4WR64ny1oaqUFh9X8SnCQz',
 'instrumentalness': 0.878,
 'key': 10,
 'liveness': 0.0837,
 'loudness': -5.913,
 'mode': 1,
 'speechiness': 0.0337,
 'tempo': 140.391,
 'time_signature': 4,
 'track_href': 'https://api.spotify.com/v1/tracks/4WR64ny1oaqUFh9X8SnCQz',
 'type': 'audio_features',
 'uri': 'spotify:track:4WR64ny1oaqUFh9X8SnCQz',
 'valence': 0.768}

Just checking..

In [221]:
tmp_info = sp.audio_features('4WR64ny1oaqUFh9X8SnCQz')

In [222]:
tmp_info

[{'acousticness': 0.00696,
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4WR64ny1oaqUFh9X8SnCQz',
  'danceability': 0.512,
  'duration_ms': 168158,
  'energy': 0.814,
  'id': '4WR64ny1oaqUFh9X8SnCQz',
  'instrumentalness': 0.878,
  'key': 10,
  'liveness': 0.0837,
  'loudness': -5.913,
  'mode': 1,
  'speechiness': 0.0337,
  'tempo': 140.391,
  'time_signature': 4,
  'track_href': 'https://api.spotify.com/v1/tracks/4WR64ny1oaqUFh9X8SnCQz',
  'type': 'audio_features',
  'uri': 'spotify:track:4WR64ny1oaqUFh9X8SnCQz',
  'valence': 0.768}]