# MSDSongID to Spotify Crosswalk

**Purpose**: to replace song IDs from the Million Song Dataset (MSD) with current Spotify URI IDs  

**Process**:  
- Start with MSD  
- Use publicly available AcousticBrainz crosswalk to find Spotify IDs  
- For those songs that were not found in AcousticBrainz  
  - Search Spotify API directly for Spotify URI IDs  
  - Where multiple IDs exist, prioritize the closest matches that have song URLs  
  

## Setup

In [None]:
import json
import os
import pprint
import re
import sys
import time

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Pretty-printer used for inspecting nested structures interactively.
pp = pprint.PrettyPrinter(indent=4)

# StringIO moved between modules across major Python versions; pick the
# import that matches the running interpreter.
if sys.version_info[0] >= 3:
    from io import StringIO  # Python 3.x
else:
    from StringIO import StringIO  # Python 2.x

# Local working directories: the extracted AcousticBrainz crosswalk files and
# the folder that receives the Spotify search results.
base_dir = 'ENTER HERE'
msd_dir, spotify_dir = (
    os.path.join(base_dir, folder)
    for folder in ('millionsongdataset_echonest', 'spotify')
)

# AWS S3 Boto Client
# AWS Credentials (placeholders - fill in before running)
aws_id, aws_secret = 'X', 'Y'
client = boto3.client(
    's3',
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret,
)

# Spotify - Connect using lucyd Spotify app credentials (placeholders)
my_client_id, my_client_secret = 'x', 'y'
spotify_credentials = SpotifyClientCredentials(
    client_id=my_client_id,
    client_secret=my_client_secret,
)
spotify = spotipy.Spotify(client_credentials_manager=spotify_credentials)

## AcousticBrainz Database of Crosswalk Files  
`curl ftp://ftp.acousticbrainz.org/pub/acousticbrainz/acousticbrainz/labs/download/msdrosetta/millionsongdataset_echonest.tar.bz2`

In [None]:
# FUNCTION to build FULL TRACK dictionary
def track_dict_builder(filename):
    """Parse one crosswalk JSON file into a flat song record.

    Only the first entry of ``response -> songs`` is read.

    Args:
        filename: Path of the JSON file to load.

    Returns:
        ``{}`` when the file lists no songs. Otherwise a dict with the keys
        ``id``, ``title``, ``artist_name`` and ``track_dict`` (track id ->
        foreign id, restricted to tracks whose catalog is ``spotify``), plus
        ``artist_id`` when the song carries a ``spotify`` artist foreign id.

    Raises:
        KeyError: If an expected key is missing from the file.
    """
    with open(filename) as f:
        data_in = json.load(f)

    songs = data_in['response']['songs']
    if not songs:
        return {}

    song = songs[0]
    data_dict_live = {
        'id': song['id'],
        'title': song['title'],
        'artist_name': song['artist_name'],
    }

    # Artist ID. This previously iterated over an undefined `sample_data`
    # name, which raised NameError for every file that contained a song.
    for artist_foreign in song['artist_foreign_ids']:
        if artist_foreign['catalog'] == 'spotify':
            data_dict_live['artist_id'] = artist_foreign['foreign_id']

    # Track Crosswalk Dict
    data_dict_live['track_dict'] = {
        track['id']: track['foreign_id']
        for track in song['tracks']
        if track['catalog'] == 'spotify'
    }

    return data_dict_live

In [None]:
# Crawl to build FULL crosswalk recursively
# result_list collects every non-empty record returned by track_dict_builder;
# error_list collects the paths of files that could not be parsed.
result_list = []
error_list = []

for subdir, dirs, files in os.walk(msd_dir):

    for file_loop in files:
        file_path = os.path.join(subdir, file_loop)
        try:
            result = track_dict_builder(filename=file_path)
        except Exception:
            # Best-effort crawl: remember the failing file and keep going.
            # `Exception` (rather than a bare except) lets Ctrl-C and
            # SystemExit still stop a long-running crawl.
            error_list.append(file_path)
            continue
        if result:
            result_list.append(result)

In [None]:
# Count Hits
# One row per crawled record: the song id and how many Spotify tracks the
# crosswalk listed for it.
acousticbrainz_songid_list = [
    {'song_id': record['id'], 'ab_spotify_hit_count': len(record['track_dict'])}
    for record in result_list
]
acousticbrainz_songid_df = pd.DataFrame(acousticbrainz_songid_list)

# Collapse repeated song ids into a single row (mean of their hit counts).
songid_groups = acousticbrainz_songid_df.groupby('song_id')
ab_songid_unique_df = songid_groups.agg('mean')

## Read in MSD Dataset

In [None]:
# NOTE(review): bucket_name was previously only assigned in the final upload
# cell, so this cell raised NameError on a top-to-bottom run. It is defined
# here with that same value - confirm this is also the bucket that holds the
# MSD summary file.
bucket_name = 'sagemaker-msdsubset'
data_key = 'flat_summary'
data_location = 's3://{}/{}'.format(bucket_name, data_key)

# Load the flat MSD summary and keep only the columns the crosswalk needs.
msd_flat = pd.read_csv(data_location)
msd_flat_slim = msd_flat[['song_id', 'track_id', 'title', 'artist', 'spotify_uri']]
# Drop Duplicates (keeps the first row seen for each song_id)
msd_prelim_clean = msd_flat_slim.drop_duplicates('song_id')

## Merge AcousticBrainz on to MSD

In [None]:
# Left-join the per-song AcousticBrainz hit counts onto the de-duplicated MSD
# rows; `validate` makes pandas raise if song_id is not unique on both sides.
msd_ab_hits = pd.merge(
    msd_prelim_clean,
    ab_songid_unique_df,
    how='left',
    on='song_id',
    validate='one_to_one',
)

### Identify songs missing IDs

In [None]:
# Rows that still have no Spotify URI and therefore need a Spotify search.
msd_missing_spotify = msd_ab_hits[msd_ab_hits['spotify_uri'].isnull()]

## Search Spotify

In [1]:
# Custom Function to clean up search results
def spotify_search_trim_results(full_results):
    """Reduce a track-search response to the fields this notebook keeps.

    Args:
        full_results: Search response dict; the hits are read from
            ``full_results['tracks']['items']``.

    Returns:
        A list with one dict per hit, holding ``album`` (uri, name),
        ``artists`` (list of uri/name dicts), ``uri``, ``name``,
        ``popularity``, ``preview_url`` and ``external_urls`` (spotify link).
    """
    def _trim(hit):
        # Key order matches the order in which the fields are written to disk.
        album = hit['album']
        return {
            'album': {'uri': album['uri'], 'name': album['name']},
            'artists': [
                {'uri': performer['uri'], 'name': performer['name']}
                for performer in hit['artists']
            ],
            'uri': hit['uri'],
            'name': hit['name'],
            'popularity': hit['popularity'],
            'preview_url': hit['preview_url'],
            'external_urls': {'spotify': hit['external_urls']['spotify']},
        }

    return [_trim(hit) for hit in full_results['tracks']['items']]
                          
# Custom Function to search and save trimmed results
def search_and_save(song_id, title_in, artist_in):
    """Search Spotify for one song and store the trimmed hits as JSON.

    The query sent is ``"track:<title_in> artist:<artist_in>"``. The trimmed
    result list is written to ``<spotify_dir>/<song_id>.json``.

    Args:
        song_id: Identifier used as the output file name (without extension).
        title_in: Song title to search for.
        artist_in: Artist name to search for.

    Returns:
        ``"success"`` when the search, trim and write all completed,
        ``"error"`` when any of them raised.
    """
    try:
        # Search
        track_search = spotify.search("track:" + title_in + " " + "artist:" + artist_in)
        # Trim
        trim_results = spotify_search_trim_results(full_results=track_search)
        # Save
        with open(os.path.join(spotify_dir, song_id + '.json'), 'w') as f:
            json.dump(trim_results, f)
    except Exception:
        # Deliberately best-effort: any failure (API error, non-string title,
        # unwritable path) is reported through the status value. Catching
        # `Exception` instead of using a bare except keeps Ctrl-C working.
        return "error"
    return "success"

In [None]:
# Loop through
# Make sure the output folder and the status file exist before the first
# iteration; previously a missing status.json crashed the loop on open().
os.makedirs(spotify_dir, exist_ok=True)
status_path = os.path.join(spotify_dir, 'status.json')
if not os.path.exists(status_path):
    with open(status_path, 'w') as file:
        json.dump({}, file)

counter = 1
for index, row in msd_missing_spotify.iterrows():
    # Search and Save + Store Status
    status = search_and_save(song_id=row['song_id'],
                             title_in=row['title'],
                             artist_in=row['artist'])

    # Save status in external JSON (rewritten every iteration so progress
    # survives an interrupted run)
    status_dict = {row['song_id']: status}
    with open(status_path, 'r+') as file:
        data = json.load(file)
        data.update(status_dict)
        file.seek(0)
        json.dump(data, file)
        # Drop any leftover bytes in case the new content is shorter than
        # what was previously in the file.
        file.truncate()

    # Pause each n loop to try to prevent Spotify from blocking me
    if counter % 100 == 0:
        time.sleep(1)
        print('\r%s' % (counter), end = "\r")
    counter += 1

## Pick ID where Multiple IDs exist

In [None]:
# Pick the best match
def song_id_picker(song_id_filename, song_id_lookup):
    """Choose one Spotify URI from a saved search-result file.

    The first hit that has a preview URL, whose title equals the looked-up
    title and that lists the looked-up artist among its artists wins. When no
    hit satisfies all three, the first hit with a preview URL is used.

    Args:
        song_id_filename: File name (``<song_id>.json``) inside ``spotify_dir``
            holding the trimmed search results for one song.
        song_id_lookup: DataFrame with ``song_id``, ``artist`` and ``title``
            columns, used to find the artist/title that were searched for.

    Returns:
        The chosen track URI, or ``'no_matches'`` when the file holds no hit
        with a preview URL.

    Raises:
        IndexError: If the song id is not present in ``song_id_lookup``.
    """
    with open(os.path.join(spotify_dir, song_id_filename)) as f:
        song_id_result = json.load(f)

    # Strip only a trailing ".json". The previous unescaped, unanchored
    # pattern '.json' also removed "<any char>json" from inside the song id.
    song_id_in = re.sub(r'\.json$', '', song_id_filename)
    lookup_rows = song_id_lookup[song_id_lookup.song_id == song_id_in]
    query_artist = lookup_rows.artist.tolist()[0]
    query_title = lookup_rows.title.tolist()[0]

    first_with_preview = None
    for song_i in song_id_result:
        # Hits without a preview URL are never picked.
        if song_i['preview_url'] is None:
            continue
        if first_with_preview is None:
            first_with_preview = song_i['uri']

        artist_matches = any(artist_i['name'] == query_artist
                             for artist_i in song_i['artists'])
        if artist_matches and song_i['name'] == query_title:
            # Results are scanned in order, so this is the earliest full match.
            return song_i['uri']

    # Default to first with preview
    if first_with_preview is not None:
        return first_with_preview
    return 'no_matches'

In [None]:
# Crawl to build FULL crosswalk recursively
# spotify_xwalk collects one {'song_id', 'spotify_uri_search'} record per
# search-result file; error_list collects the file names that failed.
spotify_xwalk = []
error_list = []

# Optional cap on the number of files handled per directory. The crawl used
# to stop after 10 files (a leftover debugging limit), which contradicted the
# "FULL crosswalk" goal; set an int here to sample again.
max_files = None

counter = 1
for subdir, dirs, files in os.walk(spotify_dir):

    for file_loop in files:
        # status.json is the search log written next to the per-song result
        # files, not a search result; skip it and anything that is not JSON.
        if file_loop == 'status.json' or not file_loop.endswith('.json'):
            continue
        if max_files is not None and counter > max_files:
            break

        try:
            picked_song = song_id_picker(song_id_filename=file_loop,
                                         song_id_lookup=msd_ab_hits)
        except Exception:
            # Best-effort crawl: record the failure and continue, while still
            # letting Ctrl-C interrupt the loop.
            error_list.append(file_loop)
        else:
            spotify_xwalk.append({'song_id': os.path.splitext(file_loop)[0],
                                  'spotify_uri_search': picked_song})

        if counter % 1000 == 0:
            print('\r%s' % (counter), end = "\r")
        counter += 1

## Merge Final Crosswalk

In [None]:
# Naming the columns explicitly keeps the merge key present even when the
# crawl produced no records; otherwise the empty frame has no 'song_id'
# column and the merge raises KeyError.
spotify_xwalk_df = pd.DataFrame(spotify_xwalk,
                                columns=['song_id', 'spotify_uri_search'])

# Left-join the search-derived URIs onto the MSD rows; `validate` makes
# pandas raise if song_id is duplicated on either side.
msd_final_xwalk = msd_ab_hits.merge(spotify_xwalk_df,
                                    how='left',
                                    on='song_id',
                                    validate='one_to_one')

In [None]:
def final_id(row):
    """Return the URI to use for one crosswalk row.

    The search-derived value (``spotify_uri_search``) takes priority over the
    original ``spotify_uri``; ``'no_match'`` is returned when both are null.

    Args:
        row: Mapping-like row exposing ``spotify_uri_search`` and
            ``spotify_uri``.

    Returns:
        The first non-null of the two values, else ``'no_match'``.
    """
    # Columns are listed in priority order; the second one is only read when
    # the first one is null.
    for column in ('spotify_uri_search', 'spotify_uri'):
        candidate = row[column]
        if not pd.isnull(candidate):
            return candidate
    return 'no_match'

In [None]:
# Resolve the final URI row by row (final_id is called once per row).
msd_final_xwalk['spotify_uri_final'] = msd_final_xwalk.apply(final_id, axis='columns')

## Upload to S3 for use in lucyd

In [None]:
# AWS S3 Boto Client
# AWS Credentials
bucket_name = 'sagemaker-msdsubset'

# NOTE(review): msd_spotify_xwalk_upload was never assigned anywhere in this
# notebook, so this cell raised NameError. The merged crosswalk is assumed to
# be the intended upload - confirm, and narrow the columns here if only a
# subset should be published.
msd_spotify_xwalk_upload = msd_final_xwalk

# Serialize to an in-memory CSV and push it to the bucket as a single object.
csv_buffer = StringIO()
msd_spotify_xwalk_upload.to_csv(csv_buffer)
s3_resource = boto3.resource('s3', aws_access_key_id=aws_id,
                             aws_secret_access_key=aws_secret)
s3_resource.Object(bucket_name, 'songid_spotifyuri_crosswalk.csv').put(Body=csv_buffer.getvalue())