In [1]:
import pandas as pd
import pickle

from pymongo import MongoClient

import spotipy
from config import api_access_token

### Connecting to MongoDB

In [2]:
client = MongoClient()
db = client.spotify
db.list_collection_names()

['track_features', 'tracks', 'artists', 'albums', 'track_analysis']

In [6]:
cursor = db.albums.find(
    {}, {'_id': 0, 'album_name': 1, 'artist': 1, 'album_uri': 1})
df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,album_name,album_uri,artist
0,#3,spotify:album:2h6MSR1rWemHOtmurgRq3T,Suburban Kids With Biblical Names
1,Melodrama,spotify:album:4oCGmYsAQOWt2ACWTpNUU6,Lorde
2,Melodrama,spotify:album:6tBpgZG9bV76nSuiz42RbA,Lorde
3,Melodrama,spotify:album:2B87zXm9bOWvAJdkJBTpzF,Lorde
4,Melodrama,spotify:album:0RprDFd8KN06rt0jyKXBsC,Lorde


We can see how many artist album combinations we have below.

In [8]:
df.shape[0]

4608

In [72]:
df['artist_album'] = df.album_name + df.artist

Note that, as you can see above, Spotify sometimes has multiple albums of the same name for an artist. This can happen for several reasons. Generally speaking, the same songs appear in each album. We will remove duplicate album/artist combinations below.

In [76]:
df.drop_duplicates('artist_album', inplace=True)
df.shape[0]

4121

## Keeping only the artists that were initially queried.

In [79]:
returned_artists = df.artist.unique().tolist()

In [82]:
pickle_in = open('Data/indie_pop_artists', 'rb')
queried_artists = pickle.load(pickle_in)
pickle_in.close()

Again, we check to make sure that we're only looking at the artists that were both queried and returned.

In [83]:
print(len(queried_artists))
print(len(returned_artists))
print(len(set(queried_artists).intersection(set(returned_artists))))

716
740
540


In [86]:
artists_df = (pd.DataFrame(list(set(queried_artists)
                                .intersection(set(returned_artists)))))
artists_df.columns = ['artist']
artists_df.head()

Unnamed: 0,artist
0,Santigold
1,Voxtrot
2,The Lumineers
3,The Virgins
4,The High Water Marks


In [87]:
df = df.merge(artists_df, on='artist')
df.head()

Unnamed: 0,album_name,album_uri,artist,artist_album
0,#3,spotify:album:2h6MSR1rWemHOtmurgRq3T,Suburban Kids With Biblical Names,#3Suburban Kids With Biblical Names
1,Melodrama,spotify:album:4oCGmYsAQOWt2ACWTpNUU6,Lorde,MelodramaLorde
2,Pure Heroine,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,Lorde,Pure HeroineLorde
3,Pure Heroine (Extended),spotify:album:6rnzvZhe3PA57xKcKLRtJ6,Lorde,Pure Heroine (Extended)Lorde
4,Simple Things Special Edition,spotify:album:4vQcWE7Ce6ORsatrqXkBtt,Zero 7,Simple Things Special EditionZero 7


We now have the list of album URIs for albums that are not duplicates and were both queried and returned.

In [89]:
df.album_uri.nunique()

2657

## Calling Search APIs and adding output to MongoDB 

In [63]:
def get_track_info(track, album_uri):
    """
    Takes in album info and returns a dictionary
    containing relevant information.
    """

    track_info = {}

    album_artist_info = track['artists'][0]
    album_artist_features = ['id', 'name', 'uri']

    track_info['album_uri'] = album_uri

    for feature in album_artist_features:
        try:
            track_info['artist_'+feature] = album_artist_info[feature]
        except:
            pass

    album_feature_list = ['duration_ms', 'explicit',
                          'id', 'name', 'track_number', 'uri']

    for feature in album_feature_list:
        try:
            track_info[feature] = track[feature]
        except:
            pass

    track_info['song_spotify_page'] = track['external_urls']['spotify']

    return track_info

In [64]:
def get_album_tracks(album_uri_s, access_token):
    """
    Takes in an access token and a list of artist URIs
    and writes info for each of those artists' albums to a MongoDB.
    """

    sp = spotipy.Spotify(auth=access_token)

    for album_uri in album_uri_s:

        results = sp.album_tracks(album_uri)
        tracks = results['items']

        while results['next']:
            results = sp.next(results)
            tracks.extend(results['items'])

        for track in tracks:
            db.tracks.insert_one(get_track_info(track, album_uri))

In [94]:
access_token = api_access_token

In [93]:
uri_s = df.album_uri.unique().tolist()
len(uri_s)

2657

In [95]:
get_album_tracks(uri_s, access_token)