In [None]:
%load_ext autoreload
%autoreload 2
from datamodel.Track import Track
from datamodel.User import User
from mongodb import MongoAccess as ma
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pprint import pprint

# Presumably a one-time import of the Kaggle CSVs into MongoDB (going by the
# function name); kept disabled here.
# ma.load_kaggle_csvs_into_mongodb()

# Pull every collection into its own DataFrame. The load order is unchanged:
# genres, years, tracks, artists, albums.
df_genres, df_years, df_tracks, df_artists, df_albums = (
    pd.DataFrame(ma.get_collection(collection))
    for collection in (
        ma.coll_genres,
        ma.coll_years,
        ma.coll_tracks,
        ma.coll_artists,
        ma.coll_albums,
    )
)

# A fixed track id kept around as a handy example value.
a_track_id = '3jW3k9jZRHNZfpDxpbAsbn'

def retrieve_track_attr_dict(id=None):
    """Return the attributes of one track from ``df_tracks`` as a plain dict.

    Parameters
    ----------
    id : optional
        Value to look up in the ``_id`` column of ``df_tracks``. When it is
        falsy (``None``, empty string, ...) a random row is sampled instead.

    Returns
    -------
    dict
        Mapping of column name to value for the selected row.

    Raises
    ------
    IndexError
        If ``id`` is given but no row of ``df_tracks`` has that ``_id``.
    """
    # The parameter name shadows the builtin ``id``; it is kept so that
    # existing keyword callers (``id=...``) keep working.
    if not id:
        row = df_tracks.sample(1).iloc[0]
    else:
        matches = df_tracks.loc[df_tracks['_id'] == id]
        if matches.empty:
            # Same exception type ``.iloc[0]`` raised before, but with a
            # message that names the id that could not be found.
            raise IndexError(f"no track with _id {id!r} in df_tracks")
        # Duplicates were removed from the data, so there is only one match.
        row = matches.iloc[0]
    return row.to_dict()

Example usage of MongoAccess (ma)

In [None]:
# The lookup helpers can be given a list of names/ids ...
pprint(ma.get_tracks_by_ids(['0hbkKFIJm7Z05H8Zl9w30f']))
# ... or a single name/id passed directly.
pprint(ma.get_tracks_by_names("Don't You Advertise Your Man"))
# Lookup by genre; `limit=3` is passed to keep the printed output short
# (presumably it caps the number of returned tracks - confirm in MongoAccess.py).
pprint(ma.get_tracks_by_genres('rock', limit=3))
# Lookup by album label, again with `limit=3`.
pprint(ma.get_tracks_by_labels(['Columbia/Legacy'], limit=3))

In [None]:
# Numeric track fields that can be filtered on:
#   acousticness, danceability, duration_ms, energy, explicit, instrumentalness,
#   key, liveness, loudness, mode, popularity, speechiness, tempo, valence, year

# All three conditions must hold: 70 < tempo < 120, popularity >= 10, year >= 1970.
a_filter = [
    {'$match': {'$and': [
        {'tempo': {'$gt': 70, '$lt': 120}},
        {'popularity': {'$gte': 10}},
        {'year': {'$gte': 1970}},
    ]}},
]

# Either condition is enough: danceability > 70, or key == 5.
# NOTE(review): Spotify documents danceability on a 0.0-1.0 scale. If the
# collection stores it unscaled, `$gt: 70` can never match and only the `key`
# clause contributes - confirm the intended threshold (0.7?).
another_filter = [
    {'$match': {'$or': [
        {'danceability': {'$gt': 70}},
        {'key': {'$eq': 5}},
    ]}},
]

# Custom filters/pipelines may use other stages besides $match; check MongoAccess.py.
pprint(ma.get_tracks_by_filter(a_filter, limit=3))

# A filter can also be chained onto a genre lookup.
pprint(ma.get_tracks_by_genres(['soul', 'permanent wave'], another_filter, limit=3))

User recommendation/similarity testing (wip)

In [None]:
# Fresh user: print get_optimal_track_pref() before any track has been logged.
a_user = User()
print(a_user.get_optimal_track_pref())

# Log two randomly sampled tracks (retrieve_track_attr_dict() without an id
# samples a random row) and print get_optimal_track_pref() after each one.
for _round in range(2):
    a_user.update_track_attr_log(Track(retrieve_track_attr_dict()))
    print(a_user.get_optimal_track_pref())

df_tracks.head()

Below this point there is some code that might be useful in the future but is not (and should not be) actively used right now.

In [None]:
import dotenv
# This part relies on a .env file containing the entries below. Not all of them
# may be required; they were carried over from another Spotify project.
# SPOTIPY_CLIENT_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# SPOTIPY_CLIENT_SECRET=yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
# SPOTIPY_REDIRECT_URI=http://sdfsdfsdfsd:8080
# https://developer.spotify.com/documentation/general/guides/app-settings/
dotenv.load_dotenv()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

# NOTE(review): SpotifyOAuth() is constructed without arguments, so it presumably
# picks up the SPOTIPY_* values loaded from .env above - confirm against the spotipy docs.
# NOTE(review): SpotifyClientCredentials is imported but not used in the cells visible here.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth())


In [None]:
# Fetch one album through the Spotify client and print its 'genres' entry.
pprint(sp.album('13CyNzgjCGZWFFxnhNFlYu')['genres'])

Apparently artists don't have an id. Names **seem** to be unique and to map one-to-one between the tracks and artists collections (i.e. no two artists use the same name), except that the tracks collection had 12 tracks with artist 'n/a'. I just deleted those, and we assume that names can be used as ids, as there are no duplicate names in the artists collection. This does mean that the ids I added to the albums collection are kind of moot.

In [None]:
# this can be adapted for cases where a field was stored as a literal string (e.g. "['a', 'b']") instead of, for example, a JSON list; note that it needs `import ast`

# pipeline = [
#     { '$project': {'genres':1, '_id': 1}},
# ]
# for doc in list(ma.coll_artists.aggregate(list(pipeline))):
#     ma.coll_artists.update_one({'_id': doc['_id']}, {'$set': {'genres': ast.literal_eval(doc['genres'])}})

In [None]:
# Work in progress: meant to make it possible to get all tracks of a certain genre.
# Spotify doesn't actually have genres on track/album level: https://github.com/spotify/web-api/issues/157
#
# What the active (uncommented) stages below do: for every document in the tracks
# collection, join the matching album and collect the album label(s) per track id.
# The commented-out stages are earlier genre experiments kept for reference.

# Only referenced by the commented-out `$match` stages at the end of the pipeline.
genres = ['blues']
pipeline = [
    # { '$limit' : 100},
    # Keep only the fields the later stages need.
    { '$project': {'album_id': 1, 'id': 1}},

    # useless as long as there are no genres in album collection

    # { '$lookup':
    #     {
    #         'from': 'albums',
    #         'let': { 'album_id': "$album_id"},
    #         'pipeline': [
    #             {'$match' : {'$id': '$$album_id'}},
    #             {'$project': {'genres': 1}}
    #         ],
    #         # 'localField': 'album_id',
    #         # 'foreignField': 'id',
    #         'as': 'albums_docs'
    #     }
    # },
    # { '$unwind': '$albums_docs'},
    # { '$set': { 'genres_album': '$albums_docs.genres'}},
    # { '$unset': 'albums_docs'},

    ## { '$unwind': '$artists'}, # https://docs.mongodb.com/manual/reference/operator/aggregation/lookup/
    # { '$lookup':
    #     {
    #         'from': 'artists',
    #         'localField': 'artists',
    #         'foreignField': 'artists',
    #         'as': 'artists_docs'
    #     }
    # },
    # { '$unwind': '$artists_docs'},
    # { '$set': { 'genres_artists': '$artists_docs.genres'}},
    # { '$unset': 'artists_docs'},
    # { '$unwind': '$genres_artists'},
    # { '$group': {
    #     '_id' : '$id',
    #     'genres': { '$addToSet':  "$genres_artists" },
    # }},
        # Join albums whose `id` equals the track's `album_id`; matches land in `albums_docs`.
        { '$lookup':
            {
                'from': 'albums',
                'localField': 'album_id',
                'foreignField': 'id',
                'as': 'albums_docs'
            }
        },
        # One output document per joined album.
        { '$unwind': '$albums_docs'},
        # Copy the album's label onto the document, then drop the joined album document.
        { '$set': { 'album_label': '$albums_docs.label'}},
        { '$unset': 'albums_docs'},
        # Regroup by the track's `id` field, collecting the distinct labels in `label`.
        { '$group': {
            '_id' : '$id',
            'label': { '$addToSet':  "$album_label" },
        }},
    # { '$match': {'genres.genres_artists': {'$in': genres}} } 
    # { '$match': { '$or': [{'genres.genres_artists': {'$in': genres}}, {'genres.genres_album': {'$in': genres}}]} } 
]
# pipeline = [
#     { '$limit' : 10000},
#     # { '$project': {'album_id': 1, 'artists': 1, 'id': 1}},
#     { '$lookup':
#         {
#             'from': 'tracks',
#             # 'let': { 'album_id': "$album_id"},
#             # 'pipeline': [
#             #     {'$match' : {'$id': '$$album_id'}},
#             # ],
#             'localField': 'id',
#             'foreignField': 'album_id',
#             'as': 'albums_docs'
#         }
#     },
# ]
# %timeit (list(ma.coll_albums.aggregate(pipeline)))
# %timeit (list(ma.coll_tracks.aggregate(pipeline)))
# Run the pipeline against the tracks collection and materialise all results in memory.
res = list(ma.coll_tracks.aggregate(pipeline))

In [None]:
# ma.coll_tracks.update_one({'id': '3Q2tKt6gKdn9LUMcHFxNJy'}, {'$set': {'genres': ['lounge', 'adult standards', 'easy listening']}})
# pprint(list(ma.coll_tracks.aggregate([{'$match': {'id': '3Q2tKt6gKdn9LUMcHFxNJy'}}])))
# for doc in res:
    # ma.coll_tracks.update_one({'id': doc['_id']}, {'$set': {'album_label': doc['label']}})

In [None]:
# Sum the lengths of the per-document `genres` lists (no de-duplication),
# first for the artists collection and then for the albums collection.
pipeline = [{'$project': {'genres': 1}}]

res = list(ma.coll_artists.aggregate(pipeline))
cntr = sum(len(doc['genres']) for doc in res)
print(f'cntr: {cntr} genres found in artists collection')

res = list(ma.coll_albums.aggregate(pipeline))
cntr = sum(len(doc['genres']) for doc in res)
print(f'cntr: {cntr} genres found in albums collection')

In [None]:
# creates the album collection

# def get_album_data(album_id):
#     album = sp.album(album_id)
#     results = album['tracks'] 
#     while results['next']: # if the docs are right, this only happens if an album contains more than 100 songs, which never occurs according to my googling
#         results = sp.next(results)
#         album['tracks']['items'].extend(results['items'])
        
#     # there is some info in the album that we do not need as we have separate track and artist collections
#     album_bare = dict()
#     for key in ['id', 'album_type', 'genres', 'label', 'name', 'popularity', 'release_date', 'release_date_precision', 'total_tracks', 'type']:
#         album_bare[key] = album[key]
#     album_bare['artists'] = [{'id':artist['id']} for artist in album['artists']]
#     album_bare['tracks'] = [{'id':track['id']} for track in album['tracks']['items']]
#     return album_bare

# album_list = []
# for track in all_tracks:
#     album_id = sp.track(track['id'])['album']['id']
#     ma.coll_tracks.update_one({'_id': track['_id']}, {'$set': {'album_id': album_id}})
#     album_list.append(album_id)

# album_data_list = [get_album_data(album_id) for album_id in set(album_list)]
# ma.coll_albums.insert_many(album_data_list)