In [1]:
from IPython.core.display import HTML

In [3]:
tutorial_url_src = "http://labrosa.ee.columbia.edu/millionsong/sites/default/files/tutorial3.pdf"
HTML("<iframe src='%s' width=1000 height=1000></iframe" % tutorial_url_src)

In [4]:
import os
import sys
import glob
import time
import datetime
import numpy as np
try:
    import sqlite3
except ImportError:
    print 'you need sqlite3 installed to use this program'
    sys.exit(0)

# Helper Function to ensure string is proper to be used in a SQLite query

In [5]:
def encode_string(s):
    """
    Simple utility function to make sure a string is proper
    to be used in a SQLite query
    """
    return "'"+sreplace("'","''") + "'"

In [9]:
!ls MSongsDB/Tasks_Demos/SQLite

create_artist_similarity_db.py	demo_track_metadata.py
create_artist_terms_db.py	list_all_artists_from_db.py
create_track_metadata_db.py	list_all_tracks_from_db.py
demo_artist_similarity.py	README.txt
demo_artist_term.py


# Connect to DB file

In [19]:
# PATH to track_metadata.db
dbfile = 'MillionSongSubset/AdditionalFiles/subset_track_metadata.db'

In [20]:
conn = sqlite3.connect(dbfile)
c = conn.cursor()

TABLENAME = 'songs'

print '*************** GENERAL SQLITE DEMO ***************'

*************** GENERAL SQLITE DEMO ***************


# Show songs table

In [21]:
q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
res = c.execute(q)
print "* tables contained in that SQLite file/database (should only be 'songs'):"

* tables contained in that SQLite file/database (should only be 'songs'):


In [22]:
print res.fetchall()

[(u'songs',)]


In [31]:
q = "SELECT name FROM sqlite_master WHERE tbl_name = 'songs' AND type = 'table' "
res = c.execute(q)
print "* get info on column names (original table creation command):"

* get info on column names (original table creation command):


In [32]:
print res.fetchall()[0][0]

songs


# List Indices

In [33]:
# list all the indices
q = "SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='songs' ORDER BY name"
res = c.execute(q)
print "* one of the index we added to the table to make things faster:"
print res.fetchone()

* one of the index we added to the table to make things faster:
(u'idx_artist_id',)


# Select Artists by name: The Beatles

In [36]:
# find the primary key of the query
# by default it's called ROWID, it would have be redefined if our primary key is type INTEGER
q = "SELECT ROWID FROM songs WHERE artist_name='The Beatles'"
res = c.execute(q)
print "* get the primary key (ROWID) of one entry where the artist is The Beatles:"

* get the primary key (ROWID) of one entry where the artist is The Beatles:


In [37]:
print res.fetchone()

(1040,)


In [38]:
q = "SELECT * FROM songs WHERE artist_name='The Beatles' LIMIT 1"
res = c.execute(q)
print "* get all that we have about one track from The Beatles" 

* get all that we have about one track from The Beatles


In [40]:
print res.fetchone()

(u'TRAHSSO128EF347345', u'Derek Taylor - Introduction', u'SORTPSA12A67ADBFE2', u'Here There And Everywhere', u'AR6XZ861187FB4CECD', u'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d', u'The Beatles', 53.78567, 0.840409662154, 0.840462688027, 0)


# Demo Around Artist ID

In [44]:
# query for all artists in Echo Nest ID
# the column name is 'artist_id'
# DISTINCT makes sures you only get each ID returned once
q = "SELECT DISTINCT artist_id FROM " + TABLENAME
res = c.execute(q)
artists = res.fetchall() # does the actual job of searching the db

print "* found", len(artists), "unique artist IDs, response looks like: "

* found 3888 unique artist IDs, response looks like: 


In [45]:
print artists[:3]

[(u'AR009211187B989185',), (u'AR00A6H1187FB5402A',), (u'AR00LNI1187FB444A5',)]


In [48]:
# more cumbersome: get artist ID, but with one track ID for each
# very useful, it gives you a HDF5 file to query if you 
# wish to have more information for each artist

q = "SELECT artist_id,track_id FROM songs GROUP BY artist_id"
res = c.execute(q)
artist_track_pair = res.fetchone()
print "* one unique artist with some track (chose at random) associated with it:"

* one unique artist with some track (chose at random) associated with it:


In [49]:
print artist_track_pair

(u'AR009211187B989185', u'TRBFYQS128F92E83A0')


In [53]:
# get artists only having one track in the database
q = "SELECT artist_id, track_id FROM songs GROUP BY artist_id HAVING (COUNT(artist_id) = 1)"
q += " ORDER BY RANDOM()"
res = c.execute(q)
artist_track_pair = res.fetchone()
print "* one artist that only has one track in the database"

* one artist that only has one track in the database


In [54]:
print artist_track_pair

(u'ARFIUBJ1241B9CA443', u'TRBFCKE12903CFA9DB')


In [56]:
# get artists with no musicbrainz ID
# of course, we only w*nt once each artist
# for demo purpose, we only ask for two at random
q = "SELECT artist_id, artist_mbid FROM songs WHERE artist_mbid=''"
q += " GROUP BY artist_id ORDER BY RANDOM() LIMIT 2"
res = c.execute(q)
print "* two track id from 'The Beatles,' found by looking up the artist by name:"


* two track id from 'The Beatles,' found by looking up the artist by name:


In [57]:
print res.fetchall()[:2]

[(u'ARTKOGW1241B9CD42B', u''), (u'ARMIQRB12298900AFB', u'')]


In [58]:
# we find all release starting by letter 'T'
# T != t, we're just look at albums starting with T
# here we use DISTINCT instead of GROUP BY artist_id 
# since it's fine we find twice the same artist, as long it is not the same (artist_id, release) pair
q = "SELECT DISTINCT artist_id, release FROM songs WHERE SUBSTR(release, 1, 1)='T'"
res = c.execute(q)
print "* one unique artist/release pair where album starts with capital T:"

* one unique artist/release pair where album starts with capital T:


In [59]:
print res.fetchone()

(u'AR009211187B989185', u'The Other Side of Love')


# Demo around Floats

In [60]:
# get all artists whose artist familiarity is > 0.8
q = "SELECT DISTINCT artist_name, artist_familiarity FROM songs WHERE artist_familiarity >.8"
res.execute(q)
print "* one artist having familiarity >.8"

* one artist having familiarity >.8


In [61]:
print res.fetchone()

(u'Atreyu', 0.800283912372)


In [64]:
# select one artist with highest artist familiarity but no artist_hotnesss
q = "SELECT DISTINCT artist_name , artist_familiarity as af , artist_hotnnnesss as ah"
q += " FROM songs WHERE ah < 0 ORDER BY af"
res.execute(q)

OperationalError: no such column: artist_hotnnnesss