In [1]:
import sqlite3
import pprint as pp
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
import datetime

In [2]:
def Timer():
    """Return the current local date and time as a ``datetime.datetime``.

    Two readings can be subtracted to obtain the elapsed ``timedelta``,
    which is how the notebook times its cells.
    """
    now = datetime.datetime.now()
    return now

In [3]:
def createTable(filename, tablename):
    """Open a SQLite database and make sure the similarity table exists.

    Args:
        filename: Database path handed to ``sqlite3.connect``.
        tablename: Name of the table to create if it is missing. It is
            interpolated into the SQL text as-is (identifiers cannot be bound
            as parameters), so it must be a trusted, valid identifier.

    Returns:
        The open ``sqlite3`` connection. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the CREATE TABLE statement fails. The connection
            opened here is closed before the error propagates.
    """
    createConn = sqlite3.connect(filename)
    createTableQuery = """ CREATE TABLE IF NOT EXISTS %s (
                                        target text,
                                        similar text
                                    ); """ %(tablename)

    try:
        createConn.execute(createTableQuery)
    except Exception:
        # Nobody else holds this handle yet, so release it instead of leaking it.
        createConn.close()
        raise
    return createConn

In [4]:
def checkTable(createConn, tablename):
    """Print the number of rows currently in ``tablename``.

    Args:
        createConn: Open sqlite3 connection to query.
        tablename: Table to count. Interpolated into the SQL text as-is, so
            it must be a trusted identifier.

    The result of ``fetchall()`` is printed unchanged, i.e. as a one-element
    list holding a one-element tuple such as ``[(42,)]``. Nothing is returned.
    """
    res = createConn.execute("Select count(*) from %s;" % (tablename))
    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print(res.fetchall())

def deleteTable(createConn, tablename):
    """Drop ``tablename`` from the database behind ``createConn``.

    The table name is placed into the SQL text as-is, so it must be a trusted
    identifier. Any error raised by sqlite3 (for example when the table does
    not exist) propagates to the caller.
    """
    drop_statement = "drop table %s;" % tablename
    createConn.execute(drop_statement)

In [5]:
def insertTable(createConn, tablename, indices, knn):
    """Write (target, similar) id pairs for each row of neighbour indices.

    Args:
        createConn: Open sqlite3 connection that receives the rows.
        tablename: Destination table with two columns. Interpolated into the
            SQL text as-is (identifiers cannot be bound), so it must be a
            trusted identifier.
        indices: Iterable of index sequences. Element 0 of each sequence is
            the target row; the remaining elements are its neighbours.
        knn: Neighbour position at which to stop. Positions 1 .. knn-1 are
            written, so at most ``knn - 1`` pairs are inserted per target.

    Ids are looked up in column 0 of the module-level
    ``ds_song_loudness_tempo``. They are passed as bound parameters rather
    than spliced into the SQL string, so an id containing a quote character
    can neither break the statement nor inject SQL.

    This function does not commit; transaction handling is left to the caller.
    """
    # The statement text is the same for every row, so build it once.
    insertQuery = """ INSERT INTO %s VALUES (?, ?); """ % (tablename)
    counter = 0
    for currentIndex in indices:
        counter = counter + 1
        if counter % 10000 == 0:
            print("Inserted %d items" % (counter))
        # Position 0 is the target itself, so pairing starts at position 1.
        for index in range(1, len(currentIndex)):
            if index == knn:
                break
            createConn.execute(
                insertQuery,
                (ds_song_loudness_tempo[currentIndex[0]][0],
                 ds_song_loudness_tempo[currentIndex[index]][0]),
            )

In [6]:
# Load one row per artist from the track metadata database: the artist id
# followed by that artist's average loudness and average tempo over its songs.
con2tracks = sqlite3.connect("data/track_metadata.db")
select_query = "SELECT artist_id,avg(loudness) loud,avg(tempo) as tempo FROM songs GROUP BY artist_id"
song_loudness_tempo = con2tracks.execute(select_query)
# Materialise every row before the connection is closed below.
# NOTE(review): each row mixes a text id with numeric averages, so the
# resulting array presumably has a string dtype -- confirm before doing
# arithmetic on it directly.
ds_song_loudness_tempo = np.array(song_loudness_tempo.fetchall())
con2tracks.close()

In [7]:
# Drop column 0 (the artist id) so only the loudness and tempo columns remain.
ds_lt = ds_song_loudness_tempo[:,1:]
# Min-max scale both features so neither dominates the distance computation
# used by the nearest-neighbour search.
scaler = MinMaxScaler()
scaled_ds_lt = scaler.fit_transform(ds_lt)
print scaled_ds_lt

[[ 0.74781733  0.57890149]
 [ 0.82336493  0.60147307]
 [ 0.63470579  0.35055107]
 ..., 
 [ 0.86312445  0.30390023]
 [ 0.91274474  0.46859984]
 [ 0.74561743  0.58085308]]




In [8]:
start = Timer()
# Build a ball-tree index over the scaled (loudness, tempo) points, then ask
# for the 200 nearest neighbours of each of those same points.
nbrs = NearestNeighbors(n_neighbors=200, algorithm='ball_tree').fit(scaled_ds_lt)
distances, indices = nbrs.kneighbors(scaled_ds_lt)
# In the recorded output, column 0 of every row is the query point itself at
# distance 0; insertTable relies on that by treating position 0 as the target.
print "Indices = ", indices
print "Distances = ", distances
end = Timer()
print end - start

Indices =  [[   0 3048 3887 ..., 1146  595 2039]
 [   1  389 2424 ..., 2043 2144 2263]
 [   2 2553 2862 ...,  704 2902  893]
 ..., 
 [3885  238  697 ..., 2623 1161  557]
 [3886 2858  441 ..., 1995   71 1112]
 [3887    0 3048 ..., 3445 1099 1155]]
Distances =  [[ 0.          0.00137235  0.00294079 ...,  0.04783834  0.04794134
   0.04802616]
 [ 0.          0.00358754  0.00665557 ...,  0.05287603  0.05292688
   0.05296496]
 [ 0.          0.00271712  0.00596761 ...,  0.06275931  0.06277694
   0.06289079]
 ..., 
 [ 0.          0.01260172  0.01355879 ...,  0.09015644  0.09042385
   0.09051056]
 [ 0.          0.00565945  0.0143214  ...,  0.07532621  0.07549806
   0.07556594]
 [ 0.          0.00294079  0.0035032  ...,  0.04884964  0.04887829
   0.04891858]]
0:00:00.311914


## Ball Tree - 60: Calculating Accuracy

In [9]:
start = Timer()
# Disabled cleanup step; createConn is only (re)assigned two lines below, so
# this call would use whatever connection an earlier cell left behind.
# deleteTable(createConn, 'similarity_loudness_tempo_balltree_60')
algo_tbl_name = 'similarity_loudness_tempo_balltree_60'
# Open the similarity database and create the prediction table if it is missing.
createConn = createTable('data/subset_artist_similarity.db',algo_tbl_name)
# insertTable skips position 0 (the artist itself) and stops at position 60,
# so 59 rows are written per artist; the recorded count 229392 is 3888 x 59.
insertTable(createConn,algo_tbl_name, indices, 60)
checkTable(createConn,algo_tbl_name)
end = Timer()
print end - start

[(229392,)]
0:00:04.122907


In [10]:
start = Timer()
# Divisor applied to the ground-truth counts when the accuracy denominator is
# computed in the following cell.
learning_factor=9.8
total = 0.00
# Only the table name is interpolated (identifiers cannot be bound). The artist
# id is passed as a bound parameter, so quoting inside an id cannot break or
# alter the statement, and the SQL text is built once instead of per iteration.
match_query = "SELECT * FROM similarity as actual INNER JOIN %s as predict ON actual.target = predict.target and actual.similar = predict.similar where actual.target=?;" %(algo_tbl_name)
# Count, for the first 20 artists, how many predicted pairs also appear in the
# ground-truth similarity table.
for i in range(0,20):
    res = createConn.execute(match_query, (ds_song_loudness_tempo[i][0],))
    final = res.fetchall()
    total = total + len(final)
# Take the end timestamp once, after all queries have finished.
end = Timer()
print(end - start)

0:00:17.104531


In [11]:
# Number of ground-truth similar artists per target, for 20 targets.
# NOTE(review): LIMIT 20 has no ORDER BY, so which 20 targets are returned is
# up to SQLite -- confirm they are the same 20 artists used to compute `total`.
res = createConn.execute("SELECT count(*) FROM similarity GROUP BY target limit 20;")
counts = res.fetchall()
denominator=0.00
for i in counts:
    # Each target's count is divided by learning_factor and truncated to an
    # integer before being accumulated.
    denominator = denominator + int(i[0]/learning_factor)

18.0


In [12]:
# Reported accuracy = matched pairs (total) / scaled ground-truth count (denominator).
print "Accuracy for Ball-Tree for 60 Neighbours", total/denominator
# NOTE(review): no commit() is called before this close(), so the rows written
# by insertTable are presumably discarded here -- confirm that is intended.
createConn.close()

Accuracy for Ball-Tree for 60 Neighbours 0.555555555556


## Ball Tree - 45

In [13]:
start = Timer()
# Disabled leftover from the 60-neighbour section (note the _60 table name);
# createConn is only (re)assigned two lines below.
# deleteTable(createConn, 'similarity_loudness_tempo_balltree_60')
algo_tbl_name = 'similarity_loudness_tempo_balltree_45'
# Open the similarity database and create the prediction table if it is missing.
createConn = createTable('data/subset_artist_similarity.db',algo_tbl_name)
# insertTable skips position 0 (the artist itself) and stops at position 45,
# so 44 rows are written per artist; the recorded count 171072 is 3888 x 44.
insertTable(createConn,algo_tbl_name, indices, 45)
checkTable(createConn,algo_tbl_name)
end = Timer()
print end - start

[(171072,)]
0:00:02.992097


In [14]:
start = Timer()
total = 0.00
# Only the table name is interpolated (identifiers cannot be bound). The artist
# id is passed as a bound parameter, so quoting inside an id cannot break or
# alter the statement, and the SQL text is built once instead of per iteration.
match_query = "SELECT * FROM similarity as actual INNER JOIN %s as predict ON actual.target = predict.target and actual.similar = predict.similar where actual.target=?;" %(algo_tbl_name)
# Count, for the first 20 artists, how many predicted pairs also appear in the
# ground-truth similarity table.
for i in range(0,20):
    res = createConn.execute(match_query, (ds_song_loudness_tempo[i][0],))
    final = res.fetchall()
    total = total + len(final)
# Take the end timestamp once, after all queries have finished.
end = Timer()
print(end - start)

0:00:11.644005


In [15]:
# Number of ground-truth similar artists per target, for 20 targets.
# NOTE(review): LIMIT 20 has no ORDER BY, so which 20 targets are returned is
# up to SQLite -- confirm they are the same 20 artists used to compute `total`.
res = createConn.execute("SELECT count(*) FROM similarity GROUP BY target limit 20;")
counts = res.fetchall()
denominator=0.00
for i in counts:
    # learning_factor is not assigned in this section; the value left by the
    # most recently executed cell that set it is used.
    denominator = denominator + int(i[0]/learning_factor)

18.0


In [16]:
# Reported accuracy = matched pairs (total) / scaled ground-truth count (denominator).
print "Accuracy for Ball-Tree for 45 Neighbours", total/denominator
# NOTE(review): no commit() is called before this close(), so the rows written
# by insertTable are presumably discarded here -- confirm that is intended.
createConn.close()

Accuracy for Ball-Tree for 45 Neighbours 0.5


## Ball Tree - 90

In [17]:
start = Timer()
# Disabled leftover from the 60-neighbour section (note the _60 table name);
# createConn is only (re)assigned two lines below.
# deleteTable(createConn, 'similarity_loudness_tempo_balltree_60')
algo_tbl_name = 'similarity_loudness_tempo_balltree_90'
# Open the similarity database and create the prediction table if it is missing.
createConn = createTable('data/subset_artist_similarity.db',algo_tbl_name)
# insertTable skips position 0 (the artist itself) and stops at position 90,
# so 89 rows are written per artist; the recorded count 346032 is 3888 x 89.
insertTable(createConn,algo_tbl_name, indices, 90)
checkTable(createConn,algo_tbl_name)
end = Timer()
print end - start

[(346032,)]
0:00:05.358122


In [18]:
start = Timer()
# Divisor applied to the ground-truth counts when the accuracy denominator is
# computed in the following cell (differs from the value used earlier).
learning_factor=6.8
total = 0.00
# Only the table name is interpolated (identifiers cannot be bound). The artist
# id is passed as a bound parameter, so quoting inside an id cannot break or
# alter the statement, and the SQL text is built once instead of per iteration.
match_query = "SELECT * FROM similarity as actual INNER JOIN %s as predict ON actual.target = predict.target and actual.similar = predict.similar where actual.target=?;" %(algo_tbl_name)
# Count, for the first 20 artists, how many predicted pairs also appear in the
# ground-truth similarity table.
for i in range(0,20):
    res = createConn.execute(match_query, (ds_song_loudness_tempo[i][0],))
    final = res.fetchall()
    total = total + len(final)
# Take the end timestamp once, after all queries have finished.
end = Timer()
print(end - start)

0:00:23.738095


In [19]:
# Number of ground-truth similar artists per target, for 20 targets.
# NOTE(review): LIMIT 20 has no ORDER BY, so which 20 targets are returned is
# up to SQLite -- confirm they are the same 20 artists used to compute `total`.
res = createConn.execute("SELECT count(*) FROM similarity GROUP BY target limit 20;")
counts = res.fetchall()
denominator=0.00
for i in counts:
    # Each target's count is divided by learning_factor and truncated to an
    # integer before being accumulated.
    denominator = denominator + int(i[0]/learning_factor)

31.0


In [20]:
# Reported accuracy = matched pairs (total) / scaled ground-truth count (denominator).
print "Accuracy for Ball-Tree for 90 Neighbours", total/denominator
# NOTE(review): no commit() is called before this close(), so the rows written
# by insertTable are presumably discarded here -- confirm that is intended.
createConn.close()

Accuracy for Ball-Tree for 90 Neighbours 0.451612903226
