In [1]:
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *

In [2]:
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
cur = conn.cursor()

In [3]:
loc_song_select = """SELECT
                    -- song_id
                    songs.song_id
                    -- artist_id
                   ,songs.artist_id
                FROM
                    songs
                    JOIN artists
                        ON songs.artist_id = artists.artist_id
                WHERE
                    songs.title = %s
                    AND
                    artists.name = %s
                    AND
                    songs.duration = %s;
"""
             

In [4]:
cur.execute("""SELECT songs.title, artists.name, songs.duration FROM songs JOIN artists ON songs.artist_id = artists.artist_id;""")
songs = pd.DataFrame(cur.fetchall())
songs.head()

Unnamed: 0,0,1,2
0,Kutt Free (DJ Volume Remix),Jinx,407.37914
1,Sono andati? Fingevo di dormire,Montserrat Caballé;Placido Domingo;Vicente Sar...,511.16363
2,A Higher Place (Album Version),Tom Petty,236.17261
3,Ten Tonne,Chase & Status,337.68444
4,Jenny Take a Ride,Mitch Ryder,207.43791


In [5]:
cur.execute(loc_song_select, ('Kutt Free (DJ Volume Remix)', 'Jinx', 407.37914))
test = cur.fetchone()
test

('SOFNOQK12AB01840FC', 'ARNNKDK1187B98BBD5')

# If I only query for songs that are in the songs database, I can get matches for song_id and artist_id

In [6]:
for i, song in songs.iterrows():
    print(song)
    s = song[0]
    a = song[1]
    l = song[2]
    query_params = (s, a, l)
    print(query_params)
    cur.execute(loc_song_select, query_params)
    results = cur.fetchall()
    for r in results:
        print(r,'\n')

0    Kutt Free (DJ Volume Remix)
1                           Jinx
2                        407.379
Name: 0, dtype: object
('Kutt Free (DJ Volume Remix)', 'Jinx', 407.37914)
('SOFNOQK12AB01840FC', 'ARNNKDK1187B98BBD5') 

0                      Sono andati? Fingevo di dormire
1    Montserrat Caballé;Placido Domingo;Vicente Sar...
2                                              511.164
Name: 1, dtype: object
('Sono andati? Fingevo di dormire', 'Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti', 511.16363)
('SOBAYLL12A8C138AF9', 'ARDR4AC1187FB371A1') 

0    A Higher Place (Album Version)
1                         Tom Petty
2                           236.173
Name: 2, dtype: object
('A Higher Place (Album Version)', 'Tom Petty', 236.17261)
('SOFFKZS12AB017F194', 'ARBEBBY1187B9B43DB') 

0         Ten Tonne
1    Chase & Status
2           337.684
Name: 3, dtype: object
('Ten Tonne', 'Chase & Status', 337.68444)
('SOGVQGJ12AB017F169', 'AR62SOJ1187FB

# Now, open up a log file, loop through entries in the log file and see if any of the songs from the log file are in the songs table

Note, this after processing all the songs available using etl.py. These are the same songs that would be available when trying to find artist_id and song_id during log file processing.

In [7]:
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files:
            all_files.append(os.path.abspath(f))
    
    return all_files

In [8]:
log_files = get_files('data/log_data')

In [9]:
filepath = log_files[0]

In [10]:
df = pd.read_json(filepath, lines=True)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Sydney Youngblood,Logged In,Jacob,M,53,Klein,238.07955,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,954,Ain't No Sunshine,200,1543449657796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73
1,Gang Starr,Logged In,Layla,F,88,Griffin,151.92771,paid,"Lake Havasu City-Kingman, AZ",PUT,NextSong,1541057000000.0,984,My Advice 2 You (Explicit),200,1543449690796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",24
2,3OH!3,Logged In,Layla,F,89,Griffin,192.522,paid,"Lake Havasu City-Kingman, AZ",PUT,NextSong,1541057000000.0,984,My First Kiss (Feat. Ke$ha) [Album Version],200,1543449841796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",24
3,RÃÂ¶yksopp,Logged In,Jacob,M,54,Klein,369.81506,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,954,The Girl and The Robot,200,1543449895796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73
4,Kajagoogoo,Logged In,Layla,F,90,Griffin,223.55546,paid,"Lake Havasu City-Kingman, AZ",PUT,NextSong,1541057000000.0,984,Too Shy,200,1543450033796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",24


In [11]:
# filter by NextSong
df = df[df['page']=='NextSong']

# Loop through log file, first print the song name from the log file, then the row where that song is in the songs table.
# Notice that not a single song in the log file is present in the songs database!
# There is a mismatch between the log files and the song files. So frustrating to have spent
# several hours troubleshooting when this possibility should have been communicated with us!

In [19]:
# insert songplay records
for index, row in df.iterrows():
    s = row.song
    a = row.artist
    l = row.length
    query_params = (s, a, l)
    print(s)
    print(songs[(songs[0]==s)])#&(songs[1]==a)&(songs[2]==l)])
    # get songid and artistid from song and artist tables
    cur.execute(song_select, query_params)
    results = cur.fetchone()
    print(results)
    if results:
        songid, artistid = results
    else:
        songid, artistid = s, a

    # insert songplay record
    songplay_data = (row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent,)
    print(songplay_data)

Ain't No Sunshine
Empty DataFrame
Columns: [0, 1, 2]
Index: []
None
(1543449657796, '73', 'paid', "Ain't No Sunshine", 'Sydney Youngblood', 954, 'Tampa-St. Petersburg-Clearwater, FL', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"')
My Advice 2 You (Explicit)
Empty DataFrame
Columns: [0, 1, 2]
Index: []
None
(1543449690796, '24', 'paid', 'My Advice 2 You (Explicit)', 'Gang Starr', 984, 'Lake Havasu City-Kingman, AZ', '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"')
My First Kiss (Feat. Ke$ha) [Album Version]
Empty DataFrame
Columns: [0, 1, 2]
Index: []
None
(1543449841796, '24', 'paid', 'My First Kiss (Feat. Ke$ha) [Album Version]', '3OH!3', 984, 'Lake Havasu City-Kingman, AZ', '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"')
The Girl and The Robot
Empty DataFrame
Columns: [0, 1, 2]
