#### Use this notebook to develop the ETL process for each of your tables before completing the etl.py file to load the whole datasets.



In [1]:
import os 
import glob
import psycopg2
import pandas as pd
from sql_queries import *

In [13]:
# Connection string for the Sparkify database. The SPARKIFY_DSN environment
# variable overrides it so credentials do not have to live in the notebook;
# the fallback keeps the original local development settings.
dsn = os.environ.get(
    "SPARKIFY_DSN",
    "host=127.0.0.1 dbname=sparkifydb user=postgres password=student",
)
conn = psycopg2.connect(dsn)
# Cursor used by the cells below to run the insert queries.
cur = conn.cursor()

In [3]:
def get_files(filepath):
    """Return the absolute paths of all ``*.json`` entries under ``filepath``.

    The directory tree rooted at ``filepath`` is walked recursively and each
    visited directory is searched for names matching ``*.json``.

    Args:
        filepath: Root directory to search.

    Returns:
        list[str]: Absolute paths of the matches. The list is empty when
        nothing matches or ``filepath`` does not exist.
    """
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        # Escape the directory part so glob metacharacters in a directory
        # name ('[', ']', '*', '?') are matched literally, not as a pattern.
        pattern = os.path.join(glob.escape(root), '*.json')
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))
    return all_files
    

## Process song_data

Perform ETL on the first dataset, `song_data`, to create the **songs** and **artists** dimension tables.

Let's perform ETL on a single song file and load a single record into each table to start
 - Use the get_files function provided above to get a list of all song JSON files in data/song_data
 - Select the first song in this list
 - Read the song file and view the data

In [4]:
# Collect the absolute paths of every song JSON file below data/song_data
# (the path is relative to the notebook's working directory).
song_files = get_files('data/song_data')

In [5]:
# Fail with a clear message if no song files were found, instead of an
# opaque IndexError from the indexing below.
assert song_files, 'no song JSON files found; check the data/song_data path'
# Work with the first file only while developing the ETL steps.
filepath = song_files[0]
print(filepath)

/Users/itsmuriuki/Desktop/Data_Engineering/Data_Modeling/project_1_RDBMS_Postgres/data/song_data/A/A/A/TRAAAEF128F4273421.json


In [6]:
# Each song file holds a single JSON object. typ='series' makes pandas return
# it as one Series whose index holds the field names, rather than a DataFrame.
df = pd.read_json(filepath, typ='series')
df

num_songs                            1
artist_id           AR7G5I41187FB4CE6C
artist_latitude                   None
artist_longitude                  None
artist_location        London, England
artist_name                   Adam Ant
song_id             SONHOTT12A8C13493C
title                  Something Girls
duration                       233.404
year                              1982
dtype: object

## Songs Table

### Extract data for songs table
 - select the columns for song_id, title, artist_id, year and duration
 - use df.values to select just the values from the dataframe
 - index to select the first (only) record in the dataframe
 - convert the array into a list and set it to song_data


In [7]:
# Pick the songs-table fields out of the song record by label, then strip the
# index so only the raw values remain.
songs_data = df.loc[
    ['song_id', 'title', 'artist_id', 'year', 'duration']
]
songs_data_values = songs_data.values
songs_data_values

array(['SONHOTT12A8C13493C', 'Something Girls', 'AR7G5I41187FB4CE6C',
       1982, 233.40363], dtype=object)

In [8]:
# df was read as a Series, so songs_data_values is already one flat record;
# the full slice keeps every field of it.
first_record = songs_data_values[:]
first_record

array(['SONHOTT12A8C13493C', 'Something Girls', 'AR7G5I41187FB4CE6C',
       1982, 233.40363], dtype=object)

In [9]:
# Convert the NumPy array into a plain Python list so it can be passed as the
# parameters of the insert query.
song_data = first_record.tolist()

In [10]:
# Display the record that will be inserted into the songs table.
song_data

['SONHOTT12A8C13493C',
 'Something Girls',
 'AR7G5I41187FB4CE6C',
 1982,
 233.40363]

In [11]:
# Sanity check: song_data must be a plain Python list before it is used as
# query parameters.
assert isinstance(song_data, list), 'song_data should be a list'
song_data

['SONHOTT12A8C13493C',
 'Something Girls',
 'AR7G5I41187FB4CE6C',
 1982,
 233.40363]

### Insert record to song table 

Implement the song_table_insert query in *sql_queries.py* and run the cell below to insert a record for this song into the songs table.

Run *create_tables.py* before running the cell below to ensure you have created and reset the songs table in the sparkifydb database.

In [14]:
# Insert the song record and make it permanent. If the statement fails (for
# example on a constraint violation), roll back so the connection is not left
# in an aborted transaction that rejects every later command, then re-raise
# so the error is still visible in the notebook.
try:
    cur.execute(song_table_insert, song_data)
    conn.commit()
except psycopg2.Error:
    conn.rollback()
    raise

## Artist Table

#### Extract data for artists table
 - Select columns for artist_id, name, location, latitude, longitude
 - use df.values to select just the values from the dataframe
 - index to select the first(only) record in the dataframe
 - convert the array to a list and set it to artist_data


In [17]:
# Pick the artists-table fields out of the song record by label.
artist_data = df.loc[
    [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    ]
]
artist_data

artist_id           AR7G5I41187FB4CE6C
artist_name                   Adam Ant
artist_location        London, England
artist_latitude                   None
artist_longitude                  None
dtype: object

In [19]:
# .values drops the index labels and returns only the field values as a
# NumPy array.
artist_data_values = artist_data.values
artist_data_values

array(['AR7G5I41187FB4CE6C', 'Adam Ant', 'London, England', None, None],
      dtype=object)

In [21]:
# df was read as a Series, so artist_data_values is already one flat record;
# the full slice keeps every field of it.
# NOTE: this reuses the name first_record from the songs section above.
first_record = artist_data_values[:]
first_record


array(['AR7G5I41187FB4CE6C', 'Adam Ant', 'London, England', None, None],
      dtype=object)

In [23]:
# Convert the NumPy array into a plain Python list; artist_data is rebound
# from the Series selected earlier to this list.
artist_data= first_record.tolist()
artist_data

['AR7G5I41187FB4CE6C', 'Adam Ant', 'London, England', None, None]

In [24]:
# Sanity check: artist_data must be a plain Python list.
assert isinstance(artist_data, list), 'artist_data should be a list'

artist_data

['AR7G5I41187FB4CE6C', 'Adam Ant', 'London, England', None, None]