In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
config = configparser.ConfigParser()

config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']


In [3]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [4]:
#df = spark.read.json('s3://udacity-dend/song_data')
song_log = spark.read.json('data/song_data/A/*/*/*.json')
song_log.printSchema()
song_log.show(5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|  artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+-----------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                 |            null|Montserrat Caball...|511.16363|   

In [5]:
song_log.createOrReplaceTempView("song_log_table")

In [6]:
spark.sql("select count(1) from song_log_table").show()

+--------+
|count(1)|
+--------+
|      71|
+--------+



In [7]:
# artist table
#spark.sql("select distinct artist_id, artist_name name,artist_location location, artist_latitude latitude, artist_longitude longitude from song_log_table").show()
artist_frame = spark.sql("select distinct artist_id, artist_name name,artist_location location, artist_latitude latitude, artist_longitude longitude from song_log_table")
artist_frame.count()
#artist_frame.write.json("data/artistoutput")
#artist_frame.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [87]:
artist_frame.createOrReplaceTempView("artist_table")
spark.sql("select * from artist_table").show()

+------------------+--------------------+--------------------+--------+----------+
|         artist_id|                name|            location|latitude| longitude|
+------------------+--------------------+--------------------+--------+----------+
|ARPBNLO1187FB3D52F|            Tiny Tim|        New York, NY|40.71455| -74.00712|
|ARBEBBY1187B9B43DB|           Tom Petty|     Gainesville, FL|    null|      null|
|AR0IAWL1187B9A96D0|        Danilo Perez|              Panama|  8.4177| -80.11278|
|ARMBR4Y1187B9990EB|        David Martin|     California - SF|37.77916|-122.42005|
|ARD0S291187B9B7BF5|             Rated R|                Ohio|    null|      null|
|AR0RCMP1187FB3F427|    Billie Jo Spears|        Beaumont, TX|30.08615| -94.10158|
|ARKRRTF1187B9984DA|    Sonora Santanera|                    |    null|      null|
|ARHHO3O1187B989413|           Bob Azzam|                    |    null|      null|
|ARJIE2Y1187B994AB7|         Line Renaud|                    |    null|      null|
|ARG

In [88]:
# songs table
spark.sql("select distinct song_id, title, artist_id, year, duration from song_log_table").show()
song_frame = spark.sql("select distinct song_id, title, artist_id, year, duration from song_log_table")

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|ARB29H41187B98F0EF|1972|342.56934|
|SOOJPRH12A8C141995|   Loaded Like A Gun|ARBGXIG122988F409D|   0|173.19138|
|SOFCHDR12AB01866EF|         Living Hell|AREVWGE1187B9B890A|   0|282.43546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOTUKVB12AB0181477|   Blessed Assurance|AR7ZKHQ1187B98DD73|1993|  270.602|
|SOMVWWT12A58A7AE05|Knocked Out Of Th...|ARQ9BO41187FB5CF1F|   0|183.17016|
|SOBEBDG12A58A76D60|        Kassie Jones|ARI3BMM1187FB4255E|   0|220.78649|
|SOILPQQ12AB017E82A|Sohna Nee Sohna Data|AR1ZHYZ1187FB3C717|   0|599.24853|
|SOYMRWW12A6D4FAB14|The Moon And I (O...|ARKFYS91187B98E58F|   0| 267.7024|
|SOBCOSW12A8

In [97]:
song_frame.createOrReplaceTempView("song_table")
spark.sql("select * from song_table").show()

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|ARB29H41187B98F0EF|1972|342.56934|
|SOOJPRH12A8C141995|   Loaded Like A Gun|ARBGXIG122988F409D|   0|173.19138|
|SOFCHDR12AB01866EF|         Living Hell|AREVWGE1187B9B890A|   0|282.43546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOTUKVB12AB0181477|   Blessed Assurance|AR7ZKHQ1187B98DD73|1993|  270.602|
|SOMVWWT12A58A7AE05|Knocked Out Of Th...|ARQ9BO41187FB5CF1F|   0|183.17016|
|SOBEBDG12A58A76D60|        Kassie Jones|ARI3BMM1187FB4255E|   0|220.78649|
|SOILPQQ12AB017E82A|Sohna Nee Sohna Data|AR1ZHYZ1187FB3C717|   0|599.24853|
|SOYMRWW12A6D4FAB14|The Moon And I (O...|ARKFYS91187B98E58F|   0| 267.7024|
|SOBCOSW12A8

In [90]:
#df = spark.read.json('s3://udacity-dend/song_data')
log_data = spark.read.json('data/log_data/')
log_data.printSchema()
log_data.show(5)

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            lo

In [91]:
log_data.createOrReplaceTempView("log_data_table")

In [92]:
spark.sql("select count(1) from log_data_table").show()

+--------+
|count(1)|
+--------+
|    8056|
+--------+



In [93]:
# users table
spark.sql("select distinct userId user_id, firstName first_name, lastName last_name, gender, level from log_data_table").show()

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     98|    Jordyn|   Powell|     F| free|
|     34|    Evelin|    Ayala|     F| free|
|     85|   Kinsley|    Young|     F| paid|
|     38|    Gianna|    Jones|     F| free|
|     85|   Kinsley|    Young|     F| free|
|     63|      Ayla|  Johnson|     F| free|
|     37|    Jordan|    Hicks|     F| free|
|      6|   Cecilia|    Owens|     F| free|
|     15|      Lily|     Koch|     F| paid|
|     27|    Carlos|   Carter|     M| free|
|     89|   Kynnedi|  Sanchez|     F| free|
|     21|   Preston|  Sanders|     M| free|
|     57| Katherine|      Gay|     F| free|
|     74|    Braden|   Parker|     M| free|
|     29|Jacqueline|    Lynch|     F| paid|
|     75|    Joseph|Gutierrez|     M| free|
|     61|    Samuel| Gonzalez|     M| free|
|     88|  Mohammad|Rodriguez|     M| free|
|     64|    Hannah|  Calhoun|     F| free|
|     15|      Lily|     Koch|  

In [94]:
# time table
spark.sql("select ts, from_unixtime(ts/1000.0, 'MM-dd-yyyy HH:mm:ss') start_time, from_unixtime(ts/1000.00,'HH') hour, \
from_unixtime(ts/1000.0,'dd') day, from_unixtime(ts/1000.0,'ww') week, \
from_unixtime(ts/1000.0,'MM') month, from_unixtime(ts/1000.0,'yyyy') year, from_unixtime(ts/1000.0,'u') weekday \
           from log_data_table where ts is not null").show()

         

+-------------+-------------------+----+---+----+-----+----+-------+
|           ts|         start_time|hour|day|week|month|year|weekday|
+-------------+-------------------+----+---+----+-----+----+-------+
|1542241826796|11-15-2018 00:30:26|  00| 15|  46|   11|2018|      4|
|1542242481796|11-15-2018 00:41:21|  00| 15|  46|   11|2018|      4|
|1542242741796|11-15-2018 00:45:41|  00| 15|  46|   11|2018|      4|
|1542247071796|11-15-2018 01:57:51|  01| 15|  46|   11|2018|      4|
|1542252577796|11-15-2018 03:29:37|  03| 15|  46|   11|2018|      4|
|1542253449796|11-15-2018 03:44:09|  03| 15|  46|   11|2018|      4|
|1542253460796|11-15-2018 03:44:20|  03| 15|  46|   11|2018|      4|
|1542260074796|11-15-2018 05:34:34|  05| 15|  46|   11|2018|      4|
|1542260277796|11-15-2018 05:37:57|  05| 15|  46|   11|2018|      4|
|1542260935796|11-15-2018 05:48:55|  05| 15|  46|   11|2018|      4|
|1542261224796|11-15-2018 05:53:44|  05| 15|  46|   11|2018|      4|
|1542261356796|11-15-2018 05:55:56

In [99]:
#songplays table
spark.sql("select from_unixtime(a.ts/1000.0, 'MM-dd-yyyy HH:mm:ss') start_time, a.userId  user_id, a.level, a.song, b.song_id, a.artist,\
c.artist_id, a.sessionId  session_id, a.location, a.userAgent user_agent  \
from log_data_table a, song_table b, artist_table c \
          where a.page='NextSong'\
          and a.song = b.title \
          and a.artist = c.name").show()

+-------------------+-------+-----+--------------+------------------+------+------------------+----------+--------------------+--------------------+
|         start_time|user_id|level|          song|           song_id|artist|         artist_id|session_id|            location|          user_agent|
+-------------------+-------+-----+--------------+------------------+------+------------------+----------+--------------------+--------------------+
|11-21-2018 21:56:47|     15| paid|Setanta matins|SOZCTXZ12AB0182364| Elena|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|
+-------------------+-------+-----+--------------+------------------+------+------------------+----------+--------------------+--------------------+



In [101]:
#write to file
songplays_frame = spark.sql("select from_unixtime(a.ts/1000.0, 'MM-dd-yyyy HH:mm:ss') start_time, a.userId  user_id, a.level, a.song, b.song_id, a.artist,\
c.artist_id, a.sessionId  session_id, a.location, a.userAgent user_agent  \
from log_data_table a, song_table b, artist_table c \
          where a.page='NextSong'\
          and a.song = b.title \
          and a.artist = c.name")
songplays_frame.write.parquet("output/songplays.parquet")