In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, to_date
from pyspark.sql.functions import from_unixtime

In [2]:
# Create the Spark session shared by every cell below (getOrCreate reuses a
# session if one is already running). The hadoop-aws package is requested at
# start-up — presumably for s3a:// access; every path read in this notebook is
# a relative one, so confirm whether it is still needed here.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## 1. Checking Songs Table 

In [3]:
# Root directory (relative to the working directory) that holds the parquet
# tables inspected in the sections below.
output = "output"

In [4]:
# Path of the songs table inside the output directory.
output_data = os.path.join(output, 'songs')

In [5]:
# Echo the joined path as a quick sanity check.
output_data

'output/songs'

In [6]:
# Load the songs table into a DataFrame.
# The path is built here rather than taken from the shared `output_data`
# variable: a later cell re-points `output_data` at the artists table, so
# reading through it would load the wrong table if this cell were re-run
# out of order.
songs = spark.read.parquet(os.path.join(output, 'songs'))

In [7]:
# Confirm that the parquet read produced a Spark DataFrame.
type(songs)

pyspark.sql.dataframe.DataFrame

In [8]:
# List the column names of the songs table.
songs.columns

['song_id', 'title', 'duration', 'year', 'artist_id']

In [9]:
# Preview five songs whose `year` is positive (presumably 0 marks a missing
# release year — confirm), printing every column at full width.
songs.where(songs.year > 0).show(n=5, truncate=False)

+------------------+---------------------------------------------+---------+----+------------------+
|song_id           |title                                        |duration |year|artist_id         |
+------------------+---------------------------------------------+---------+----+------------------+
|SOEKAZG12AB018837E|I'll Slap Your Face (Entertainment USA Theme)|129.85424|2001|ARSVTNL1187B992A91|
|SOAFBCP12A8C13CC7D|King Of Scurf (2007 Digital Remaster)        |301.40036|1972|ARTC1LV1187B9A4858|
|SORRNOC12AB017F52B|The Last Beat Of My Heart (b-side)           |337.81506|2004|ARSZ7L31187FB4E610|
|SOQPWCR12A6D4FB2A3|A Poor Recipe For Civic Cohesion             |118.07302|2005|AR73AIO1187B9AD57B|
|SOBRKGM12A8C139EF6|Welcome to the Pleasuredome                  |821.05424|1985|ARXQBR11187B98A2CC|
+------------------+---------------------------------------------+---------+----+------------------+
only showing top 5 rows



## 2. Checking Artists Table

In [10]:
# Path of the artists table inside the output directory.
# Note: this rebinds `output_data`, which previously held the songs path.
output_data = os.path.join(output, 'artists')

In [11]:
# Echo the joined path as a quick sanity check.
output_data

'output/artists'

In [12]:
# Load the artists table into a DataFrame.
# The path is built here rather than taken from the shared `output_data`
# variable, which is also used for the songs path earlier in the notebook;
# this keeps the cell correct regardless of which path cell ran last.
artists = spark.read.parquet(os.path.join(output, 'artists'))

In [13]:
# Confirm that the parquet read produced a Spark DataFrame.
type(artists)

pyspark.sql.dataframe.DataFrame

In [14]:
# List the column names of the artists table.
artists.columns

['artist_id',
 'artist_name',
 'artist_location',
 'artist_latitude',
 'artist_longitude']

In [15]:
# Preview the first five artists without truncating long cell values.
artists.show(5, truncate = False)

+------------------+-------------------------+---------------------------------+---------------+----------------+
|artist_id         |artist_name              |artist_location                  |artist_latitude|artist_longitude|
+------------------+-------------------------+---------------------------------+---------------+----------------+
|ARTC1LV1187B9A4858|The Bonzo Dog Band       |Goldsmith's College, Lewisham, Lo|51.4536        |-0.01802        |
|ARA23XO1187B9AF18F|The Smithereens          |Carteret, New Jersey             |40.57885       |-74.21956       |
|ARSVTNL1187B992A91|Jonathan King            |London, England                  |51.50632       |-0.12714        |
|AR73AIO1187B9AD57B|Western Addiction        |San Francisco, CA                |37.77916       |-122.42005      |
|ARXQBR11187B98A2CC|Frankie Goes To Hollywood|Liverpool, England               |null           |null            |
+------------------+-------------------------+---------------------------------+---------------+----------------+

## 3. Checking Users Table 

In [16]:
# Path of the users table inside the output directory.
# Note: `users` holds a path string here; the next cell rebinds the same name
# to the loaded DataFrame.
users = os.path.join(output, 'users' )

In [17]:
# Load the users table into a DataFrame.
# The path is built inline instead of being read from `users`, because this
# cell rebinds `users` to the DataFrame; building it here means a re-run never
# feeds the DataFrame back in as the path.
users = spark.read.parquet(os.path.join(output, 'users'))

In [18]:
# List the column names of the users table.
# NOTE(review): the listing shows `frist_name` — this looks like a typo for
# `first_name` in whatever job wrote the table; confirm before relying on it.
users.columns

['user_id', 'frist_name', 'last_name', 'gender', 'level']

In [19]:
# Preview the first five user rows.
# NOTE(review): the sample shows user_id 26 three times with identical values,
# so the table does not appear to be de-duplicated — confirm whether intended.
users.show(5)

+-------+----------+---------+------+-----+
|user_id|frist_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     26|      Ryan|    Smith|     M| free|
|     26|      Ryan|    Smith|     M| free|
|     26|      Ryan|    Smith|     M| free|
|     61|    Samuel| Gonzalez|     M| free|
|     80|     Tegan|   Levine|     F| paid|
+-------+----------+---------+------+-----+
only showing top 5 rows



## 4. Checking Time Table 

In [20]:
# Path of the time table inside the output directory (the table directory is
# itself named 'time_df').
# Note: `time_df` holds a path string here; the next cell rebinds the same
# name to the loaded DataFrame.
time_df = os.path.join(output, 'time_df' )

In [21]:
# Load the time table into a DataFrame.
# The path is built inline instead of being read from `time_df`, because this
# cell rebinds `time_df` to the DataFrame; building it here means a re-run
# never feeds the DataFrame back in as the path.
time_df = spark.read.parquet(os.path.join(output, 'time_df'))

In [22]:
# List the column names of the time table.
time_df.columns

['timestamp', 'hour', 'day', 'week', 'day_of_week', 'year', 'month']

In [23]:
# Preview the first five time rows without truncating the timestamp column.
time_df.show(5, truncate=False)

+-----------------------+----+---+----+-----------+----+-----+
|timestamp              |hour|day|week|day_of_week|year|month|
+-----------------------+----+---+----+-----------+----+-----+
|2018-11-15 00:30:26.796|0   |15 |46  |Thursday   |2018|11   |
|2018-11-15 00:41:21.796|0   |15 |46  |Thursday   |2018|11   |
|2018-11-15 00:45:41.796|0   |15 |46  |Thursday   |2018|11   |
|2018-11-15 03:44:09.796|3   |15 |46  |Thursday   |2018|11   |
|2018-11-15 05:48:55.796|5   |15 |46  |Thursday   |2018|11   |
+-----------------------+----+---+----+-----------+----+-----+
only showing top 5 rows



Reference for renaming DataFrame columns in PySpark: https://stackoverflow.com/questions/34077353/how-to-change-dataframe-column-names-in-pyspark

In [24]:
# Show the distinct values present in the day_of_week column.
time_df.select("day_of_week").distinct().show()

+-----------+
|day_of_week|
+-----------+
|  Wednesday|
|    Tuesday|
|     Friday|
|   Thursday|
|   Saturday|
|     Monday|
|     Sunday|
+-----------+



## 5. Checking Songplays Table

In [25]:
# Path of the songplays table inside the output directory.
# Note: `songplays` holds a path string here; a later cell rebinds the same
# name to the loaded DataFrame.
songplays = os.path.join(output, 'songplays' )

In [26]:
# Echo the joined path as a quick sanity check (still a string at this point).
songplays

'output/songplays'

In [27]:
# Load the songplays table into a DataFrame.
# The path is built inline instead of being read from `songplays`, because
# this cell rebinds `songplays` to the DataFrame; building it here means a
# re-run never feeds the DataFrame back in as the path.
songplays = spark.read.parquet(os.path.join(output, 'songplays'))

In [28]:
# List the column names of the songplays table.
songplays.columns

['user_Id',
 'location',
 'user_agent',
 'session_id',
 'artist_id',
 'song_id',
 'level',
 'timestamp',
 'year',
 'month']

In [29]:
# Preview the first five songplay rows (long values are truncated by default).
# NOTE(review): artist_id and song_id are null in all five sample rows —
# confirm whether the stored table keeps log events that matched no song.
songplays.show(5)

+-------+--------------------+--------------------+----------+---------+-------+-----+--------------------+----+-----+
|user_Id|            location|          user_agent|session_id|artist_id|song_id|level|           timestamp|year|month|
+-------+--------------------+--------------------+----------+---------+-------+-----+--------------------+----+-----+
|     26|San Jose-Sunnyval...|"Mozilla/5.0 (X11...|       583|     null|   null| free|2018-11-15 00:30:...|2018|   11|
|     26|San Jose-Sunnyval...|"Mozilla/5.0 (X11...|       583|     null|   null| free|2018-11-15 00:41:...|2018|   11|
|     26|San Jose-Sunnyval...|"Mozilla/5.0 (X11...|       583|     null|   null| free|2018-11-15 00:45:...|2018|   11|
|     61|Houston-The Woodl...|"Mozilla/5.0 (Mac...|       597|     null|   null| free|2018-11-15 03:44:...|2018|   11|
|     80|Portland-South Po...|"Mozilla/5.0 (Mac...|       602|     null|   null| paid|2018-11-15 05:48:...|2018|   11|
+-------+--------------------+--------------------+----------+---------+-------+-----+--------------------+----+-----+

## 6. Test: Rebuilding Songplays from the Raw JSON Data

In [30]:
# Glob pattern covering every JSON log file directly under data_2/log_data.
log_data = "data_2/log_data/*.json"

In [31]:
# Read all matching log files into one DataFrame; no schema is passed, so
# Spark infers it from the JSON.
log_df =  spark.read.json(log_data)

In [32]:
# Print the inferred schema of the raw log data.
log_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [33]:
# Read every JSON song-metadata file directly under data_2/song_data into one
# DataFrame; no schema is passed, so Spark infers it from the JSON.
song_data = "data_2/song_data/*.json"
song_df = spark.read.json(path=song_data)

In [34]:
# Print the inferred schema of the raw song data.
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [35]:
# Experiment: keep every song column (through the alias 'a') and append the
# `timestamp` and `hour` columns of the time table, then inspect the schema.
# NOTE(review): join() receives no `on` condition, so this is an unconditioned
# (Cartesian) join; it is only harmless here because nothing but the schema is
# inspected. Depending on Spark version/config an action on `c` may be
# rejected or produce rows(song_df) x rows(time_df) — confirm before reuse.
c = (
    song_df.alias('a')
    .join(time_df)
    .select('a.*', time_df.timestamp, time_df.hour)
)
c.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)



In [36]:
# A log event is matched to a song record when track length, artist name and
# song title all agree. join() is called without `how`, i.e. Spark's default
# join type (inner per the PySpark docs), so unmatched events are not kept.
# NOTE(review): `length == duration` is an exact comparison of two doubles.
song_match = (
    (log_df.length == song_df.duration)
    & (log_df.artist == song_df.artist_name)
    & (log_df.song == song_df.title)
)

# Project the songplay attributes, renaming the camelCase log fields
# (`user_Id` keeps the capitalisation used by the stored songplays table).
# NOTE(review): the log's `ts` column is not selected, so the result carries
# no event time of its own.
songplays_1 = log_df.join(song_df, song_match).select(
    col('userId').alias('user_Id'),
    log_df.location,
    col('userAgent').alias('user_agent'),
    col('sessionId').alias('session_id'),
    song_df.artist_id,
    song_df.song_id,
    log_df.level,
)

In [37]:
# Append the time table's `timestamp` and `hour` columns to the matched
# songplays and inspect the resulting schema. This rebinds `songplays`, which
# earlier in the notebook referred to the table loaded from parquet.
# NOTE(review): join() receives no `on` condition, so each songplay row is
# combined with every time_df row rather than with the row for its own event
# time (songplays_1 has no time column to join on). Fine for a schema check,
# but confirm/fix before using the rows.
songplays = (
    songplays_1.alias('a')
    .join(time_df)
    .select('a.*', time_df.timestamp, time_df.hour)
)
songplays.printSchema()

root
 |-- user_Id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)



In [38]:
# Print the schema of the rebuilt songplays DataFrame (the same call that
# closes the previous cell, so the output is identical).
songplays.printSchema()

root
 |-- user_Id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)

