In [15]:
# Importing all the necessary libraries
import datetime
import os

# NOTE(review): `plt` conventionally refers to matplotlib.pyplot; this line binds the
# top-level matplotlib package instead - confirm that is intended before plotting with it.
import matplotlib as plt
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, desc, asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import StringType, IntegerType

In [16]:
# Obtain the Spark session for this notebook: reuse an active one if present, otherwise start it
spark = (
    SparkSession.builder
    .appName('Data Wrangling with Spark SQL')
    .getOrCreate()
)

In [17]:
# Directory holding the input data.
# Defaults to the original local path; set the SPARKIFY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell (an empty value falls back to the default).
data_dir = os.environ.get('SPARKIFY_DATA_DIR') or 'C:/Users/John/PycharmProjects/sparkdemo/data/'
# Enforce exactly one trailing '/' because file paths are built from data_dir by plain string
# concatenation when the JSON log is read.
data_dir = data_dir.rstrip('/\\') + '/'

In [18]:
# Load the JSON event log into a Spark DataFrame
log_path = data_dir + 'sparkify_log_small.json'
df = spark.read.json(log_path)

In [19]:
# Print the column names, types and nullability that Spark inferred from the JSON file
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [20]:
# Expose the DataFrame to Spark SQL as the temporary view 'df_table'
# (an existing temp view with the same name is replaced, so re-running this cell is safe)
df.createOrReplaceTempView('df_table')

In [23]:
# Peek at two log records through the SQL interface (statement written on a single line)
first_two_rows = spark.sql('SELECT * FROM df_table LIMIT 2')
first_two_rows.show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [24]:
# Two-row preview with the SQL statement laid out over several lines in a triple-quoted string
two_row_query = '''
SELECT * 
FROM df_table 
LIMIT 2
'''
spark.sql(two_row_query).show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [25]:
# List every distinct page type that occurs in the log, in alphabetical order
distinct_pages_query = '''
    SELECT DISTINCT page
    FROM df_table
    ORDER BY page ASC 
'''
spark.sql(distinct_pages_query).show()

+----------------+
|            page|
+----------------+
|           About|
|       Downgrade|
|           Error|
|            Help|
|            Home|
|           Login|
|          Logout|
|        NextSong|
|   Save Settings|
|        Settings|
|Submit Downgrade|
|  Submit Upgrade|
|         Upgrade|
+----------------+



In [26]:
# Register the SQL-callable UDF `get_hour`: epoch timestamp in milliseconds -> hour of day (integer).
# - `fromtimestamp` converts using the local timezone of the Python process running the UDF,
#   so the resulting hours depend on where the notebook is executed.
# - `ts` is nullable in the schema, so a NULL timestamp yields a NULL hour instead of
#   raising TypeError on `None / 1000.0`.
spark.udf.register(
    "get_hour",
    lambda x: None if x is None else datetime.datetime.fromtimestamp(x / 1000.0).hour,
    IntegerType(),
)

<function __main__.<lambda>(x)>

In [27]:
# Sanity-check the get_hour UDF by appending the derived hour to a single log record
hour_preview_query = '''
    SELECT *, get_hour(ts) AS hour
    FROM df_table
    LIMIT 1    
'''
spark.sql(hour_preview_query).show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+----+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|hour|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+----+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|   3|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+----------

In [28]:
# Count 'NextSong' events per hour of day; show() prints only the first 20 rows by default
hourly_count_query = '''
    SELECT get_hour(ts) as hour, COUNT(userId)
    FROM df_table
    WHERE page = 'NextSong'
    GROUP BY hour
    ORDER BY hour ASC
'''
spark.sql(hourly_count_query).show()

+----+-------------+
|hour|count(userId)|
+----+-------------+
|   0|          337|
|   1|          255|
|   2|          271|
|   3|          307|
|   4|          377|
|   5|          371|
|   6|          505|
|   7|          410|
|   8|          320|
|   9|          341|
|  10|          319|
|  11|          306|
|  12|          344|
|  13|          395|
|  14|          324|
|  15|          185|
|  16|          236|
|  17|          238|
|  18|          285|
|  19|          418|
+----+-------------+
only showing top 20 rows



In [29]:
# Songs played per hour of day, kept as a Spark DataFrame for later use.
# get_hour is registered with an integer return type, so the cast in ORDER BY only acts as a
# safeguard that keeps the ordering numeric.
plays_per_hour_query = '''
    SELECT get_hour(ts) as hour, COUNT(*) as plays_per_hour
    FROM df_table
    WHERE page = 'NextSong'
    GROUP BY hour
    ORDER BY cast(hour as int) ASC    
'''
songs_in_hour = spark.sql(plays_per_hour_query)

In [30]:
# Display the hourly play counts; show() prints at most 20 rows by default,
# so the last hours of the day are cut off in the output below
songs_in_hour.show()

+----+--------------+
|hour|plays_per_hour|
+----+--------------+
|   0|           337|
|   1|           255|
|   2|           271|
|   3|           307|
|   4|           377|
|   5|           371|
|   6|           505|
|   7|           410|
|   8|           320|
|   9|           341|
|  10|           319|
|  11|           306|
|  12|           344|
|  13|           395|
|  14|           324|
|  15|           185|
|  16|           236|
|  17|           238|
|  18|           285|
|  19|           418|
+----+--------------+
only showing top 20 rows



In [31]:
# Same hourly aggregation, this time referring to the view through the table alias T
aliased_plays_query = '''
    SELECT get_hour(T.ts) as hour, COUNT(*) as plays_per_hour
    FROM df_table as T
    WHERE T.page = 'NextSong'
    GROUP BY hour
    ORDER BY cast(hour as int) ASC
'''
songs_in_hour = spark.sql(aliased_plays_query)

In [32]:
# Collect the aggregated Spark result into a pandas DataFrame
# (unlike show(), this brings back every row of the result)
songs_in_hour_pd = songs_in_hour.toPandas()
# Plain-text dump of the whole frame, one row per hour of day
print(songs_in_hour_pd)

    hour  plays_per_hour
0      0             337
1      1             255
2      2             271
3      3             307
4      4             377
5      5             371
6      6             505
7      7             410
8      8             320
9      9             341
10    10             319
11    11             306
12    12             344
13    13             395
14    14             324
15    15             185
16    16             236
17    17             238
18    18             285
19    19             418
20    20             455
21    21             500
22    22             461
23    23             387
