# Data Frame Coding Quiz

Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, desc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

# TODOS: 
# 1) import any other libraries you might need
# 2) instantiate a Spark session 
# 3) read in the data set located at the path "data/sparkify_log_small.json"
# 4) write code to answer the quiz questions

# Start a Spark session for this notebook (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("Data Frame Quiz")
    .getOrCreate()
)

# Read the JSON event log into a DataFrame.
df = spark.read.json("data/sparkify_log_small.json")

In [2]:
# Display the session object created above as this cell's output.
spark

In [3]:
# Print the column names, types and nullability of the loaded log.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [4]:
# Peek at the first three rows of the log.
df.show(n=3)

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       Showaddywaddy|Logged In|  Kenneth|     M|          112| Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|          Lily Allen|Logged In|Elizabeth|     F|            7|    Chase|195.23873| free|Shreveport-Bossie...|   PUT

In [5]:
# Count how many log rows share each timestamp, then list the five most frequent.
ts_counts = df.select('ts').groupBy('ts').agg({'ts': 'count'})
ts_counts.orderBy(desc('count(ts)')).show(5)

+-------------+---------+
|           ts|count(ts)|
+-------------+---------+
|1513821375284|        5|
|1513789235284|        4|
|1513774089284|        3|
|1513777148284|        3|
|1513787586284|        3|
+-------------+---------+
only showing top 5 rows



# Question 1

Which page did user id "" (empty string) NOT visit?

In [6]:
# TODO: write your code to answer question 1
# Pages that the empty-string user id DID visit.
blank_pages = (
    df.filter(df.userId == "")
    .select(col("page").alias("blank_pages"))
    .dropDuplicates()
)
blank_pages.show()

# The question asks for the pages that user did NOT visit: take every distinct
# page in the log and remove the ones listed above.
all_pages = df.select("page").dropDuplicates()
all_pages.subtract(blank_pages).show()

+-----------+
|blank_pages|
+-----------+
|       Home|
|      About|
|      Login|
|       Help|
+-----------+



# Question 2 - Reflect

What type of user does the empty string user id most likely refer to?


In [7]:
# TODO: use this space to explore the behavior of the user with an empty string

Since the empty-string user id only appears on the Home, About, Login and Help pages, it most likely represents visitors who have not registered yet or users who have not logged in.

# Question 3

How many female users do we have in the data set?

In [8]:
# TODO: write your code to answer question 3
# Count distinct users whose rows are tagged female. The schema spells the
# column `userId`; using the exact name avoids depending on Spark's
# case-insensitive column resolution (spark.sql.caseSensitive=false).
df.filter(df.gender == 'F').select('userId').dropDuplicates().count()

462

# Question 4

How many songs were played from the most played artist?

In [9]:
# TODO: write your code to answer question 4

# Sanity check: list the distinct pages of rows that carry a non-empty artist.
# Only NextSong is expected to appear.
rows_with_artist = df.where(col('artist') != "")
rows_with_artist.select('page').dropDuplicates().show()

+--------+
|    page|
+--------+
|NextSong|
+--------+



In [10]:
# Preview the number of NextSong rows per artist (unsorted, first five groups).
song_artists = df.where(col('page') == "NextSong").select("artist")
song_artists.groupBy('artist').agg({'artist': 'count'}).show(5)

+--------------------+-------------+
|              artist|count(artist)|
+--------------------+-------------+
|      The Black Keys|           40|
|        STRATOVARIUS|            1|
|      The Chameleons|            1|
|Dashboard Confess...|            3|
|      Jarabe De Palo|            3|
+--------------------+-------------+
only showing top 5 rows



In [11]:
# Number of NextSong rows per artist, with the count column given a readable name.
artist_counts = (
    df.where(col('page') == "NextSong")
    .select("artist")
    .groupBy('artist')
    .agg({'artist': 'count'})
    .withColumnRenamed('count(artist)', 'ArtistCount')
)

# Show the three most played artists.
artist_counts.orderBy(desc('ArtistCount')).show(3)

+--------------------+-----------+
|              artist|ArtistCount|
+--------------------+-----------+
|            Coldplay|         83|
|       Kings Of Leon|         69|
|Florence + The Ma...|         52|
+--------------------+-----------+
only showing top 3 rows



# Question 5 (challenge)

How many songs do users listen to on average between visits to our home page? Please round your answer to the closest integer.



In [12]:
# TODO: write your code to answer question 5

def _is_home_page(page_name):
    """Return 1 when ``page_name`` equals "Home", otherwise 0."""
    return 1 if page_name == "Home" else 0


# Spark UDF that flags Home-page rows with an integer 1/0.
count_homepage = udf(_is_home_page, IntegerType())

In [13]:
# Window over each user's events, ordered from newest to oldest timestamp.
# The frame runs from the first row of the partition (unboundedPreceding) up to
# the current row, so an aggregate over it behaves as a running total.
newest_first = Window.partitionBy('userId').orderBy(desc('ts'))
user_window = newest_first.rangeBetween(Window.unboundedPreceding, Window.currentRow)

In [14]:
# Keep only song plays and Home visits, then add two derived columns
# (withColumn returns a new DataFrame each time):
#   homevisit - 1 on a Home row, 0 on a NextSong row
#   period    - running total of homevisit over user_window, i.e. how many Home
#               visits have been seen so far while walking a user's events from
#               newest to oldest; songs sharing a (userId, period) pair were
#               played between the same pair of Home visits

events = df.where(df.page.isin('NextSong', 'Home')).select('userId', 'page', 'ts')
flagged = events.withColumn('homevisit', count_homepage(col('page')))
cumsum = flagged.withColumn('period', Fsum('homevisit').over(user_window))

cumsum.show(20)

+------+--------+-------------+---------+------+
|userId|    page|           ts|homevisit|period|
+------+--------+-------------+---------+------+
|  1436|NextSong|1513783259284|        0|     0|
|  1436|NextSong|1513782858284|        0|     0|
|  2088|    Home|1513805972284|        1|     1|
|  2088|NextSong|1513805859284|        0|     1|
|  2088|NextSong|1513805494284|        0|     1|
|  2088|NextSong|1513805065284|        0|     1|
|  2088|NextSong|1513804786284|        0|     1|
|  2088|NextSong|1513804555284|        0|     1|
|  2088|NextSong|1513804196284|        0|     1|
|  2088|NextSong|1513803967284|        0|     1|
|  2088|NextSong|1513803820284|        0|     1|
|  2088|NextSong|1513803651284|        0|     1|
|  2088|NextSong|1513803413284|        0|     1|
|  2088|NextSong|1513803254284|        0|     1|
|  2088|NextSong|1513803057284|        0|     1|
|  2088|NextSong|1513802824284|        0|     1|
|  2162|NextSong|1513781246284|        0|     0|
|  2162|NextSong|151

In [15]:
# Inspect one user's timeline. userId is a string column (see the schema), so
# compare against a string literal: comparing with the integer 2294 makes Spark
# implicitly cast every userId, including the empty-string id, which can fail
# when ANSI mode is enabled.
cumsum.filter(cumsum.userId == "2294").show(40)

+------+--------+-------------+---------+------+
|userId|    page|           ts|homevisit|period|
+------+--------+-------------+---------+------+
|  2294|NextSong|1513787467284|        0|     0|
|  2294|NextSong|1513786920284|        0|     0|
|  2294|NextSong|1513786737284|        0|     0|
|  2294|NextSong|1513786461284|        0|     0|
|  2294|    Home|1513786339284|        1|     1|
|  2294|NextSong|1513786261284|        0|     1|
|  2294|NextSong|1513786041284|        0|     1|
|  2294|NextSong|1513785806284|        0|     1|
|  2294|NextSong|1513785549284|        0|     1|
|  2294|NextSong|1513785330284|        0|     1|
|  2294|NextSong|1513785086284|        0|     1|
|  2294|NextSong|1513784808284|        0|     1|
|  2294|NextSong|1513784296284|        0|     1|
|  2294|NextSong|1513783984284|        0|     1|
|  2294|NextSong|1513783804284|        0|     1|
|  2294|NextSong|1513783585284|        0|     1|
|  2294|NextSong|1513783319284|        0|     1|
|  2294|NextSong|151

In [16]:
# Count the NextSong rows in every (userId, period) group, then average those
# counts over all groups.
songs_per_period = (
    cumsum.where(cumsum.page == 'NextSong')
    .groupBy('userId', 'period')
    .agg({'period': 'count'})
)
songs_per_period.agg({'count(period)': 'avg'}).show()

+------------------+
|avg(count(period))|
+------------------+
| 6.898347107438017|
+------------------+

