In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg
from pyspark.sql.functions import sum as Fsum
from pyspark.sql import Window

import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

Create Spark Session

In [None]:
# Only one SparkContext / SparkSession can be active at a time.
# getOrCreate() hands back the session that already exists (applying the options
# set on this builder to it) or, if there is none, builds a new one.
# NOTE: the .config(...) key/value below is a placeholder showing the call shape.
sparkSesh = (
    SparkSession.builder
    .appName("app Name")
    .config('config option', 'config value')
    .master("local[*]")  # run locally, using all available cores
    .getOrCreate()
)

Look at parameters of Spark context

In [None]:
# List every configuration setting of the active SparkContext as key/value pairs
# (app name, master, and any .config() options passed to the builder).
sparkSesh.sparkContext.getConf().getAll()

Importing a basic file

In [None]:
# Location of the raw event log, relative to the notebook's working directory.
read_path = "./sparkify_log_small.json"
# Load the JSON log into a Spark DataFrame; the schema is inferred from the records.
log_data = sparkSesh.read.json(read_path)

Take a look at the data

In [None]:
#log_data.head()
#log_data.take(2)

#log_data.printSchema() #Closest to R's 'str'. Note: 'schema' is a property (log_data.schema), not a method.
log_data.describe() #Builds a DataFrame of summary statistics (like R's 'summary'); evaluated bare, the notebook only shows its column signature - append .show() to see the statistics.

#log_data.count()

Drilling down into particular columns

In [None]:
# Raw "artist" values, one row per event (show() prints the first 20 rows).
log_data.select(col("artist")).show()

# Distinct artists in alphabetical order.
(
    log_data
    .select(col("artist"))
    .dropDuplicates()
    .orderBy("artist")
    .show()
)

More advanced drilldown

In [None]:
# Alternative drilldown (kept for reference): number of events per artist, most frequent first.
# NOTE(review): the comparison against the string "null" does not test for missing values;
# use log_data.artist.isNotNull() if the intent is to drop events without an artist.
# log_data \
#     .where(log_data.artist != "null") \
#     .groupBy('artist') \
#     .count() \
#     .orderBy('count', ascending=False) \
#     .show()


# Every event recorded for user "1046", reduced to the user, the page visited and the song.
(
    log_data
    .where(col("userId") == "1046")
    .select("userId", "page", "song")
    .collect()
)

Using custom function to create a new column in data frame

In [None]:
# UDF: convert an epoch timestamp in milliseconds into the hour of day (0-23).
# The return type is declared explicitly. Without it udf() defaults to StringType,
# which turns "hour" into a string column that sorts lexicographically ("10" < "2")
# and has to be cast before any numeric use.
# A null timestamp yields a null hour instead of failing the whole job.
# NOTE: fromtimestamp() converts using the local timezone of the machine running the UDF.
get_hour = udf(
    lambda ts_ms: None if ts_ms is None
    else datetime.datetime.fromtimestamp(float(ts_ms) / 1000.0).hour,
    IntegerType(),
)

#Note: this will not evaluate until data is called (lazy evaluation)
log_data = log_data.withColumn("hour", get_hour(log_data.ts))

Filtering session events down to those where users chose 'NextSong', and counting them by the hour of day in which they happened.

In [None]:
# Number of song plays ("NextSong" events) per hour of day.
# The cast in orderBy makes the sort numeric whatever type "hour" is stored as.
song_hour_view = (
    log_data
    .where(col("page") == "NextSong")
    .groupBy("hour")
    .count()
    .orderBy(col("hour").cast("float"))
)

song_hour_view.show()

Converting to Pandas data frame
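>(Question: why convert to pandas instead of staying in Spark?) Mainly so that the small, already-aggregated result can be plotted with matplotlib, which works on local pandas/NumPy data rather than on Spark DataFrames.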

In [None]:
# toPandas() collects the whole Spark DataFrame onto the driver as a pandas DataFrame.
# That is fine here because the result was grouped by hour, so it holds only a handful of rows.
pd_song_hour_view = song_hour_view.toPandas()

In [None]:
# Scatter plot of song plays per hour of day.
plt.scatter(x=pd_song_hour_view['hour'], y=pd_song_hour_view['count'])
plt.xlim(left=-1, right=24)  # one unit of padding around hours 0-23
plt.ylim(bottom=0, top=1.2 * pd_song_hour_view['count'].max())  # 20% headroom above the busiest hour
plt.xlabel("Hour")
plt.ylabel("# Songs played")

Data cleaning

In [None]:
# Keep only events that can be attributed to a user and a session:
#   1. drop rows where 'userId' or 'sessionId' is missing (null)
#   2. drop rows whose 'userId' is an empty string
clean_log_data = (
    log_data
    .dropna(how="any", subset=["userId", "sessionId"])
    .where(col("userId") != "")
)

# Number of rows remaining after cleaning
clean_log_data.count()

### Additional Analysis: users who have upgraded their service
We'll take a look at the events before and after users decided to upgrade their subscriptions to try and find out why

In [None]:
#Simple UDF for creating a column that flags upgrade events:
#returns 1 when the page is "Submit Upgrade" and 0 for every other page.
flag_upgrade_event = udf(lambda x: 1 if x == "Submit Upgrade" else 0, IntegerType())

#Add the 0/1 flag as a new integer column named "upgrade", then peek at the first row
clean_log_data = clean_log_data.withColumn("upgrade", flag_upgrade_event("page"))
clean_log_data.head()

Defining a window function for partitioning data based on userId and ordering by time.

In [None]:
# Per-user window ordered by timestamp, NEWEST event first (descending 'ts').
# Because of the descending order, the rows "preceding" the current row in the window
# are the events that happened LATER in time.
windowVal = Window \
    .partitionBy("userId") \
    .orderBy(desc('ts')) \
    .rangeBetween(Window.unboundedPreceding,0) #Frame = start of the partition up to the current 'ts' value, i.e. this event plus all of the user's later events (none that are earlier in time).

# "phase" is the running sum of the upgrade flag over that frame: the number of upgrade
# events the user performs at or after the current event. It drops by one once an upgrade is behind them.
clean_log_data = clean_log_data.withColumn("phase", Fsum("upgrade").over(windowVal))

In [None]:
# Peek at the first row to confirm the new "phase" column is present.
clean_log_data.head()

Now, let's find a customer who upgraded to try this on

In [None]:
# User ids that have a "Submit Upgrade" event.
# The filter runs before the projection so it does not depend on a column the select() has dropped.
(
    clean_log_data
    .where(col("page") == "Submit Upgrade")
    .select("userId")
    .show()
)

In [None]:
# Chronological event timeline for user "1232", including the subscription level
# and the computed "phase" column.
(
    clean_log_data
    .where(col("userId") == "1232")
    .select("userId", "firstname", "ts", "page", "level", "phase")
    .orderBy("ts")
    .collect()
)

Write data out

In [None]:
# Spark does not write a single CSV file: the target path becomes a FOLDER holding
# some metadata files plus one part-file per partition of the DataFrame. The partitioning
# is handled automatically, while the API stays as simple as "write a CSV".

write_path = "./sparkify_log_small.csv"
# mode="overwrite" replaces whatever already exists at write_path
clean_log_data.write.csv(write_path, mode="overwrite")
# Alternative spelling that also writes a header row:
#clean_log_data.write.save(write_path, format = "csv", header = True,)

### Other Spark Commands & Notes:
- `where()` (an alias for `filter()`) filters rows given a certain condition
- Spark SQL offers aggregation commands like `count()`, `min()`, `max()`, `avg()`, and `countDistinct()`
  - You can also use the `agg()` command and specify multiple types of aggregations like this: `agg({"salary":"avg", "age":"max"})`
- Window functions are ways of combining the values of *ranges* of rows in a dataframe. When defining the window, we can choose how to sort and group (within the `partitionBy` method) the rows and how wide of a window we'd like to use (described by `rangeBetween` or `rowsBetween`)
- [PySpark User Guide](https://spark.apache.org/docs/latest/api/python/user_guide/index.html)

## Challenges

Which page did user id "" (empty string) NOT visit?

In [None]:
#1. Every distinct page that appears anywhere in the log
pages = list(
    log_data
    .select("page")
    .dropDuplicates()
    .toPandas()["page"]
)

#2. Distinct pages visited by the empty-string userId
emptyId_pages = list(
    log_data
    .where(col("userId") == "")
    .select("page")
    .dropDuplicates()
    .toPandas()["page"]
)

#3. Pages that exist in the log but were never visited by the empty-string userId
[page for page in pages if page not in emptyId_pages]

What type of user does the empty string user id most likely refer to?

In [None]:
# Sample of events belonging to the empty-string userId: name, subscription level and page.
(
    log_data
    .where(col("userId") == "")
    .select("firstname", "level", "page")
    .show()
)

#Looks like users who have not yet logged in to the platform.

How many female users do we have in the data set?

In [None]:
# Number of distinct users per gender: reduce to unique (gender, userId) pairs first
# so each user is counted once, then count the pairs within each gender.
(
    log_data
    .select("gender", "userId")
    .dropDuplicates()
    .groupBy("gender")
    .count()
    .show()
)

How many songs were played from the most played artist?

In [None]:
# Number of song plays per artist, most played first (the top row answers the question).
# Only "NextSong" events are song plays. Without this filter every event is counted,
# and events that carry no artist (presumably all non-song pages - verify against the data)
# are grouped under a null artist that can outrank every real artist.
log_data \
    .filter(log_data.page == "NextSong") \
    .select("artist") \
    .groupBy(log_data['artist']) \
    .count() \
    .orderBy(desc("count")) \
    .show()

How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.
>Note: Code below is primarily from Udacity instructors with some small changes by me

In [7]:
#Step #1: Create a UDF that marks the window boundaries: 1 when the page is 'Home', 0 otherwise.
ishome = udf(lambda page: 1 if page == 'Home' else 0, IntegerType())

In [45]:
#Step #2: Define the window.
# - partition by user, so events are only ever compared within one user
# - order by timestamp ascending, so events are seen chronologically
# - frame = first row of the partition up to and including the current row
user_window = (
    Window
    .partitionBy('userID')
    .orderBy(asc('ts'))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

#Note: rowsBetween (used here) frames by physical row position; the value-based alternative is rangeBetween:
# https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.Window.rangeBetween.html
#The window is a 'traveling' window that advances through the data set in the order given by the 'orderBy' clause.

In [46]:
#Step 3: Build a subset of the data frame with ONLY the events we care about (home visits OR song plays).
#With just two page types left there is no need to mark song plays separately - whatever is not 'Home' is a song play.
# - 'homevisit' is 0/1: whether the row is a home visit.
# - 'period' is the running total of 'homevisit' per user in chronological order (current row included),
#   so it goes up by 1 on every 'Home' row and stays constant until the next one. For example:
#   song(0), song(0), HOME(1), song(1), song(1), HOME(2), song(2), HOME(3), ...
#Each user's events are thereby split into mini intervals between home visits, and 'period' is the
#interval id we can group by and aggregate within.

cusum = (
    log_data
    .where(col('page').isin('NextSong', 'Home'))
    .select('userID', 'page', 'ts')
    .withColumn('homevisit', ishome(col('page')))
    .withColumn('period', Fsum('homevisit').over(user_window))
)

In [47]:
#Step 4: 'period' now identifies, per user, the interval between consecutive home visits, so we can
#group by it and aggregate within each interval:
#1. Keep only 'NextSong' rows so that the 'Home' events themselves are not counted
#2. Group by userID AND period (a period id is only unique within one user)
#3. Count the rows in each group = number of songs played in that interval
#4. Average those per-interval counts over all users and intervals
(
    cusum
    .where(col('page') == 'NextSong')
    .groupBy('userID', 'period')
    .agg({'period': 'count'})
    .agg({'count(period)': 'avg'})
    .show()
)

+------------------+
|avg(count(period))|
+------------------+
| 5.956678700361011|
+------------------+



In [31]:
# Sanity check on a single user: 'period' should increase by 1 on every 'Home' row
# and stay constant for the song plays that follow it.
cusum.filter(cusum.userID == '1232').show()

+------+--------+-------------+---------+------+
|userID|    page|           ts|homevisit|period|
+------+--------+-------------+---------+------+
|  1232|    Home|1513727268284|        1|     1|
|  1232|NextSong|1513727589284|        0|     1|
|  1232|NextSong|1513727885284|        0|     1|
|  1232|NextSong|1513728173284|        0|     1|
|  1232|NextSong|1513728346284|        0|     1|
|  1232|NextSong|1513728548284|        0|     1|
|  1232|NextSong|1513738827284|        0|     1|
|  1232|NextSong|1513739064284|        0|     1|
|  1232|    Home|1513739156284|        1|     2|
|  1232|NextSong|1513739286284|        0|     2|
|  1232|NextSong|1513739552284|        0|     2|
|  1232|NextSong|1513739756284|        0|     2|
|  1232|NextSong|1513740075284|        0|     2|
|  1232|NextSong|1513740198284|        0|     2|
|  1232|NextSong|1513740448284|        0|     2|
|  1232|NextSong|1513740707284|        0|     2|
|  1232|NextSong|1513740934284|        0|     2|
|  1232|NextSong|151