In [None]:
import hashlib
import json

import pyspark.sql.functions as f
from pyspark.sql import Window

## Project Introduction

In this final project you will have the opportunity to apply everything you've learned throughout the course in a setting similar to what you face in your daily work as a Data Analyst at Mercedes.

The goal of this project is to understand how you can manipulate and analyze Google Analytics data about user interactions on a website.

For that, you'll work with the [Google Analytics Sample dataset](https://console.cloud.google.com/marketplace/product/obfuscated-ga360-data/obfuscated-ga360-data?inv=1&invt=AbmlmQ), which contains real data from the [Google Merchandise Store](https://shop.googlemerchandisestore.com/), a real ecommerce store that sells Google-branded merchandise.

The data is typical of what an ecommerce website would see and includes the following information:
- **Traffic source data**: information about where website visitors originate, including data about organic traffic, paid search traffic, and display traffic
- **Content data**: information about the behavior of users on the site, such as URLs of pages that visitors look at, how they interact with content, etc.
- **Transactional data**: information about the transactions on the Google Merchandise Store website.

## Download the data

The data is available in a zip file. This zip contains three parquet files:
- `ga_sessions_main.parquet`: the main information about each session
- `ga_sessions_hits.parquet`: detailed information about hits in each session
- `ga_sessions_network.parquet`: information about traffic sources, device and geographic information

**NOTE:** To make things a bit easier, only data from the first 15 days of August 2016 was included in the dataset. Also, some noisy information about `hits` was removed from the original data.

Let's download the data and save it to the Databricks File System (DBFS).

In [None]:
%sh wget https://raw.githubusercontent.com/inesmcm26/lp-big-data-mercedes/main/data/ga_sessions.zip

In [None]:
%sh unzip ga_sessions.zip

In [None]:
# Copy each extracted parquet file from the driver's local disk into DBFS
for parquet_name in (
    'ga_sessions_main.parquet',
    'ga_sessions_network.parquet',
    'ga_sessions_hits.parquet',
):
    dbutils.fs.cp(
        'file:/databricks/driver/' + parquet_name,
        'dbfs:/FileStore/final_project/' + parquet_name,
    )

The column that identifies a session and is **common to all tables** is the `sessionId` column.

Run the following cell to load each dataset into spark dataframes.

In [None]:
# Load the three parquet files (main, hits, network) into Spark dataframes
df_main, df_hits, df_network = (
    spark.read.parquet('/FileStore/final_project/' + parquet_name)
    for parquet_name in (
        'ga_sessions_main.parquet',
        'ga_sessions_hits.parquet',
        'ga_sessions_network.parquet',
    )
)

### Datasets Overview

#### Main dataset

In [None]:
df_main.printSchema()

Besides the session id, the main dataset contains the following columns:
- **visitorId**: The unique identifier for a visitor
- **visitNumber**: The visit number of this user. If this is the first visit to the website, then this is set to 1.
- **visitStartTime**: The timestamp (expressed as POSIX time) of the beginning of the session
- **totals**: A struct with statistics about the session, such as total number of hits, time on site, number of transactions and revenue, etc.
- **channelGrouping**: The channel via which the user came to the Store

#### Hits dataset

In [None]:
df_hits.printSchema()

Besides the session id, the hits dataset contains the following columns:

- **hits**: An array of structs representing all the hits in this session. A hit is an interaction that results in data being sent to Google Analytics. Each struct is a hit defined by the following fields:
    - **hitNumber**: The number of this hit in the session
    - **type**: Type of the hit (PAGE or EVENT)
    - **hour**: Hour of the hit
    - **minute**: Minute of the hit
    - **time**: Time spent on the hit
    - **page**: Information about the page
    - **contentGroup**: Information about the content categorization of the page on the website
    - **product**: Array of structs with product information of all products displayed on the page
    - **eventInfo**: If hit is of type 'EVENT', this field contains information about the event
    - **promotion**: Array of structs with promotion information of all promotions displayed on the page.
    - **promotionActionInfo**: Present when there is a promotion on the hit. It explains whether the promotion was clicked (which corresponds to a hit of type 'EVENT' and this event is a 'Promotion Click'), or the promotion is just viewed on the page but was not clicked. 
    - **transaction**: Information about the transaction when the hit is an event 'Confirm Checkout'. Null otherwise.



#### Network dataset


In [None]:
df_network.printSchema()

Besides the session id, the network dataset contains the following columns:

- **trafficSource**: A struct with information about the source of the session, as well as ads and campaign information
- **device**: A struct with information about the device used in the session
- **geoNetwork**: A struct with information about the geographic location of the user. Most of this information is obscured and only city, country and continent are available.
- **customDimensions**: Extra traffic information. You can ignore this column.


## Dataset analysis and cleaning

Start by checking how many rows the dataframes have.

In [None]:
# Print the row count of each dataframe, in the order main, hits, network
for frame in (df_main, df_hits, df_network):
    print(frame.count())

In [None]:
# nr_rows = WRITE THE SOLUTION HERE

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `nr_rows` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(nr_rows, int)
    assert hashlib.sha256(str(nr_rows).encode('utf-8')).hexdigest() == '5a0d24b5fcc0584bfd8d51a4fa0ebb838ef1e9769707e17c7e8002438909e383'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

Now, see if there are any missing values on the main dataset.

In [None]:
df_main.describe().display()

Assume that if the channel grouping is missing, the channel via which the user came to the Store is 'Direct' and fill the missing values.

In [None]:
# Sessions without a recorded channel are treated as 'Direct' traffic
df_main = df_main.fillna({'channelGrouping': 'Direct'})

df_main.describe().display()

In [None]:
# Run this test to verify that the answer is correct
# Collect every channelGrouping value to the driver so it can be inspected in Python
channel_grouping_values = df_main.select('channelGrouping').rdd.flatMap(lambda x: x).collect()
# `except Exception` keeps the friendly message for wrong answers while letting
# KeyboardInterrupt/SystemExit propagate.
try:
    assert None not in channel_grouping_values
    assert channel_grouping_values.count('Direct') == 9587
    print('Good job! You managed to fill the missing values')
except Exception:
    print('Try again')

Now take a closer look into the `geoNetwork` column of the network dataframe. Count how many times a session is derived from each continent.

In [None]:
# Number of sessions per continent, read from the `geoNetwork` struct
(
    df_network
    .select(f.col('geoNetwork.continent').alias('continent'))
    .groupBy('continent')
    .agg(f.count('continent').alias('count'))
    .display()
)

You see the problem? Sometimes a continent is written in lower case and other times in upper case. Standardize the data to make the continent always lower case.

**Hint:** You learned how to lower case a String column during the course. But now we want to lower case the field `continent` of the struct column `geoNetwork`. [Here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/column.html) you can check PySpark column methods. One of them will help you solve the problem.

In [None]:
# Rewrite the `continent` field inside the `geoNetwork` struct in lower case
df_network = df_network.withColumn(
    'geoNetwork',
    f.col('geoNetwork').withField(
        'continent',
        f.lower(f.col('geoNetwork.continent')),
    ),
)

In [None]:
# Run this test to verify that the answer is correct
# Distinct continent values, collected to the driver as a plain Python list
continents = df_network.select(f.col('geoNetwork').getField('continent').alias('continent')).distinct().rdd.flatMap(lambda x: x).collect()

# `except Exception` keeps the friendly message for wrong answers while letting
# KeyboardInterrupt/SystemExit propagate.
try:
    for continent in continents:
        # A null continent cannot be lower-cased; skip it rather than
        # reporting a correct solution as wrong.
        if continent is not None:
            assert continent == continent.lower()
    print('Good job! You managed to standardize the data')
except Exception:
    print('Try again')

## Answer business questions

### Easy questions


Users access the store through different channels, and each session has a corresponding revenue value.

1. Which channel generates the highest total revenue across all sessions?

Notes:
- Use the `channelGrouping` column in the main dataset for channel types.
- Calculate the revenue using the `totalTransactionRevenue` field within the `totals` column.

In [None]:
# Total transaction revenue per channel, highest first
(
    df_main
    .groupBy('channelGrouping')
    .agg(f.sum('totals.totalTransactionRevenue').alias('revenue'))
    .orderBy(f.col('revenue').desc())
    .display()
)

In [None]:
# channel = WRITE THE SOLUTION HERE

In [None]:
channel = "Referral"

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `channel` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(channel, str)
    assert hashlib.sha256(channel.encode('utf-8')).hexdigest() == 'aeb7b00433003f75c286e214eccd11a1e4ba6fbd0d0413cb35864749821cc8e0'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

2. Users access the store through different browsers. Which are the top 3 browsers ranked by the total time users spent on the site?

Notes:
- You can find the browser used by a user on a session in the `device` column of the network dataframe
- The total time spent on site on a session is registered on the `totals` column of the main dataframe

In [None]:
# Total time on site per browser, highest first
(
    df_main
    .join(df_network, on='sessionId')
    .groupBy(f.col('device.browser').alias('browser'))
    .agg(f.sum('totals.timeOnSite').alias('total_time'))
    .orderBy(f.col('total_time').desc())
    .display()
)

In [None]:
top_browsers = ['Chrome', 'Safari', 'Firefox']

In [None]:
# top_browsers = ['Browser1', 'Browser2', 'Browser3'] REPLACE THE VALUES WITH THE CORRECT ONES

In [None]:
# Run this test to verify that the answer is correct
# Needs the `json` module imported at the top of the notebook; without it the
# check always failed with a swallowed NameError.
# `except Exception` keeps the friendly message for wrong answers while letting
# KeyboardInterrupt/SystemExit propagate.
try:
    assert isinstance(top_browsers, list)
    assert len(top_browsers) == 3
    for browser in top_browsers:
        assert isinstance(browser, str)
    assert hashlib.sha256(json.dumps(''.join(top_browsers)).encode()).hexdigest() == '13a1c6bec3d2d1c3d8be28dbf835880972effd2ff9619f1b15d6fee8d505ff5c'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')
    print('Check if you wrote the browser names starting with capital letter')

3. Analyse the website traffic (total number of sessions) per hour of the day and day of the week.

Visualize the result using a pivot table.

**NOTE:** The start time of each session is in UNIX time. You may have to first transform it to a date before being able to extract the hour and day of week.

What is the total number of sessions registered at 8pm on Tuesdays?

In [None]:
# Sessions per hour of day (rows) and day of week (columns).
# `dayofweek` numbers the days from 1 (Sunday) to 7 (Saturday).
(
    df_main
    .select(
        'sessionId',
        f.dayofweek(f.from_unixtime('visitStartTime')).alias('day_of_week'),
        f.hour(f.from_unixtime('visitStartTime')).alias('hour'),
    )
    .groupBy('hour')
    .pivot('day_of_week')
    .agg(f.count('sessionId'))
    .orderBy('hour')
    .display()
)

In [None]:
# nr_sessions = WRITE THE SOLUTION HERE

In [None]:
nr_sessions = 330

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `nr_sessions` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(nr_sessions, int)
    assert hashlib.sha256(str(nr_sessions).encode('utf-8')).hexdigest() == '5426d2ca50f244fb43fe9eafc82da08f33f3b4f8d9140802bd0102e780b629d6'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

4. Identify the `visitorId` of the user with the highest average time gap between two consecutive sessions. Consider only visitors that have more than 6 registered sessions.

In [None]:
# Sessions of each visitor in chronological order
window = Window.partitionBy('visitorId').orderBy('visitStartTime')

# Gap in days between each session and the visitor's previous one, averaged per
# visitor; only visitors with more than 6 sessions are kept.
(
    df_main
    .select(
        'visitorId',
        'sessionId',
        f.date_diff(
            f.from_unixtime('visitStartTime'),
            f.lag(f.from_unixtime('visitStartTime')).over(window),
        ).alias('days_gap'),
    )
    .groupBy('visitorId')
    .agg(
        f.count('sessionId').alias('nr_sessions'),
        f.avg('days_gap').alias('avg_days_gap'),
    )
    .where(f.col('nr_sessions') > 6)
    .orderBy(f.col('avg_days_gap').desc())
    .limit(1)
    .display()
)

In [None]:
# visitor_id = WRITE THE SOLUTION HERE

In [None]:
visitor_id = '8436426603099391262'

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `visitor_id` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(visitor_id, str)
    assert hashlib.sha256(visitor_id.encode('utf-8')).hexdigest() == '5abdd1c91bc52becf7267a3bacfb9d5e979f54f234919d982ed0c1f61cb6425a'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

### Medium questions

5. What are the top 5 products that are most added to the cart?

**NOTES:**
- A hit of type 'EVENT' can correspond to one of the following event actions (`eventAction` field of `eventInfo`):
    - Product Click
    - Add to Cart
    - Remove from Cart
    - Quickview Click
    - Onsite Click
    - Promotion Click
- A product is identified by its SKU value. You can find this value in field `productSKU` of a product. Remember that the `product` field is an array of product information of all products involved in a hit.

In [None]:
# One row per hit, restricted to 'Add to Cart' events, counted per product.
# The product is taken from the first element of the hit's `product` array.
(
    df_hits
    .select('sessionId', f.inline('hits'))
    .where(f.col('type') == 'EVENT')
    .where(f.col('eventInfo.eventAction') == "Add to Cart")
    .groupBy(
        f.element_at('product', 1)['productSKU'].alias('product_sku'),
        f.element_at('product', 1)['v2ProductName'].alias('product_name'),
    )
    .agg(f.count('sessionId').alias('nr_added_to_cart'))
    .orderBy(f.col('nr_added_to_cart').desc())
    .display()
)

In [None]:
# top_products = ["product_sku_1", "product_sku_2", ...] REPLACE THE VALUES WITH THE CORRECT ONES

In [None]:
top_products = ["GGOEGFKQ020399", "GGOEGAAX0037", "GGOEGAAX0104", "GGOEGAAX0342", "GGOEGAAX0074"]

In [None]:
# Run this test to verify that the answer is correct
# Needs the `json` module imported at the top of the notebook; without it the
# check always failed with a swallowed NameError.
# `except Exception` keeps the friendly message for wrong answers while letting
# KeyboardInterrupt/SystemExit propagate.
try:
    assert isinstance(top_products, list)
    assert len(top_products) == 5
    for product in top_products:
        assert isinstance(product, str)
    assert hashlib.sha256(json.dumps(''.join(top_products)).encode()).hexdigest() == '2758b9ec187f6a3f3d13a35a194c82b92e12ba7b995037a84c3f4262dacac35f'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')
    print('Make sure you wrote the product ids in the correct order')

6. What is the average time spent by users on the 'Shopping Cart' page in sessions where a purchase was made?

Answer with 2 decimal places.

**NOTES**
- To determine sessions where purchases were made, filter the main dataframe by checking the `transactions` field of the `totals` column. If the field is non-null and greater than 0, it indicates that a purchase occurred during the session.
- Hits that correspond to users being on the 'Shopping Cart' page are of type 'PAGE', and the `pageTitle` field in `page` is 'Shopping Cart'.
- The time spent on a hit is available on the `time` field of the `hits` column of the hits dataframe

In [None]:
# Average `time` of 'Shopping Cart' page hits, over sessions with at least one
# transaction, rounded to 2 decimals.
(
    df_hits
    .select('sessionId', f.inline('hits'))
    .join(df_main, on='sessionId')
    .where(f.col('totals.transactions').isNotNull())
    .where(f.col('totals.transactions') > 0)
    .where(f.col('type') == 'PAGE')
    .where(f.col('page.pageTitle') == 'Shopping Cart')
    .agg(f.round(f.avg('time'), 2).alias('avg_time'))
    .display()
)

In [None]:
# avg_time = WRITE THE SOLUTION HERE

In [None]:
avg_time = 767495.67

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `avg_time` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(avg_time, float)
    assert hashlib.sha256(str(avg_time).encode('utf-8')).hexdigest() == '5434c334d8669be826679ae262c9feb5b895a269d1afde5fd2bdb3fe24fa2d2c'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

### Hard questions

7. Considering only sessions where there was a promotion click and at least one product was added to the cart, what is the id of the most clicked promotion?

**NOTES:**
- You can check if a product was added to the cart or a promotion was clicked by analysing the `eventInfo` column. A hit of type 'EVENT' can correspond to one of the following event actions (`eventAction` field of `eventInfo` column):
    - Product Click
    - Add to Cart
    - Remove from Cart
    - Quickview Click
    - Onsite Click
    - Promotion Click
- For hits where there was a promotion click, the column `promotion` contains an array with only one element: the details of the clicked promotion. You can find the promotion id in the `promoId` field of that element.

In [None]:
# 1. Build, per session, the list of event actions ('None' for non-event hits).
# 2. Keep sessions whose list contains both 'Add to Cart' and 'Promotion Click'.
# 3. Explode the hits of those sessions and count them per id of the first
#    promotion attached to the hit.
# NOTE(review): step 3 counts every hit of the qualifying sessions, not only
# the 'Promotion Click' hits — confirm this matches "most clicked promotion".
# NOTE(review): `visitId` is assumed to be a column of `df_hits`; the text above
# only lists `sessionId` and `hits` — verify against the schema.
(
    df_hits
    .withColumn(
        'events_list',
        f.transform(
            'hits',
            lambda hit: (
                f.when(hit['type'] == 'EVENT', hit['eventInfo']['eventAction'])
                .otherwise('None')
            ),
        ),
    )
    .where(f.array_contains('events_list', 'Add to Cart'))
    .where(f.array_contains('events_list', 'Promotion Click'))
    .select('visitId', f.inline('hits'))
    .groupBy(f.element_at('promotion', 1).getField('promoId'))
    .agg(f.count('visitId').alias('nr_clicks'))
    .orderBy(f.col('nr_clicks').desc())
    .display()
)

In [None]:
# promo_id = WRITE THE SOLUTION HERE

In [None]:
promo_id = "Apparel Row 1"

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `promo_id` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(promo_id, str)
    assert hashlib.sha256(promo_id.encode('utf-8')).hexdigest() == '2f34cf7b9ce1f5062f0fe6f8f9a6d073214fc6869ba4a85014bab1cf672e80cc'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

8. Identify the user that most views promotions in sessions but never clicks on them.

Use a UDF to answer the question.

**NOTES:**
- You can check if promotions were viewed on a hit by checking the `promoIsView` field of the `promotionActionInfo` column.
- Similarly, you can see if a user clicked on a promotion on a hit by checking the `promoIsClick` field of the `promotionActionInfo` column.

In [None]:
from pyspark.sql.types import BooleanType

def view_but_no_click(hits_list):
    """Tell whether a session viewed at least one promotion and clicked none.

    `hits_list` is the session's sequence of hits; each hit is indexed by
    field name. Hits whose `promotionActionInfo` is falsy are ignored.
    """
    any_click = False
    any_view = False
    for hit in hits_list:
        promo_info = hit['promotionActionInfo']
        if not promo_info:
            continue
        if promo_info['promoIsClick']:
            any_click = True
        if promo_info['promoIsView']:
            any_view = True

    return any_view and not any_click

# Wrap the Python predicate as a Spark UDF returning a boolean
view_but_no_click_udf = f.udf(view_but_no_click, BooleanType())

# Flag each session, keep the flagged ones and count them per visitor
(
    df_hits
    .join(df_main, on='sessionId')
    .withColumn('view_but_no_click', view_but_no_click_udf('hits'))
    .where(f.col('view_but_no_click'))
    .groupBy('visitorId')
    .agg(
        f.sum(f.when(f.col('view_but_no_click'), 1).otherwise(0)).alias('nr_sessions')
    )
    .orderBy(f.col('nr_sessions').desc())
    .display()
)

In [None]:
# visitor_id = WRITE THE SOLUTION HERE

In [None]:
visitor_id = '0593150394512575588'

In [None]:
# Run this test to verify that the answer is correct
# `except Exception` still covers a wrong answer (AssertionError) or a missing
# `visitor_id` (NameError), but no longer swallows KeyboardInterrupt/SystemExit.
try:
    assert isinstance(visitor_id, str)
    assert hashlib.sha256(visitor_id.encode('utf-8')).hexdigest() == '230dd9fdc397961e79b1a698614b91f948ac7ab685261b7d21bf842387768fe9'
    print('Good job! The answer is correct')
except Exception:
    print('The answer is not right yet! Try again')

## Sequential analysis

### Sessions path analysis - Simple analysis

We'll focus on these pages to understand where users are most lost in the process:

| Page | pagePathLevel1 |
| - | - |
| Home | /home |
| Item | /google+redesign/ |
| Shopping Cart | /basket.html|
| Payment Method | /payment.html |
| Checkout Confirmation | /ordercompleted.html |

We want to calculate the number and percentage of sessions that stop at each of these stages.

The sequence we're monitoring is the same as the one presented above:

Home -> Item -> Shopping Cart -> Payment Method -> Checkout Confirmation

#### The way you currently do it

To understand how many sessions stop at each stage, you create one table containing the hits for each page and do multiple left joins to monitor the sequence of hits.

First of all, let's process the hits table to put it in the format you're used to dealing with: one row per hit.

In [None]:
# One row per hit, keeping only the session id, the hit number and the
# first-level page path (exposed as `pagePath`).
df_exploded_hits = (
    df_hits
    .select('sessionId', f.inline('hits'))
    .select(
        'sessionId',
        'hitNumber',
        f.col('page.pagePathLevel1').alias('pagePath'),
    )
)

df_exploded_hits.display()

Now let's create one table with the hits for each page and then do the multiple left joins.

This corresponds to the following SQL code:

In [None]:
# Register the PySpark DataFrame as a SQL temporary view named `hits_view`
# so that it can be queried with SQL syntax
df_exploded_hits.createOrReplaceTempView('hits_view')

In [None]:
%sql

-- Funnel Home -> Item -> Shopping Cart -> Payment Method -> Checkout Confirmation,
-- computed with one CTE per page and a chain of LEFT JOINs.

-- Hits on the pages of interest. "/yourinfo.html" and "/revieworder.html" pass
-- this filter but are not selected by any of the CTEs below.
WITH filtered_hits AS (
  SELECT sessionId, pagePath, hitNumber
  FROM hits_view
  WHERE pagePath IN ("/home", "/google+redesign/", "/basket.html", "/yourinfo.html", "/payment.html", "/revieworder.html", "/ordercompleted.html")
),
-- One CTE per funnel page: the session and hit number of every hit on that page
home_hits AS (
  SELECT sessionId, hitNumber
  FROM filtered_hits
  WHERE pagePath == "/home"
),
item_hits AS (
  SELECT sessionId, hitNumber
  FROM filtered_hits
  WHERE pagePath == "/google+redesign/"
),
basket_hits AS (
  SELECT sessionId, hitNumber
  FROM filtered_hits
  WHERE pagePath == "/basket.html"
),
payment_hits AS (
  SELECT sessionId, hitNumber
  FROM filtered_hits
  WHERE pagePath == "/payment.html"
),
order_completed_hits AS (
  SELECT sessionId, hitNumber
  FROM filtered_hits
  WHERE pagePath == "/ordercompleted.html"
),
-- Each stage is joined on the same session as the home hit and must have a
-- higher hit number than the previous stage; a stage that was not reached
-- leaves its sessionId column NULL.
uni AS (
  SELECT
    home_hits.sessionId AS home_sessionId, 
    item_hits.sessionId AS item_sessionId,
    basket_hits.sessionId AS basket_sessionId,
    payment_hits.sessionId AS payment_sessionId,
    order_completed_hits.sessionId AS order_completed_sessionId
  FROM home_hits
  LEFT JOIN item_hits ON home_hits.sessionId == item_hits.sessionId AND home_hits.hitNumber < item_hits.hitNumber
  LEFT JOIN basket_hits ON home_hits.sessionId == basket_hits.sessionId AND item_hits.hitNumber < basket_hits.hitNumber
  LEFT JOIN payment_hits ON home_hits.sessionId == payment_hits.sessionId AND basket_hits.hitNumber < payment_hits.hitNumber
  LEFT JOIN order_completed_hits ON home_hits.sessionId == order_completed_hits.sessionId AND payment_hits.hitNumber < order_completed_hits.hitNumber
)

-- Distinct sessions per stage, and each stage as a percentage of the previous one
SELECT
  COUNT(DISTINCT home_sessionId) AS total_home,
  COUNT(DISTINCT item_sessionId) AS total_item,
  (COUNT(DISTINCT item_sessionId) / COUNT(DISTINCT home_sessionId)) * 100 AS 1st_2nd,
  COUNT(DISTINCT basket_sessionId) AS basket_item,
  (COUNT(DISTINCT basket_sessionId) / COUNT(DISTINCT item_sessionId)) * 100 AS 2nd_3rd,
  COUNT(DISTINCT payment_sessionId) AS payment_item,
  (COUNT(DISTINCT payment_sessionId) / COUNT(DISTINCT basket_sessionId)) * 100 AS 3th_4th,
  COUNT(DISTINCT order_completed_sessionId) AS order_completed_item,
  (COUNT(DISTINCT order_completed_sessionId) / COUNT(DISTINCT payment_sessionId)) * 100 AS 4th_5th
FROM uni
This is the corresponding code in PySpark.

In [None]:
# Restrict the exploded hits to the five funnel pages
filtered_hits = df_exploded_hits.where(
    f.col('pagePath').isin(
        '/home',
        '/google+redesign/',
        '/basket.html',
        '/payment.html',
        '/ordercompleted.html',
    )
)


def _page_hits(page_path, prefix):
    """Return the hits on `page_path` with both columns renamed to start with `prefix`."""
    return (
        filtered_hits
        .where(f.col('pagePath') == page_path)
        .select(
            f.col('sessionId').alias(f'{prefix}_sessionId'),
            f.col('hitNumber').alias(f'{prefix}_hitNumber'),
        )
    )


# One DataFrame per funnel page
home_hits = _page_hits('/home', 'home')
item_hits = _page_hits('/google+redesign/', 'item')
basket_hits = _page_hits('/basket.html', 'basket')
payment_hits = _page_hits('/payment.html', 'payment')
order_completed_hits = _page_hits('/ordercompleted.html', 'order_completed')

# Chain the left joins: each stage must be in the same session as the home hit
# and have a higher hit number than the previous stage.
uni = (
    home_hits
    .join(
        item_hits,
        on=(home_hits['home_sessionId'] == item_hits['item_sessionId'])
        & (home_hits['home_hitNumber'] < item_hits['item_hitNumber']),
        how='left',
    )
    .join(
        basket_hits,
        on=(home_hits['home_sessionId'] == basket_hits['basket_sessionId'])
        & (item_hits['item_hitNumber'] < basket_hits['basket_hitNumber']),
        how='left',
    )
    .join(
        payment_hits,
        on=(home_hits['home_sessionId'] == payment_hits['payment_sessionId'])
        & (basket_hits['basket_hitNumber'] < payment_hits['payment_hitNumber']),
        how='left',
    )
    .join(
        order_completed_hits,
        on=(home_hits['home_sessionId'] == order_completed_hits['order_completed_sessionId'])
        & (payment_hits['payment_hitNumber'] < order_completed_hits['order_completed_hitNumber']),
        how='left',
    )
)

# Distinct sessions per stage
result = uni.agg(
    f.countDistinct('home_sessionId').alias('total_home'),
    f.countDistinct('item_sessionId').alias('total_item'),
    f.countDistinct('basket_sessionId').alias('total_basket'),
    f.countDistinct('payment_sessionId').alias('total_payment'),
    f.countDistinct('order_completed_sessionId').alias('total_order_completed'),
)

# Each stage as a percentage of the stage before it
for ratio_name, reached, previous in (
    ('1st_2nd', 'total_item', 'total_home'),
    ('2nd_3rd', 'total_basket', 'total_item'),
    ('3th_4th', 'total_payment', 'total_basket'),
    ('4th_5th', 'total_order_completed', 'total_payment'),
):
    result = result.withColumn(ratio_name, (f.col(reached) / f.col(previous)) * 100)

result.display()


#### The PySpark built-in functions way

Now let's do it in a more efficient way without the need for multiple left joins.

First, start by creating one boolean column for each page path that indicates if the hit is on that page.

Complete the code below:

In [None]:
# Append one boolean column per funnel page telling whether the hit is on that page
df_pages_flag = df_exploded_hits.select(
    "*",
    *[
        (f.col("pagePath") == page_path).alias(flag_name)
        for flag_name, page_path in (
            ("is_home", "/home"),
            ("is_item", "/google+redesign/"),
            ("is_basket", "/basket.html"),
            ("is_payment", "/payment.html"),
            ("is_order_completed", "/ordercompleted.html"),
        )
    ],
)

Next, create the following columns:
- `home_seen`: boolean column that is true if the hit happened after a hit on the home page.
- `item_after_home`: boolean column that is true if the hit is on an item page and the home page has been seen.
- `basket_after_item`: boolean column that is true if the hit is on the shopping cart page and there was a **previous hit on an item page, which, in turn, had a previous hit on the home page**.
- `payment_after_basket`: boolean column that is true if the hit is on the payment page and there was a previous hit on the home, item and basket pages in sequence
- `order_completed_after_payment`: boolean column that indicates if the hit is on the order completed page and there was a previous hit on the home, item, basket and payment pages in sequence

Hint:
- You should create them one after the other. And you should use the column you created in the previous step to create the new column.
- To create some of these columns, you should use a window that goes from the current hit to the first hit of the session. 
- The function to be applied over the window needs to be a window function that returns True if some condition holds. The condition is that there was a previous hit on the page you're interested in.

In [None]:
# Running frame per session: from the first hit up to and including the current hit
windowSpec = Window.partitionBy("sessionId").orderBy("hitNumber").rowsBetween(Window.unboundedPreceding, 0)

# Every `.over()` below uses `windowSpec`. The previous code used `window`,
# the window defined for question 4 (partitioned by `visitorId` and ordered by
# `visitStartTime`), whose columns do not exist in `df_pages_flag`.
# `f.max` over a boolean column is true as soon as one row in the frame is true.
df = (
    df_pages_flag
    # The home page was hit at or before the current hit
    .withColumn("home_seen", f.max("is_home").over(windowSpec))
    # Item page hit once home has been seen
    .withColumn("item_after_home", f.col('is_item') & f.col('home_seen'))
    # Basket page hit after an item hit that itself followed home
    .withColumn("basket_after_item", f.col('is_basket') & f.max('item_after_home').over(windowSpec))
    # Payment page hit after home -> item -> basket
    .withColumn("payment_after_basket",
                f.col("is_payment") & f.max('basket_after_item').over(windowSpec)
    )
    # Order-completed page hit after home -> item -> basket -> payment
    .withColumn("order_completed_after_payment",
                f.col("is_order_completed") & f.max('payment_after_basket').over(windowSpec)
    )
)

Finally, calculate the number of distinct sessions where each of these columns is true. This will give you the number of sessions that stop at each stage.

You can store each of these values in a column of a final results dataframe.

After that, calculate the percentage of sessions that reach each stage. For that, divide the number of sessions that reach a stage by the number of sessions that reached the previous stage.

In [None]:
# Distinct sessions in which each stage flag is true at least once
result = df.agg(
    *[
        f.countDistinct(f.when(f.col(flag), f.col("sessionId"))).alias(total_name)
        for flag, total_name in (
            ("home_seen", "total_home"),
            ("item_after_home", "total_item_after_home"),
            ("basket_after_item", "total_basket_after_item"),
            ("payment_after_basket", "total_payment_after_basket"),
            ("order_completed_after_payment", "total_order_completed_after_payment"),
        )
    ]
)

# Share of the sessions from the previous stage that reach each stage
for ratio_name, reached, previous in (
    ("item_after_home_ratio", "total_item_after_home", "total_home"),
    ("basket_after_item_ratio", "total_basket_after_item", "total_item_after_home"),
    ("payment_after_basket_ratio", "total_payment_after_basket", "total_basket_after_item"),
    (
        "order_completed_after_payment_ratio",
        "total_order_completed_after_payment",
        "total_payment_after_basket",
    ),
):
    result = result.withColumn(ratio_name, f.col(reached) / f.col(previous))

result.display()

#### The UDFs way

Now, let's do it using UDFs.

Start by creating a function that returns True if, for a given session, there is a hit on an item page after a hit on the home page. Then register this function as a UDF. After that, create a new boolean column on the hits dataframe that is the result of applying this UDF to the `hits` column.

It is easier to use UDFs on top of the original hits dataframe rather than the exploded one. Since we have one row for each session, and each row has an array of hits, we can iterate through an array using python, and it becomes easy to deal with this complex column.

In [None]:
from pyspark.sql.types import BooleanType

@f.udf(BooleanType())
def item_after_home(hits):
    """Return True if an item-page hit comes after a home-page hit in `hits`.

    `hits` is the sequence of hits of one session; each hit is indexed by
    field name.
    """
    home_reached = False

    for hit in hits:
        path = hit['page']['pagePathLevel1']
        if path == '/home':
            home_reached = True
        elif home_reached and path == '/google+redesign/':
            return True

    return False



# Apply the UDF to the `hits` array of every session
df_hits_item_after_home = df_hits.withColumn(
    'item_after_home',
    item_after_home('hits'),
)

df_hits_item_after_home.display()

Now elaborate on that function. Instead of returning just a boolean, return an array of booleans, one for each page path. Then, register this function as a UDF and apply it to the hits dataframe.

In [None]:
from pyspark.sql.types import BooleanType, ArrayType

@f.udf(ArrayType(BooleanType()))
def stages(hits):
    """Report which funnel stages a session reached, each after the previous one.

    Args:
        hits: The session's ``hits`` array, scanned in the order it is
            received (assumed to be hit order -- confirm against the data).
            Each element must expose ``hit['page']['pagePathLevel1']``.
            May be ``None`` or empty.

    Returns:
        list[bool]: Five flags, in this order: ``home``, ``item_after_home``,
        ``basket_after_item``, ``payment_after_basket``,
        ``order_completed_after_payment``. All ``False`` for a null or
        empty array.
    """
    home = False
    item_after_home = False
    basket_after_item = False
    payment_after_basket = False
    order_completed_after_payment = False

    # `hits or []`: iterating a null array would raise inside the UDF and
    # fail the Spark task; a session without hits reached no stage.
    for hit in hits or []:
        page_path = hit['page']['pagePathLevel1']

        # Each stage can only be switched on once the previous stage has
        # already been reached, which enforces the order of the funnel.
        if page_path == '/home':
            home = True

        if home and (page_path == '/google+redesign/'):
            item_after_home = True

        if item_after_home and (page_path == '/basket.html'):
            basket_after_item = True

        if basket_after_item and (page_path == '/payment.html'):
            payment_after_basket = True

        if payment_after_basket and page_path == '/ordercompleted.html':
            order_completed_after_payment = True

    # Return the flags as a list, in funnel order
    return [
        home,
        item_after_home,
        basket_after_item,
        payment_after_basket,
        order_completed_after_payment,
    ]


# Attach the array of stage flags computed by the `stages` UDF to every
# session row.
df_stages = df_hits.withColumn(
    'stages',
    stages(f.col('hits')),
)

df_stages.display()

Now create a boolean column for each stage. Basically, create one column for each array element.

In [None]:
# Spread the `stages` array into one boolean column per funnel stage.
# `element_at` is 1-based, hence `start=1`.
df = df_stages.select(
    'sessionId',
    *[
        f.element_at('stages', position).alias(column_name)
        for position, column_name in enumerate(
            [
                'home_seen',
                'item_after_home',
                'basket_after_item',
                'payment_after_basket',
                'order_completed_after_payment',
            ],
            start=1,
        )
    ],
)

You can now perform the same statistical calculations that you did when using the PySpark built-in functions.

Using a UDF is just an alternative way of finding out if a session reached a certain stage. After this information is available, you can use the built-in functions to calculate the number of sessions that reached each stage and the percentage of sessions that reached each stage.

In [None]:
# Count the distinct sessions that reached each stage, then derive the
# conversion ratio between consecutive stages.
#
# The aggregation runs on `df`, the dataframe holding one boolean column per
# stage. `df_stages` only adds the `stages` array to the hits dataframe, so
# `home_seen`, `item_after_home`, ... cannot be resolved on it.
result = (
    df
    .agg(
        # `when` without `otherwise` yields null for sessions that did not
        # reach the stage, and `countDistinct` ignores nulls.
        f.countDistinct(f.when(f.col("home_seen"), f.col("sessionId"))).alias("total_home"),
        f.countDistinct(f.when(f.col("item_after_home"), f.col("sessionId"))).alias("total_item_after_home"),
        f.countDistinct(f.when(f.col("basket_after_item"), f.col("sessionId"))).alias("total_basket_after_item"),
        f.countDistinct(f.when(f.col("payment_after_basket"), f.col("sessionId"))).alias("total_payment_after_basket"),
        f.countDistinct(f.when(f.col("order_completed_after_payment"), f.col("sessionId"))).alias("total_order_completed_after_payment")
    )
    .withColumn(
        "item_after_home_ratio",  f.col("total_item_after_home") / f.col("total_home")
    ).withColumn(
        "basket_after_item_ratio",  f.col("total_basket_after_item") / f.col("total_item_after_home")
    ).withColumn(
        "payment_after_basket_ratio", f.col("total_payment_after_basket") / f.col("total_basket_after_item")
    ).withColumn(
        "order_completed_after_payment_ratio", 
        f.col("total_order_completed_after_payment") / f.col("total_payment_after_basket")
    )
)

result.display()

### Promotion effectiveness - More complex path analysis


In question 6 we saw which was the most clicked promotion in sessions where products were added to the cart and the promotion was clicked.

However, we are not sure if clicking the promotion is what led to the products being added to the cart. 

So now let's see which promotion actually led to additions to the cart. For that we need to do sequential analysis.

In [None]:
from pyspark.sql.types import MapType, StringType, IntegerType

def promotion_purchases(hits):
    """Count the 'Add to Cart' events that follow a promotion click on the same page.

    A promotion becomes "active" when a 'Promotion Click' event is seen. It
    stays active for the hit right after the click (the page the click leads
    to) and for as long as the page title does not change afterwards. Every
    'Add to Cart' event seen while a promotion is active is credited to it.

    Args:
        hits: The session's hits, scanned in the order received (assumed to
            be hit order -- confirm against the data). Each hit must support
            ``hit['page']['pageTitle']``, ``hit['eventInfo']`` (falsy when
            the hit is not an event) and ``hit['promotion']`` (a sequence
            whose first element exposes ``'promoId'``). May be ``None`` or
            empty.

    Returns:
        dict: Maps each credited ``promoId`` to its number of 'Add to Cart'
        events. Empty when ``hits`` is null or empty, or nothing is credited.
    """
    promos = {}

    # A null array would make the loop raise and fail the Spark task.
    if not hits:
        return promos

    current_promo = None
    prev_page = None
    last_was_promo = False

    for hit in hits:
        current_page = hit['page']['pageTitle']

        # Resolve the event action once; non-event hits have no eventInfo.
        event_info = hit['eventInfo']
        event_action = event_info['eventAction'] if event_info else None

        if last_was_promo:
            # This hit is the page reached through the click: the page change
            # is expected here and must not drop the active promotion.
            last_was_promo = False
        elif current_page != prev_page:
            # The user moved to another page: the promotion no longer applies.
            current_promo = None

        if event_action == 'Promotion Click':
            # A click without promotion details cannot be attributed, so it
            # leaves no active promotion instead of raising on `[0]`.
            promotions = hit['promotion']
            current_promo = promotions[0]['promoId'] if promotions else None
            last_was_promo = True

        if current_promo and event_action == 'Add to Cart':
            promos[current_promo] = promos.get(current_promo, 0) + 1

        prev_page = current_page

    return promos


# Wrap the plain Python function as a UDF that returns a promoId -> count map.
promotion_purchases_udf = f.udf(
    promotion_purchases,
    returnType=MapType(StringType(), IntegerType()),
)

# One map per session with the cart additions credited to each promotion.
res = df_hits.withColumn(
    'promotion_purchases',
    promotion_purchases_udf(f.col('hits')),
)

res.display()

# Exploding a map column yields one (key, value) row per entry; rename both
# columns and add the counts up per promotion across all sessions.
(
    res
    .select(f.explode(f.col('promotion_purchases')))
    .withColumnRenamed('key', 'promoId')
    .withColumnRenamed('value', 'nr_purchases')
    .groupBy('promoId')
    .agg(f.sum('nr_purchases').alias('total_purchases'))
    .orderBy(f.col('total_purchases').desc())
).display()

In [None]:
# get origin promo
# if we got to a page because a promo was clicked, set origin promo to promo name
# every time a promo is clicked, the next hit is a 'PAGE' hit with the resulting page

# All the hits of one session, in hit order.
window_origin_promo = (
    Window
    .partitionBy('visitId', 'visitNumber', 'visitorId', 'visitStartTime')
    .orderBy('hitNumber')
)

df_origin_promo = (
    df_hits
    # One row per hit: `inline` turns every struct of the `hits` array into
    # a row, with the struct fields as columns.
    .select(
        'visitId', 'visitNumber', 'visitorId', 'visitStartTime',
        f.inline('hits'),
    )
    .withColumn(
        'origin_promo',
        # If the previous hit of the session was a 'Promotion Click' event,
        # carry over the promotion of that previous hit; otherwise null.
        f.when(
            f.lag(f.col('eventInfo')).over(window_origin_promo).isNotNull()
            & (
                f.lag(f.col('eventInfo').getField('eventAction')).over(window_origin_promo)
                == 'Promotion Click'
            ),
            f.lag('promotion').over(window_origin_promo),
        ).otherwise(None),
    )
    # Keep the session keys plus the hit fields needed by the next steps.
    .select(
        'visitId', 'visitNumber', 'visitorId', 'visitStartTime',
        'type',
        'hitNumber',
        f.col('page').getField('pageTitle').alias('pageTitle'),
        f.col('eventInfo').getField('eventAction').alias('eventAction'),
        'promotion',
        'origin_promo',
    )
)

# df_origin_promo.display()


# Now let's see what happens in each session and each visited page
# We can only conclude that an item was added to the cart because of a promotion if a promotion click led to a page and the user never left that page before adding an item to the cart
# But how can we see if the user never left the page between a promotion click and adding an item to the cart? We need to check if the hit numbers are sequential between these two events

# First create a new column with the last hit number on each page on a session
# Then create a 'sequential' column that is True if the row's hitNumber is equal to the last hit number plus one and False otherwise

# 'sequential' is False every time the user enters the page during the session. So, it is False the first time the user enters the page, but also every time the user enters the page again after having left it.

# It is important to track this events because we will only want to consider that an addition to the cart was due to a promotion click if the user never left the page between clicking the promotion and adding the item to the cart.
# Imagine the scenario where user clicks promotion -> gets into page A -> exits page A and visits page B -> goes back to page A -> adds item to cart
# This case should not count for the promotion's success cases

# All the hits of one session on one page title, in hit order.
window_page = (
    Window
    .partitionBy('visitId', 'visitNumber', 'visitorId', 'visitStartTime', 'pageTitle')
    .orderBy('hitNumber')
)

df_sequential = (
    df_origin_promo
    # Hit number of the previous hit on the same page (null for the first one).
    .withColumn('lastHitNumber', f.lag('hitNumber').over(window_page))
    # True only when this hit directly follows the previous hit on the page.
    # The comparison is null when there is no previous hit, and `coalesce`
    # turns that null into False.
    .withColumn(
        'sequential',
        f.coalesce(
            f.col('hitNumber') == (f.col('lastHitNumber') + 1),
            f.lit(False),
        ),
    )
)

# df_sequential.display()

# Now let's register the page view: a column that identifies which visit to the page, within the session, each hit belongs to
# Page views do not need to be numbered consecutively. To make things easier, let's identify a page view by the hit number of the hit that entered the page after being on a different page

# Let's set the page view as the hit number and then set as the last hit number for all the rows where sequential = True
# Need to use ignoreNulls = True

df_page_view = df_sequential.withColumn(
    'pageView',
    # Non-sequential hits are the ones that (re-)enter the page: they carry
    # their own hit number, every other hit carries null. Taking the last
    # non-null value over the ordered page window then fills each following
    # hit with the hit number that opened its page view.
    f.last(
        f.when(~f.col('sequential'), f.col('hitNumber')),
        ignorenulls=True,
    ).over(window_page),
)

# df_page_view.display()

# Finally, we can update the origin promo column so that it propagates to all the hits of a page view that was originated by a promotion click

# For that we can get the last origin promo for each page view in a session
# All the hits of one page view (session + page title + pageView), in hit order.
window_page_view = (
    Window
    .partitionBy(
        'visitId', 'visitNumber', 'visitorId', 'visitStartTime',
        'pageTitle', 'pageView',
    )
    .orderBy('hitNumber')
)

# Carry the most recent non-null origin promo forward inside each page view.
df_final = df_page_view.withColumn(
    'origin_promo',
    f.last('origin_promo', ignorenulls=True).over(window_page_view),
)

# df_final.display()

# Keep the cart additions that happened in a page view opened by a promotion
# click, and rank the promotions by how many additions each one gets.
(
    df_final
    .where(
        f.col('origin_promo').isNotNull()
        & (f.col('eventAction') == 'Add to Cart')
    )
    # `origin_promo` is an array; group by the promoId of its first element.
    .groupBy(
        f.element_at(f.col('origin_promo'), 1).getField('promoId').alias('promoId')
    )
    .agg(f.count('visitId').alias('nr_purchases'))
    .orderBy(f.col('nr_purchases').desc())
).display()