In [0]:
# Row level transformation - select
# Filtering of the data - filter or where
# Grouping of the data - groupBy
# Joining the data - join
# Sorting of the data - orderBy or sort

In [0]:
# select, filter or where => Narrow transformations
# groupBy, orderBy or sort => Wide transformations (shuffling)
# join (Broadcast) => Narrow transformation
# join (reduce side) => Wide transformation (shuffling)

In [0]:
# Sample user records illustrating a row-level transformation: the trailing
# comment on each row shows the same record with a derived 'user_name' field
# built from 'user_fname' and 'user_lname'.
[
    {'user_id': 1, 'user_fname': 'Scott', 'user_lname': 'Tiger'}, # => {'user_id': 1, 'user_fname': 'Scott', 'user_lname': 'Tiger', 'user_name': 'Scott, Tiger'}
    {'user_id': 2, 'user_fname': 'Donald', 'user_lname': 'Duck'}, # => {'user_id': 2, 'user_fname': 'Donald', 'user_lname': 'Duck', 'user_name': 'Donald, Duck'}
    {'user_id': 3, 'user_fname': 'Mickey', 'user_lname': 'Mouse'} # => {'user_id': 3, 'user_fname': 'Mickey', 'user_lname': 'Mouse', 'user_name': 'Mickey, Mouse'}
]

In [0]:
# Load the orders CSV data with an explicit DDL schema so the column types
# (including order_date as DATE) are fixed up front.
orders_df = spark.read.csv(
    'dbfs:/public/retail_db/orders',
    schema='order_id INT, order_date DATE, order_customer_id INT, order_status STRING',
)

In [0]:
# Preview the orders data that was just loaded.
display(orders_df)

In [0]:
# 2013-07-25 => 201307

In [0]:
# `cast` is not a member of pyspark.sql.functions (casting is done with the
# Column.cast method), so importing it raised ImportError. Import only
# date_format, which is the function the following cells actually need.
from pyspark.sql.functions import date_format

In [0]:
# Show the documentation of date_format before using it.
help(date_format)

In [0]:
# Add an order_month column holding order_date formatted as 'yyyyMM'
# (e.g. 2013-07-25 => 201307) and show the result.
display(
    orders_df.withColumn(
        'order_month',
        date_format('order_date', 'yyyyMM'),
    )
)

In [0]:
# Same derived column, but cast to an integer via Column.cast; only the
# resulting schema is printed.
(
    orders_df
    .withColumn('order_month', date_format('order_date', 'yyyyMM').cast('int'))
    .printSchema()
)

In [0]:
# There is no standalone `cast` function to document: casting is a method of
# Column, so show the documentation of that method on an existing column.
help(orders_df['order_date'].cast)

In [0]:
# pyspark.sql.functions has no cast() function, so cast(int, ...) cannot run.
# A function-style CAST is available inside a SQL expression instead:
# selectExpr('*', ...) keeps every existing column and appends order_month
# as an INT, giving the same schema as the Column.cast variant.
orders_df. \
    selectExpr('*', "cast(date_format(order_date, 'yyyyMM') AS int) AS order_month"). \
    printSchema()

In [0]:
# Preview orders_df again before the filtering examples.
display(orders_df)

In [0]:
# Show the documentation of DataFrame.filter.
help(orders_df.filter)

In [0]:
# Show the documentation of DataFrame.where, to compare it with filter.
help(orders_df.where)

In [0]:
# List each distinct order_status value present in the data.
display(
    orders_df
    .select('order_status')
    .distinct()
)

In [0]:
# Keep only COMPLETE orders, using a SQL-style condition string.
# SQL equivalent: SELECT * FROM orders WHERE order_status = 'COMPLETE'
display(
    orders_df.filter("order_status = 'COMPLETE'")
)

In [0]:
# Total number of orders (baseline for the filtered counts that follow).
orders_df.count()

In [0]:
# Number of orders whose status is COMPLETE.
(
    orders_df
    .filter("order_status = 'COMPLETE'")
    .count()
)

In [0]:
# Goal: COMPLETE or CLOSED orders placed in January 2014.
# This cell applies the status condition only and counts the matches.
# SQL condition: WHERE order_status IN ('COMPLETE', 'CLOSED')
(
    orders_df
    .filter("order_status IN ('COMPLETE', 'CLOSED')")
    .count()
)

In [0]:
# Month condition only: orders whose order_date formats to 201401.
# SQL equivalent: SELECT * FROM orders WHERE date_format(order_date, 'yyyyMM') = 201401
display(
    orders_df.filter("date_format(order_date, 'yyyyMM') = 201401")
)

In [0]:
# Number of orders placed in 201401.
(
    orders_df
    .filter("date_format(order_date, 'yyyyMM') = 201401")
    .count()
)

In [0]:
# Both conditions combined with AND: month 201401 and status COMPLETE or CLOSED.
# SQL equivalent:
#   SELECT * FROM orders
#   WHERE date_format(order_date, 'yyyyMM') = 201401 AND order_status IN ('COMPLETE', 'CLOSED')
display(
    orders_df.filter(
        "date_format(order_date, 'yyyyMM') = 201401 AND order_status IN ('COMPLETE', 'CLOSED')"
    )
)

In [0]:
# Number of COMPLETE or CLOSED orders placed in 201401.
(
    orders_df
    .filter("date_format(order_date, 'yyyyMM') = 201401 AND order_status IN ('COMPLETE', 'CLOSED')")
    .count()
)

In [0]:
# Re-create orders_df from the same path and schema as the earlier load, so
# the grouping examples start from the unmodified orders data.
orders_df = spark.read.csv(
    'dbfs:/public/retail_db/orders',
    schema='order_id INT, order_date DATE, order_customer_id INT, order_status STRING',
)

In [0]:
# Preview the reloaded orders data.
display(orders_df)

In [0]:
from pyspark.sql.functions import count, col

In [0]:
# Number of orders per status, largest count first.
# SQL equivalent:
#   SELECT order_status, count(order_id) AS order_count
#   FROM orders GROUP BY order_status ORDER BY order_count DESC
display(
    orders_df
    .groupBy('order_status')
    .agg(count('order_id').alias('order_count'))
    .orderBy(col('order_count').desc())
)

In [0]:
%fs ls dbfs:/public/retail_db/order_items

In [0]:
# Load order_items with an explicit DDL schema instead of schema inference.
# NOTE(review): the monetary columns are declared FLOAT (single precision);
# sums over them may show rounding artefacts - consider DOUBLE or DECIMAL.
order_items_df = spark.read.csv(
    path='dbfs:/public/retail_db/order_items',
    schema='''
        order_item_id INT, order_item_order_id INT, order_item_product_id INT,
        order_item_quantity INT, order_item_subtotal FLOAT, order_item_product_price FLOAT
    '''
)

In [0]:
# Preview the order_items data.
display(order_items_df)

In [0]:
from pyspark.sql.functions import sum, round

In [0]:
# Revenue per order: sum of order_item_subtotal rounded to 2 decimals,
# sorted by order id.
# SQL equivalent:
#   SELECT order_item_order_id, round(sum(order_item_subtotal), 2) AS order_revenue
#   FROM order_items GROUP BY order_item_order_id ORDER BY order_item_order_id
# agg() accepts aggregate functions such as count, sum, min and max.
# `sum` and `round` here are the pyspark.sql.functions imports, not the
# Python builtins of the same name.
display(
    order_items_df
    .groupBy('order_item_order_id')
    .agg(round(sum('order_item_subtotal'), 2).alias('order_revenue'))
    .orderBy('order_item_order_id')
)

In [0]:
# Next task: get the count of orders per month.
# Start by previewing the orders data the aggregation will run on.

display(orders_df)

In [0]:
from pyspark.sql.functions import date_format

In [0]:
# Orders per month: group on order_date formatted as 'yyyyMM' and cast to
# int (aliased order_month), count order ids, and sort by month.
display(
    orders_df
    .groupBy(date_format('order_date', 'yyyyMM').cast('int').alias('order_month'))
    .agg(count('order_id').alias('order_count'))
    .orderBy('order_month')
)

In [0]:
# Show the documentation of DataFrame.sort.
help(orders_df.sort)

In [0]:
# Sorting the data based on one column (ascending or descending)
# Composite Sorting - sorting based on multiple columns (2 or more columns)
  # col1 asc, col2 asc
  # col1 asc, col2 desc
  # col1 desc, col2 asc
  # col1 desc, col2 desc
# Dealing with Nulls

In [0]:
# Preview orders_df before the sorting examples.
display(orders_df)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Build a Column object for order_customer_id and keep it in `c` so it can be
# inspected in the next cell.
c = col('order_customer_id')

In [0]:
# A bare `c.` (an interactive tab-completion prompt) is a SyntaxError when the
# cell is executed. List the public attributes of the Column explicitly
# instead, which shows the same names (asc, desc, cast, ...) reproducibly.
[name for name in dir(c) if not name.startswith('_')]

In [0]:
# Sort on a single column, ascending order stated explicitly.
# SQL equivalent: SELECT * FROM orders ORDER BY order_customer_id
display(
    orders_df.orderBy(col('order_customer_id').asc())
)

In [0]:
# Preview orders_df before the composite sorting examples.
display(orders_df)

In [0]:
# Composite sort: customer id first, then order date (no direction specified
# for either column).
# SQL equivalent: SELECT * FROM orders ORDER BY order_customer_id, order_date
display(
    orders_df.orderBy(
        'order_customer_id',
        'order_date',
    )
)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Composite sort with mixed directions: customer id as given, order date
# descending within each customer.
# SQL equivalent: SELECT * FROM orders ORDER BY order_customer_id, order_date DESC
display(
    orders_df.orderBy(
        'order_customer_id',
        col('order_date').desc(),
    )
)

In [0]:
# Re-create order_items_df from the same path and DDL schema as the earlier
# load, so the sorting example starts from the unmodified data.
# NOTE(review): the monetary columns are declared FLOAT (single precision) -
# consider DOUBLE or DECIMAL if exact amounts matter.
order_items_df = spark.read.csv(
    path='dbfs:/public/retail_db/order_items',
    schema='''
        order_item_id INT, order_item_order_id INT, order_item_product_id INT,
        order_item_quantity INT, order_item_subtotal FLOAT, order_item_product_price FLOAT
    '''
)

In [0]:
# Preview the reloaded order_items data.
display(order_items_df)

In [0]:
# Within each order, list the items from highest to lowest subtotal.
# SQL equivalent:
#   SELECT * FROM order_items ORDER BY order_item_order_id, order_item_subtotal DESC
display(
    order_items_df.orderBy(
        'order_item_order_id',
        col('order_item_subtotal').desc(),
    )
)

In [0]:
%fs ls dbfs:/databricks-datasets

In [0]:
%fs ls dbfs:/databricks-datasets/online_retail/data-001

In [0]:
# Load the online retail sample data; the first line of each file is treated
# as a header and the column types are inferred from the data.
df = spark.read.csv(
    'dbfs:/databricks-datasets/online_retail/data-001',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the online retail data.
display(df)

In [0]:
# Total number of rows in the online retail data.
df.count()

In [0]:
# Rows that have no CustomerID value.
display(
    df.filter('CustomerID IS NULL')
)

In [0]:
# Number of rows that have no CustomerID value.
(
    df
    .filter('CustomerID IS NULL')
    .count()
)

In [0]:
# Compare StockCode against a string literal. With a numeric literal Spark
# coerces every StockCode to a number for the comparison, which silently
# nulls non-numeric codes, or raises an error when ANSI mode is enabled.
# NOTE(review): assumes StockCode is inferred as STRING - confirm with df.printSchema().
display(df.filter("StockCode = '71053'"))

In [0]:
# Rows for stock code 71053 sorted by CustomerID with orderBy's default ordering.
# The code is compared as a string literal: a numeric literal would force a
# numeric cast of every StockCode, which silently nulls non-numeric codes or
# fails under ANSI mode.
# NOTE(review): assumes StockCode is inferred as STRING - confirm with df.printSchema().
display(
    df.filter("StockCode = '71053'")
    .orderBy('CustomerID')
)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Column object for CustomerID.
# NOTE(review): `c` is not referenced by the visible cells that follow; they
# call col('CustomerID') directly.
c = col('CustomerID')

In [0]:
# Rows for stock code 71053 sorted by CustomerID ascending, with null
# CustomerID values placed at the end.
# The code is compared as a string literal: a numeric literal would force a
# numeric cast of every StockCode, which silently nulls non-numeric codes or
# fails under ANSI mode.
# NOTE(review): assumes StockCode is inferred as STRING - confirm with df.printSchema().
display(
    df.filter("StockCode = '71053'")
    .orderBy(col('CustomerID').asc_nulls_last())
)

In [0]:
# Rows for stock code 71053 sorted by CustomerID descending (default null
# placement for desc()).
# The code is compared as a string literal: a numeric literal would force a
# numeric cast of every StockCode, which silently nulls non-numeric codes or
# fails under ANSI mode.
# NOTE(review): assumes StockCode is inferred as STRING - confirm with df.printSchema().
display(
    df.filter("StockCode = '71053'")
    .orderBy(col('CustomerID').desc())
)

In [0]:
# Rows for stock code 71053 sorted by CustomerID descending, with null
# CustomerID values placed at the start.
# The code is compared as a string literal: a numeric literal would force a
# numeric cast of every StockCode, which silently nulls non-numeric codes or
# fails under ANSI mode.
# NOTE(review): assumes StockCode is inferred as STRING - confirm with df.printSchema().
display(
    df.filter("StockCode = '71053'")
    .orderBy(col('CustomerID').desc_nulls_first())
)