In [1]:
from pyspark.sql import SparkSession

In [2]:
%load_ext sparksql_magic

In [3]:
spark = SparkSession.\
    builder.\
    enableHiveSupport().\
    appName("Spark SQL -predefined functions").\
    master("yarn").\
    getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
%%sparksql
show databases

0
namespace
default
exercise
hr
kevin
kevin_retail
retail
sms
test


In [8]:
#spark.sql("SHOW functions").show(1000, truncate=False)

In [9]:
%%sparksql
DESCRIBE FUNCTION substr

0
function_desc
Function: substr
Class: org.apache.spark.sql.catalyst.expressions.Substring
"Usage: substr(str, pos[, len]) - Returns the substring of `str` that starts at `pos` and is of length `len`, or the slice of byte array that starts at `pos` and is of length `len`.  substr(str FROM pos[ FOR len]]) - Returns the substring of `str` that starts at `pos` and is of length `len`, or the slice of byte array that starts at `pos` and is of length `len`."


In [10]:
%%sparksql
DESCRIBE FUNCTION || 

0
function_desc
Function: ||
Usage: expr1 || expr2 - Returns the concatenation of `expr1` and `expr2`.


In [11]:
orders = spark.read. \
    schema("order_id INT, order_date STRING, order_customer_id INT, order_status STRING"). \
    csv("/data/retail_db/orders")
orders.createOrReplaceTempView("orders_temp")

In [12]:
%%sparksql

SELECT current_date AS current_date

                                                                                

0
current_date
2023-03-30


In [13]:
%%sparksql

SELECT substr('Hello World', 1, 5) AS result

                                                                                

0
result
Hello


In [16]:
%%sparksql

USE kevin_retail

In [18]:
%%sparksql

SELECT current_database()

0
current_database()
kevin_retail


In [17]:
%%sparksql

show tables

0,1,2
namespace,tableName,isTemporary
kevin_retail,dual,False
kevin_retail,order_items,False
kevin_retail,order_items_stage,False
kevin_retail,orders,False
kevin_retail,orders_part,False
kevin_retail,orders_single_column,False
kevin_retail,sales,False
,orders_temp,True


In [12]:
%%sparksql

DROP TABLE IF EXISTS dual

In [13]:
%%sparksql

CREATE TABLE dual (dummy STRING)

In [14]:
%%sparksql

INSERT INTO dual VALUES ('X')

                                                                                

In [19]:
%%sparksql

SELECT current_date AS current_date FROM dual

                                                                                

0
current_date
2023-03-30


In [20]:
%%sparksql

SELECT substr('Hello World', 1, 5) AS result FROM dual

0
result
Hello


## String Manipulation Functions



- Case Conversion - lower, upper, initcap
- Getting size of the column value - length
- Extracting Data - substr and split
- Trimming and Padding functions - trim, rtrim, ltrim, rpad and lpad
- Reversing strings - reverse
- Concatenating multiple strings concat and concat_ws


In [17]:
%%sparksql
SELECT lower('hEllo wOrlD') AS lower_result,
    upper('hEllo wOrlD') AS upper_result,
    initcap('hEllo wOrlD') AS initcap_result

0,1,2
lower_result,upper_result,initcap_result
hello world,HELLO WORLD,Hello World


In [18]:
%%sparksql
SELECT length('hEllo wOrlD') AS result

0
result
11


In [19]:
%%sparksql 

SHOW tables

0,1,2
namespace,tableName,isTemporary
kevin_retail,dual,False
kevin_retail,order_items,False
kevin_retail,order_items_stage,False
kevin_retail,orders,False
kevin_retail,orders_part,False


In [20]:
%%sparksql

SELECT * FROM orders LIMIT 10

0,1,2,3
order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT


### Case Conversion and Length

In [21]:
%%sparksql

SELECT order_id, order_date, order_customer_id,
    lower(order_status) AS order_status,
    length(order_status) AS order_status_length
FROM orders LIMIT 10

0,1,2,3,4
order_id,order_date,order_customer_id,order_status,order_status_length
1,2013-07-25 00:00:00.0,11599,closed,6
2,2013-07-25 00:00:00.0,256,pending_payment,15
3,2013-07-25 00:00:00.0,12111,complete,8
4,2013-07-25 00:00:00.0,8827,closed,6
5,2013-07-25 00:00:00.0,11318,complete,8
6,2013-07-25 00:00:00.0,7130,complete,8
7,2013-07-25 00:00:00.0,4530,complete,8
8,2013-07-25 00:00:00.0,2911,processing,10
9,2013-07-25 00:00:00.0,5657,pending_payment,15


### Extracting Data - substr and split

In [22]:
%%sparksql

SELECT substr('2013-07-25 00:00:00.0', 1, 4) AS result

0
result
2013


In [30]:
%%sparksql

SELECT order_id,
  substr(order_date, 1, 10) AS order_date,
  order_customer_id,
  order_status
FROM orders limit 5

0,1,2,3
order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE


In [24]:
#split converts delimited string into array.

In [31]:
%%sparksql
SELECT split('2013-07-25', '-') AS result

0
result
"['2013', '07', '25']"


In [32]:
%%sparksql

SELECT split('2013-07-25', '-')[1] AS result

0
result
07


In [33]:
%%sparksql

SELECT explode(split('2013-07-25', '-')) AS result

0
result
2013
07
25


### Trimming and Padding Functions

- ltrim is used to remove the spaces on the left side of the string.
- rtrim is used to remove the spaces on the right side of the string.
- trim is used to remove the spaces on both sides of the string.


In [34]:
%%sparksql

SELECT ltrim('     Hello World') AS result

0
result
Hello World


In [35]:
%%sparksql

SELECT rtrim('     Hello World       ') AS result

0
result
Hello World


In [36]:
%%sparksql

SELECT length(trim('     Hello World       ')) AS result

0
result
11


how to use padding to pad characters to a string.

- Let us assume that there are 3 fields - year, month and date which are of type integer.
- If we have to concatenate all the 3 fields and create a date, we might have to pad month and date with 0.
- lpad is used more often than rpad especially when we try to build the date from separate columns.


In [37]:
%%sparksql

SELECT 2013 AS year, 7 AS month, 25 AS myDate

0,1,2
year,month,myDate
2013,7,25


In [38]:
%%sparksql

SELECT lpad(7, 2, 0) AS result

0
result
07


In [39]:
%%sparksql

SELECT lpad(100, 2, 0) AS result

0
result
10


### Reverse and Concatenating multiple strings

- We can use reverse to reverse a string.
- We can concatenate multiple strings using concat and concat_ws.
- concat_ws is typically used if we want to have the same string between all the strings that are being concatenated.


In [40]:
%%sparksql

SELECT reverse('Hello World') AS result

0
result
dlroW olleH


In [41]:
%%sparksql

SELECT concat('Hello', 'World') AS result

0
result
HelloWorld


In [45]:
%%sparksql

SELECT concat('Order Status is ', order_status) AS result
FROM orders LIMIT 5

0
result
Order Status is CLOSED
Order Status is PENDING_PAYMENT
Order Status is COMPLETE
Order Status is CLOSED
Order Status is COMPLETE


In [46]:
%%sparksql

SELECT * FROM (SELECT 2013 AS year, 7 AS month, 25 AS myDate) q

0,1,2
year,month,myDate
2013,7,25


In [47]:
%%sparksql

SELECT concat(year, '-', lpad(month, 2, 0), '-',
              lpad(myDate, 2, 0)) AS order_date
FROM
    (SELECT 2013 AS year, 7 AS month, 25 AS myDate) q

0
order_date
2013-07-25


In [48]:
%%sparksql

SELECT concat_ws('-', year, lpad(month, 2, 0),
              lpad(myDate, 2, 0)) AS order_date
FROM
    (SELECT 2013 AS year, 7 AS month, 25 AS myDate) q

0
order_date
2013-07-25


## Date Manipulation Functions

- Getting Current Date and Timestamp
- Date Arithmetic such as date_add
- Getting beginning date or time using trunc or date_trunc
- Extracting information using date_format as well as calendar functions.
- Dealing with unix timestamp using from_unixtime, to_unix_timestamp


### Getting Current Date and Timestamp

- **current_date** is the function or operator which will return today’s date.
- **current_timestamp** is the function or operator which will return current time up to milliseconds.
- These are not like other functions and do not use *()* at the end.
- These are not listed as part of *SHOW functions* and we can get help using DESCRIBE.
- There is a format associated with date and timestamp.
- Date - yyyy-MM-dd
- Timestamp - yyyy-MM-dd HH:mm:ss.SSS
*Keep in mind that a date or timestamp in Spark SQL are nothing but special strings containing values using above specified formats. We can apply all string manipulation functions on date or timestamp.*


In [50]:
%%sparksql

SELECT current_date AS current_date

0
current_date
2023-03-29


In [51]:
%%sparksql

SELECT current_date() AS current_date

0
current_date
2023-03-29


In [52]:
%%sparksql

SELECT current_timestamp AS current_timestamp

0
current_timestamp
2023-03-29 03:08:54.591000


### Date Arithmetic

- **date_add** can be used to add or subtract days.
- **date_sub** can be used to subtract or add days.
- **datediff** can be used to get difference between 2 dates
- **add_months** can be used add months to a date


In [53]:
%%sparksql

SELECT date_add(current_date, 32) AS result

0
result
2023-04-30


In [54]:
%%sparksql

SELECT date_add('2018-04-15', 730) AS result

0
result
2020-04-14


In [59]:
%%sparksql

SELECT date_add('2020-04-14', -730) AS result

0
result
2018-04-15


In [60]:
%%sparksql

SELECT datediff('2019-03-30', '2017-12-31') AS result

0
result
454


In [62]:
%%sparksql

SELECT add_months(current_date, 3) AS result

0
result
2023-06-29


### Beginning Date or Time - trunc and date_trunc

- We can use MM to get beginning date of the month.
- YY can be used to get begining date of the year.
- We can apply trunc either on date or timestamp, however we cannot apply it other than month or year (such an hour or day).


In [63]:
%%sparksql

DESCRIBE FUNCTION trunc

0
function_desc
Function: trunc
Class: org.apache.spark.sql.catalyst.expressions.TruncDate
"Usage: trunc(date, fmt) - Returns `date` with the time portion of the day truncated to the unit specified by the format model `fmt`."


In [64]:
%%sparksql

SELECT trunc(current_date, 'MM') AS beginning_date_month

0
beginning_date_month
2023-03-01


In [65]:
%%sparksql

SELECT trunc('2019-01-23', 'MM') AS beginning_date_month

0
beginning_date_month
2019-01-01


In [66]:
%%sparksql

SELECT trunc(current_date, 'YY') AS beginning_date_year 

0
beginning_date_year
2023-01-01


#### Below does not work

In [70]:
%%sparksql

SELECT trunc(current_timestamp, 'HH') AS doesnt_work

0
doesnt_work
""


While **trunc** can be used to get beginning time of a given month or year, we can get the beginning time up to Second using date_trunc.

In [71]:
%%sparksql

SELECT date_trunc('HOUR', current_timestamp) AS hour_beginning

0
hour_beginning
2023-03-29 03:00:00


### Extracting information using date_format

In [74]:
%%sparksql
DESCRIBE FUNCTION date_format

0
function_desc
Function: date_format
Class: org.apache.spark.sql.catalyst.expressions.DateFormatClass
"Usage: date_format(timestamp, fmt) - Converts `timestamp` to a value of string in the format specified by the date format `fmt`."


In [75]:
%%sparksql
SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'yyyy') AS year

0,1
current_timestamp,year
2023-03-29 03:31:35.608000,2023


In [76]:
%%sparksql
SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'yy') AS year

0,1
current_timestamp,year
2023-03-29 03:32:06.909000,23


In [77]:
%%sparksql
SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'dd') AS day_of_month

0,1
current_timestamp,day_of_month
2023-03-29 03:32:29.325000,29


In [78]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'DD') AS day_of_year

0,1
current_timestamp,day_of_year
2023-03-29 03:32:49.102000,88


In [79]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'MMM') AS month_name

0,1
current_timestamp,month_name
2023-03-29 03:33:44.239000,Mar


In [80]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'MM') AS month_name

0,1
current_timestamp,month_name
2023-03-29 03:33:56.495000,03


In [81]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'EE') AS dayname

0,1
current_timestamp,dayname
2023-03-29 03:34:47.362000,Wed


In [82]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'EEEE') AS dayname

0,1
current_timestamp,dayname
2023-03-29 03:35:13.036000,Wednesday


Here is how we can get time related information such as hour, minute, seconds, milliseconds etc from timestamp.

In [83]:
%%sparksql
SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'HH') AS hour24

0,1
current_timestamp,hour24
2023-03-29 03:39:05.109000,03


In [84]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'hh') AS hour12

0,1
current_timestamp,hour12
2023-03-29 03:43:11.887000,03


In [89]:
%%sparksql

SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'mm') AS minutes

0,1
current_timestamp,minutes
2023-03-29 03:44:18.408000,44


In [88]:
%%sparksql
SELECT current_timestamp AS current_timestamp, 
    date_format(current_timestamp, 'ss') AS seconds

0,1
current_timestamp,seconds
2023-03-29 03:44:13.533000,13


In [90]:
%%sparksql

SELECT date_format(current_timestamp, 'yyyyMM') AS current_month

0
current_month
202303


In [91]:
%%sparksql

SELECT date_format(current_timestamp, 'yyyy/MM/dd') AS current_date

0
current_date
2023/03/29


### Extracting information - Calendar functions¶

- We can get year, month, day etc from date or timestamp using functions. 
- There are functions such as day, dayofmonth, month, weekofyear, year etc available for us.

In [93]:
spark.sql("DESCRIBE FUNCTION day").show(truncate=False)

+------------------------------------------------------------------+
|function_desc                                                     |
+------------------------------------------------------------------+
|Function: day                                                     |
|Class: org.apache.spark.sql.catalyst.expressions.DayOfMonth       |
|Usage: day(date) - Returns the day of month of the date/timestamp.|
+------------------------------------------------------------------+



In [94]:
spark.sql("DESCRIBE FUNCTION day").show(truncate=False)

+------------------------------------------------------------------+
|function_desc                                                     |
+------------------------------------------------------------------+
|Function: day                                                     |
|Class: org.apache.spark.sql.catalyst.expressions.DayOfMonth       |
|Usage: day(date) - Returns the day of month of the date/timestamp.|
+------------------------------------------------------------------+



In [95]:
spark.sql("DESCRIBE FUNCTION month").show(truncate=False)

+-----------------------------------------------------------------------+
|function_desc                                                          |
+-----------------------------------------------------------------------+
|Function: month                                                        |
|Class: org.apache.spark.sql.catalyst.expressions.Month                 |
|Usage: month(date) - Returns the month component of the date/timestamp.|
+-----------------------------------------------------------------------+



In [99]:
%%sparksql
DESCRIBE FUNCTION weekofyear

0
function_desc
Function: weekofyear
Class: org.apache.spark.sql.catalyst.expressions.WeekOfYear
Usage: weekofyear(date) - Returns the week of the year of the given date. A week is considered to start on a Monday and week 1 is the first week with >3 days.


In [98]:
spark.sql("DESCRIBE FUNCTION year").show(truncate=False)

+---------------------------------------------------------------------+
|function_desc                                                        |
+---------------------------------------------------------------------+
|Function: year                                                       |
|Class: org.apache.spark.sql.catalyst.expressions.Year                |
|Usage: year(date) - Returns the year component of the date/timestamp.|
+---------------------------------------------------------------------+



In [100]:
%%sparksql

SELECT year(current_date) AS year

0
year
2023


In [103]:
%%sparksql

SELECT month(current_date) AS month

0
month
3


In [104]:
%%sparksql

SELECT weekofyear(current_date) AS weekofyear

0
weekofyear
13


In [105]:
%%sparksql

SELECT day(current_date) AS day

0
day
29


In [106]:
%%sparksql

SELECT dayofmonth(current_date) AS dayofmonth

0
dayofmonth
29


### Dealing with Unix Timestamp¶

- **from_unixtime** can be used to convert Unix epoch to regular timestamp.
- **unix_timestamp** or **to_unix_timestamp** can be used to convert timestamp to Unix epoch.
- We can get Unix epoch or Unix timestamp by running date **+%s** in Unix/Linux terminal
- We can DESCRIBE on the above functions to get details about them.


In [107]:
%%sparksql

SELECT from_unixtime(1556662731) AS timestamp

0
timestamp
2019-04-30 22:18:51


In [108]:
%%sparksql

SELECT to_unix_timestamp('2019-04-30 18:18:51') AS unixtime

0
unixtime
1556648331


In [109]:
%%sparksql

SELECT from_unixtime(1556662731, 'yyyyMM') AS month

0
month
201904


In [110]:
%%sparksql

SELECT from_unixtime(1556662731, 'yyyy-MM-dd') AS date

0
date
2019-04-30


In [111]:
%%sparksql

SELECT from_unixtime(1556662731, 'yyyy-MM-dd HH:mm') AS timestamp

0
timestamp
2019-04-30 22:18


In [112]:
%%sparksql

SELECT from_unixtime(1556662731, 'yyyy-MM-dd hh:mm') AS timestamp

0
timestamp
2019-04-30 10:18


In [115]:
%%sparksql

SELECT to_unix_timestamp('20190430', 'yyyyMMdd') AS date

0
date
1556582400


### Overview of Numeric Functions

- **abs** - always return positive number
- **sum, avg**
- **round** - rounds off to specified precision
- **ceil, floor** - always return integer.
- **greatest**
- **min, max**
- **rand**
- **pow, sqrt**
- **cumedist, stddev, variance**


In [4]:
%%sparksql

SELECT abs(-10.5), abs(10)

                                                                                

0,1
abs(-10.5),abs(10)
10.5,10


In [6]:
%%sparksql
Describe function abs

0
function_desc
Function: abs
Class: org.apache.spark.sql.catalyst.expressions.Abs
Usage: abs(expr) - Returns the absolute value of the numeric or interval value.


In [9]:
%%sparksql

use kevin_retail



In [10]:
%%sparksql
show tables

0,1,2
namespace,tableName,isTemporary
kevin_retail,dual,False
kevin_retail,order_items,False
kevin_retail,order_items_stage,False
kevin_retail,orders,False
kevin_retail,orders_part,False


In [11]:
%%sparksql

SELECT order_item_order_id, order_item_subtotal FROM order_items
WHERE order_item_order_id = 2

                                                                                

0,1
order_item_order_id,order_item_subtotal
2,199.99000549316406
2,250.0
2,129.99000549316406


In [12]:
%%sparksql

SELECT avg(order_item_subtotal) AS order_revenue_avg FROM order_items
WHERE order_item_order_id = 2

                                                                                

0
order_revenue_avg
193.32667032877603


In [15]:

%%sparksql
SELECT order_item_order_id, 
    avg(order_item_subtotal) AS order_revenue_avg 
FROM order_items
GROUP BY order_item_order_id
order by order_item_order_id desc
LIMIT 10

                                                                                

0,1
order_item_order_id,order_revenue_avg
68883,1074.9949951171875
68882,54.9950008392334
68881,129.99000549316406
68880,199.9540008544922
68879,419.99000040690106
68878,184.98250579833984
68875,1199.9749908447266
68873,171.98200302124025
68871,249.99000549316406


In [16]:
%%sparksql

SELECT
    round(10.58) rnd,
    floor(10.58) flr,
    ceil(10.58) cl

0,1,2
rnd,flr,cl
11,10,11


In [17]:
%%sparksql

SELECT
    round(10.44) rnd1,
    round(10.44, 1) rnd1,
    round(10.46, 1) rnd2,
    floor(10.44) flr,
    ceil(10.44) cl

0,1,2,3,4
rnd1,rnd1,rnd2,flr,cl
10,10.4,10.5,10,11


In [24]:
%%sparksql

SELECT
    round(10.44) rnd1,
    round(10.44, 1) rnd1,
    round(10.46, 2) rnd2,
    floor(10.44) flr,
    ceil(10.44) cl

0,1,2,3,4
rnd1,rnd1,rnd2,flr,cl
10,10.4,10.46,10,11


In [19]:
%%sparksql

SELECT avg(order_item_subtotal) AS order_revenue_avg FROM order_items
WHERE order_item_order_id = 2

                                                                                

0
order_revenue_avg
193.32667032877603


In [20]:
%%sparksql

SELECT round(avg(order_item_subtotal), 2) AS order_revenue_avg 
FROM order_items
WHERE order_item_order_id = 2

0
order_revenue_avg
193.33


In [25]:
%%sparksql

SELECT order_item_order_id, 
    round(avg(order_item_subtotal), 2) AS order_revenue_avg 
FROM order_items
GROUP BY order_item_order_id
LIMIT 10

                                                                                

0,1
order_item_order_id,order_revenue_avg
148,160.0
463,207.48
471,84.99
496,88.39
1088,124.99
1580,299.95
1591,146.62
1645,301.96
2366,299.97


In [26]:
%%sparksql

SELECT order_item_order_id, 
    round(sum(order_item_subtotal), 2) AS order_revenue_sum
FROM order_items
GROUP BY order_item_order_id
LIMIT 10

                                                                                

0,1
order_item_order_id,order_revenue_sum
35351,629.93
35361,614.88
35689,799.88
35694,889.94
35820,949.96
35912,799.96
35947,519.96
35982,1201.82
36131,554.95


In [28]:
%%sparksql

SELECT greatest(10, 11, 13, -13)

0
"greatest(10, 11, 13, -13)"
13


In [29]:
%%sparksql

SELECT rand() AS rand

0
rand
0.17234539226299006


In [30]:
%%sparksql

SELECT cast(round(rand() * 1) AS int) AS random_int

0
random_int
1


In [33]:
from pyspark.sql.functions import explode

In [39]:
%%sparksql

SELECT order_item_order_id, 
    round(sum(order_item_subtotal), 2) AS order_revenue_sum,
    min(order_item_subtotal) AS order_item_subtotal_min,
    max(order_item_subtotal) AS order_item_subtotal_max 
FROM order_items
GROUP BY order_item_order_id
LIMIT 10

0,1,2,3
order_item_order_id,order_revenue_sum,order_item_subtotal_min,order_item_subtotal_max
35351,629.93,129.99000549316406,239.9600067138672
35361,614.88,49.97999954223633,179.97000122070312
35689,799.88,150.0,399.9800109863281
35694,889.94,39.9900016784668,299.9800109863281
35820,949.96,199.99000549316406,299.9800109863281
35912,799.96,199.99000549316406,199.99000549316406
35947,519.96,119.9800033569336,399.9800109863281
35982,1201.82,31.989999771118164,399.9800109863281
36131,554.95,44.970001220703125,250.0


In [42]:
%%sparksql

SELECT order_item_order_id, order_item_subtotal
FROM order_items
WHERE order_item_order_id = 2

0,1
order_item_order_id,order_item_subtotal
2,199.99000549316406
2,250.0
2,129.99000549316406


In [43]:
%%sparksql

SELECT round(sum(order_item_subtotal), 2) AS order_revenue_sum,
    min(order_item_subtotal) AS order_item_subtotal_min,
    max(order_item_subtotal) AS order_item_subtotal_max 
FROM order_items
WHERE order_item_order_id = 2

0,1,2
order_revenue_sum,order_item_subtotal_min,order_item_subtotal_max
579.98,129.99000549316406,250.0


### Data Type Conversion

In [44]:
%%sparksql

SELECT current_date AS current_date

0
current_date
2023-03-30


In [45]:
%%sparksql

SELECT split(current_date, '-')[1] AS month

0
month
03


In [49]:
%%sparksql

SELECT split(current_date, '-') AS month

0
month
"['2023', '03', '30']"


In [50]:
%%sparksql

SELECT cast(split(current_date, '-')[1] AS INT) AS month

0
month
3


In [51]:
%%sparksql

SELECT cast('0.04' AS FLOAT) AS result

0
result
0.03999999910593033


In [52]:
%%sparksql

SELECT cast('0.04' AS INT) AS zero

0
zero
0


In [53]:
%%sparksql

SELECT cast('xyz' AS INT) AS returns_null

0
returns_null
""


In [54]:
%%sparksql

CREATE EXTERNAL TABLE IF NOT EXISTS orders_single_column (s STRING
) LOCATION '/user/hive/warehouse/kevin_retail.db/orders'

In [55]:
%%sparksql

SELECT * FROM orders_single_column LIMIT 10

                                                                                

0
s
"1,2013-07-25 00:00:00.0,11599,CLOSED"
"2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT"
"3,2013-07-25 00:00:00.0,12111,COMPLETE"
"4,2013-07-25 00:00:00.0,8827,CLOSED"
"5,2013-07-25 00:00:00.0,11318,COMPLETE"
"6,2013-07-25 00:00:00.0,7130,COMPLETE"
"7,2013-07-25 00:00:00.0,4530,COMPLETE"
"8,2013-07-25 00:00:00.0,2911,PROCESSING"
"9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT"


In [57]:
%%sparksql

SELECT split(s, ',')[0] AS order_id,
    split(s, ',')[1] AS order_date,
    split(s, ',')[2] AS order_customer_id,
    split(s, ',')[3] AS order_status
FROM orders_single_column LIMIT 10

                                                                                

0,1,2,3
order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT


In [58]:
%%sparksql

SELECT cast(split(s, ',')[0] AS INT) AS order_id,
    cast(split(s, ',')[1] AS TIMESTAMP) AS order_date,
    cast(split(s, ',')[2] AS INT) AS order_customer_id,
    cast(split(s, ',')[3] AS STRING) AS order_status
FROM orders_single_column LIMIT 10

0,1,2,3
order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00,11599,CLOSED
2,2013-07-25 00:00:00,256,PENDING_PAYMENT
3,2013-07-25 00:00:00,12111,COMPLETE
4,2013-07-25 00:00:00,8827,CLOSED
5,2013-07-25 00:00:00,11318,COMPLETE
6,2013-07-25 00:00:00,7130,COMPLETE
7,2013-07-25 00:00:00,4530,COMPLETE
8,2013-07-25 00:00:00,2911,PROCESSING
9,2013-07-25 00:00:00,5657,PENDING_PAYMENT


### Handling NULL Values


- By default if we try to add or concatenate null to another column or expression or literal, it will return null.
- If we want to replace null with some default value, we can use **nvl**. For not null values, nvl returns the original expression value.
  - Replace commission_pct with 0 if it is null.
  - We can also use **coalesce** in the place of **nvl**.
- **coalesce** returns first not null value if we pass multiple arguments to it.
- **nvl2** can be used to perform one action when the value is not null and some other action when the value is null.
  -  We want to increase commission_pct by 1 if it is not null and set commission_pct to 2 if it is null.
- We can also use **CASE WHEN ELSE END** for any conditional logic.


In [59]:
%%sparksql

SELECT 1 + NULL AS result

0
result
""


In [60]:
%%sparksql

SELECT concat('Hello', NULL) AS result

0
result
""


In [61]:
%%sparksql

SELECT nvl(1, 0) nvl, coalesce(1, 0) AS coalesce

0,1
nvl,coalesce
1,1


In [62]:
%%sparksql

SELECT coalesce(NULL, NULL, 2, NULL, 3) AS result

0
result
2


In [63]:
%%sparksql

SELECT nvl(NULL, NULL, 2, NULL, 3) AS result

AnalysisException: Invalid number of arguments for function nvl. Expected: 2; Found: 5; line 2 pos 7

In [64]:
%%sparksql

CREATE TABLE IF NOT EXISTS sales(
    sales_person_id INT,
    sales_amount FLOAT,
    commission_pct INT
)

In [65]:
%%sparksql

INSERT INTO sales VALUES
    (1, 1000, 10),
    (2, 1500, 8),
    (3, 500, NULL),
    (4, 800, 5),
    (5, 250, NULL)

                                                                                

In [66]:
%%sparksql

SELECT * FROM sales

0,1,2
sales_person_id,sales_amount,commission_pct
1,1000.0,10
2,1500.0,8
3,500.0,
4,800.0,5
5,250.0,


In [67]:
%%sparksql

SELECT s.*, 
    nvl(commission_pct, 0) AS commission_pct
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_pct
1,1000.0,10,10
2,1500.0,8,8
3,500.0,,0
4,800.0,5,5
5,250.0,,0


In [68]:
%%sparksql

SELECT s.*, 
    coalesce(commission_pct, 0) AS commission_pct
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_pct
1,1000.0,10,10
2,1500.0,8,8
3,500.0,,0
4,800.0,5,5
5,250.0,,0


In [69]:
%%sparksql

SELECT s.*, 
    round(sales_amount * commission_pct / 100, 2) AS incorrect_commission_amount
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,incorrect_commission_amount
1,1000.0,10,100.0
2,1500.0,8,120.0
3,500.0,,
4,800.0,5,40.0
5,250.0,,


In [70]:
%%sparksql

SELECT s.*, 
    round(sales_amount * nvl(commission_pct, 0) / 100, 2) AS commission_amount
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_amount
1,1000.0,10,100.0
2,1500.0,8,120.0
3,500.0,,0.0
4,800.0,5,40.0
5,250.0,,0.0


In [71]:
%%sparksql

SELECT s.*, 
    round(sales_amount * coalesce(commission_pct, 0) / 100, 2) AS commission_amount
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_amount
1,1000.0,10,100.0
2,1500.0,8,120.0
3,500.0,,0.0
4,800.0,5,40.0
5,250.0,,0.0


In [72]:
%%sparksql

SELECT s.*, 
    nvl2(commission_pct, commission_pct + 1, 2) AS commission_pct
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_pct
1,1000.0,10,11
2,1500.0,8,9
3,500.0,,2
4,800.0,5,6
5,250.0,,2


In [73]:
%%sparksql

SELECT s.*, 
    round(sales_amount * nvl2(commission_pct, commission_pct + 1, 2) / 100, 2) AS commission_amount
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_amount
1,1000.0,10,110.0
2,1500.0,8,135.0
3,500.0,,10.0
4,800.0,5,48.0
5,250.0,,5.0


In [74]:
%%sparksql

SELECT s.*, 
    CASE WHEN commission_pct IS NULL 
        THEN 2
        ELSE commission_pct + 1
    END AS commission_pct
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_pct
1,1000.0,10,11
2,1500.0,8,9
3,500.0,,2
4,800.0,5,6
5,250.0,,2


In [76]:
%%sparksql

SELECT s.*, 
    CASE WHEN commission_pct IS NOT NULL 
        THEN commission_pct + 1
        ELSE 2
    END AS commission_pct
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_pct
1,1000.0,10,11
2,1500.0,8,9
3,500.0,,2
4,800.0,5,6
5,250.0,,2


In [22]:
%%sparksql

SELECT s.*, 
    CASE 
    WHEN commission_pct IS NULL 
        THEN round((sales_amount * 2 / 100), 2)
        ELSE round((sales_amount * (commission_pct + 1)/ 100), 2)
    END AS commission_amount
FROM sales AS s

0,1,2,3
sales_person_id,sales_amount,commission_pct,commission_amount
1,1000.0,10,110.0
2,1500.0,8,135.0
3,500.0,,10.0
4,800.0,5,48.0
5,250.0,,5.0


## Using CASE and WHEN


- We can use **CASE** and **WHEN** for that.
- Let us implement this conditional logic to come up with derived order_status.
    - If order_status is COMPLETE or CLOSED, set COMPLETED
    - If order_status have PENDING in it, then we will say PENDING
    - If order_status have PROCESSING or PAYMENT_REVIEW in it, then we will say PENDING
    - We will set all others as OTHER
- We can also have **ELSE** as part of **CASE and WHEN**.


In [78]:
%%sparksql

SELECT DISTINCT order_status FROM orders LIMIT 10

                                                                                

0
order_status
PENDING_PAYMENT
COMPLETE
ON_HOLD
PAYMENT_REVIEW
PROCESSING
CLOSED
SUSPECTED_FRAUD
PENDING
CANCELED


In [80]:
%%sparksql
SELECT o.*,
    CASE 
    WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'
    END AS updated_order_status
FROM orders o
LIMIT 10

0,1,2,3,4
order_id,order_date,order_customer_id,order_status,updated_order_status
1,2013-07-25 00:00:00.0,11599,CLOSED,COMPLETED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT,
3,2013-07-25 00:00:00.0,12111,COMPLETE,COMPLETED
4,2013-07-25 00:00:00.0,8827,CLOSED,COMPLETED
5,2013-07-25 00:00:00.0,11318,COMPLETE,COMPLETED
6,2013-07-25 00:00:00.0,7130,COMPLETE,COMPLETED
7,2013-07-25 00:00:00.0,4530,COMPLETE,COMPLETED
8,2013-07-25 00:00:00.0,2911,PROCESSING,
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT,


In [81]:
%%sparksql
SELECT o.*,
    CASE 
    WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'
    ELSE order_status
    END AS updated_order_status
FROM orders o
LIMIT 10

0,1,2,3,4
order_id,order_date,order_customer_id,order_status,updated_order_status
1,2013-07-25 00:00:00.0,11599,CLOSED,COMPLETED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE,COMPLETED
4,2013-07-25 00:00:00.0,8827,CLOSED,COMPLETED
5,2013-07-25 00:00:00.0,11318,COMPLETE,COMPLETED
6,2013-07-25 00:00:00.0,7130,COMPLETE,COMPLETED
7,2013-07-25 00:00:00.0,4530,COMPLETE,COMPLETED
8,2013-07-25 00:00:00.0,2911,PROCESSING,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT,PENDING_PAYMENT


In [82]:
%%sparksql

SELECT o.*,
    CASE 
        WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'
        WHEN order_status LIKE '%PENDING%' THEN 'PENDING'
        ELSE 'OTHER'
    END AS updated_order_status
FROM orders o
LIMIT 10

0,1,2,3,4
order_id,order_date,order_customer_id,order_status,updated_order_status
1,2013-07-25 00:00:00.0,11599,CLOSED,COMPLETED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT,PENDING
3,2013-07-25 00:00:00.0,12111,COMPLETE,COMPLETED
4,2013-07-25 00:00:00.0,8827,CLOSED,COMPLETED
5,2013-07-25 00:00:00.0,11318,COMPLETE,COMPLETED
6,2013-07-25 00:00:00.0,7130,COMPLETE,COMPLETED
7,2013-07-25 00:00:00.0,4530,COMPLETE,COMPLETED
8,2013-07-25 00:00:00.0,2911,PROCESSING,OTHER
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT,PENDING


In [83]:
%%sparksql

SELECT o.*,
    CASE 
        WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'
        WHEN order_status LIKE '%PENDING%' OR order_status IN ('PROCESSING', 'PAYMENT_REVIEW')
            THEN 'PENDING'
        ELSE 'OTHER'
    END AS updated_order_status
FROM orders o
LIMIT 10

0,1,2,3,4
order_id,order_date,order_customer_id,order_status,updated_order_status
1,2013-07-25 00:00:00.0,11599,CLOSED,COMPLETED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT,PENDING
3,2013-07-25 00:00:00.0,12111,COMPLETE,COMPLETED
4,2013-07-25 00:00:00.0,8827,CLOSED,COMPLETED
5,2013-07-25 00:00:00.0,11318,COMPLETE,COMPLETED
6,2013-07-25 00:00:00.0,7130,COMPLETE,COMPLETED
7,2013-07-25 00:00:00.0,4530,COMPLETE,COMPLETED
8,2013-07-25 00:00:00.0,2911,PROCESSING,PENDING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT,PENDING


In [84]:
%%sparksql

SELECT DISTINCT order_status,
    CASE 
        WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'
        WHEN order_status LIKE '%PENDING%' OR order_status IN ('PROCESSING', 'PAYMENT_REVIEW')
            THEN 'PENDING'
        ELSE 'OTHER'
    END AS updated_order_status
FROM orders
ORDER BY updated_order_status

0,1
order_status,updated_order_status
COMPLETE,COMPLETED
CLOSED,COMPLETED
CANCELED,OTHER
SUSPECTED_FRAUD,OTHER
ON_HOLD,OTHER
PENDING_PAYMENT,PENDING
PENDING,PENDING
PAYMENT_REVIEW,PENDING
PROCESSING,PENDING


### Query Example - Word Count

- Create table by name lines.
- Insert data into the table.
- Split lines into array of words.
- Explode array of words from each line into individual records.
- Use group by and get the count. We cannot use GROUP BY directly on exploded records and hence we need to use nested sub query.


In [24]:
%%sparksql

DROP DATABASE IF EXISTS wordCount_demo CASCADE

In [25]:
%%sparksql
CREATE DATABASE IF NOT EXISTS wordCount_demo

In [27]:
%%sparksql

USE wordCount_demo

In [29]:
%%sparksql

CREATE TABLE lines (s STRING)

In [30]:
%%sparksql

INSERT INTO lines VALUES
  ('Hello World'),
  ('How are you'),
  ('Let us perform the word count'),
  ('The definition of word count is'),
  ('to get the count of each word from this data')

In [31]:
%%sparksql

SELECT * FROM lines

0
s
Hello World
How are you
Let us perform the word count
The definition of word count is
to get the count of each word from this data


In [32]:
%%sparksql

SELECT split(s, ' ') AS word_array FROM lines

0
word_array
"['Hello', 'World']"
"['How', 'are', 'you']"
"['Let', 'us', 'perform', 'the', 'word', 'count']"
"['The', 'definition', 'of', 'word', 'count', 'is']"
"['to', 'get', 'the', 'count', 'of', 'each', 'word', 'from', 'this', 'data']"


In [33]:
%%sparksql

SELECT * FROM lines

0
s
Hello World
How are you
Let us perform the word count
The definition of word count is
to get the count of each word from this data


In [34]:
%%sparksql

SELECT split(s, ' ') AS word_array FROM lines

0
word_array
"['Hello', 'World']"
"['How', 'are', 'you']"
"['Let', 'us', 'perform', 'the', 'word', 'count']"
"['The', 'definition', 'of', 'word', 'count', 'is']"
"['to', 'get', 'the', 'count', 'of', 'each', 'word', 'from', 'this', 'data']"


In [35]:
%%sparksql

SELECT explode(split(s, ' ')) AS words FROM lines

only showing top 20 row(s)


0
words
Hello
World
How
are
you
Let
us
perform
the


In [36]:
%%sparksql

SELECT count(1) FROM (SELECT explode(split(s, ' ')) AS words FROM lines)

0
count(1)
27


In [37]:
%%sparksql

SELECT explode(split(s, ' ')) AS words, count(1) FROM lines
GROUP BY explode(split(s, ' '))

AnalysisException: The generator is not supported: outside the SELECT clause, found: Aggregate [explode(split(s#131,  , -1))], [split(s#131,  , -1) AS _gen_input_0#133, count(1) AS count(1)#132L]

In [41]:
%%sparksql

SELECT initcap(word), count(1) FROM (
  SELECT explode(split(s, ' ')) AS word FROM lines
) q
GROUP BY word
order by word asc

only showing top 20 row(s)


0,1
initcap(word),count(1)
Hello,1
How,1
Let,1
The,1
World,1
Are,1
Count,3
Data,1
Definition,1


In [43]:
%%sparksql

SELECT count(1) FROM
(
    SELECT word, count(1) FROM (
        SELECT explode(split(s, ' ')) AS word FROM lines
    ) q
    GROUP BY word
)

0
count(1)
21
