In [2]:
import findspark
findspark.init()

In [4]:
import pyspark
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) the session: YARN master in client deploy mode,
# with 2g of memory configured for both the executors and the driver.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("ch06")
    .config("spark.submit.deployMode", "client")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/12 07:08:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/12 07:08:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/12 07:08:44 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [6]:
spark.sparkContext

In [8]:
!hdfs dfs -ls /data/retail-data/by-day/2010-12-01.csv

-rw-r--r--   2 hadoop supergroup     275001 2025-05-10 14:57 /data/retail-data/by-day/2010-12-01.csv


In [9]:
# Load one day of retail data from HDFS: the first line is the header
# and column types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("hdfs:///data/retail-data/by-day/2010-12-01.csv")
)
df.show(2)

                                                                                

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [22]:
# create sql table
df.createOrReplaceTempView("retail_data")

In [10]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [13]:
df.schema

StructType([StructField('InvoiceNo', StringType(), True), StructField('StockCode', StringType(), True), StructField('Description', StringType(), True), StructField('Quantity', IntegerType(), True), StructField('InvoiceDate', TimestampType(), True), StructField('UnitPrice', DoubleType(), True), StructField('CustomerID', DoubleType(), True), StructField('Country', StringType(), True)])

In [14]:
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

#### Converting to Spark Types

In [18]:
from pyspark.sql.functions import lit

In [21]:
df.select(lit(1), lit("one")).show(2)

+---+---+
|  1|one|
+---+---+
|  1|one|
|  1|one|
+---+---+
only showing top 2 rows



#### Working With Booleans

In [27]:
# InvoiceNo is inferred as a string column and holds non-numeric values, so it is
# compared against a string literal. An integer literal would force a string-to-int
# cast that turns non-numeric invoice numbers into NULL and silently drops those rows.
df.where("InvoiceNo != '536563'").select("InvoiceNo", "Description").show(2, False)

+---------+----------------------------------+
|InvoiceNo|Description                       |
+---------+----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER|
|536365   |WHITE METAL LANTERN               |
+---------+----------------------------------+
only showing top 2 rows



In [31]:
# Same filter as the DataFrame version. InvoiceNo is a string column, so the literal
# is quoted to avoid an implicit string-to-int cast that would NULL out (and drop)
# non-numeric invoice numbers.
spark.sql("""
    select InvoiceNo, Description
    from retail_data
    where InvoiceNo != '536563'
""").show(2)

+---------+--------------------+
|InvoiceNo|         Description|
+---------+--------------------+
|   536365|WHITE HANGING HEA...|
|   536365| WHITE METAL LANTERN|
+---------+--------------------+
only showing top 2 rows



In [29]:
from pyspark.sql.functions import instr, col, expr

In [32]:
priceFilter = col("UnitPrice") >  600
descFilter = instr(col("Description"), "POSTAGE") >= 1

df.where(priceFilter | descFilter) \
    .select("InvoiceNo", "UnitPrice", "Description") \
    .show(3, False)

+---------+---------+-----------+
|InvoiceNo|UnitPrice|Description|
+---------+---------+-----------+
|536370   |18.0     |POSTAGE    |
|536403   |15.0     |POSTAGE    |
|536527   |18.0     |POSTAGE    |
+---------+---------+-----------+
only showing top 3 rows



In [36]:
spark.sql("""
    select InvoiceNo, UnitPrice, Description
    from retail_data
    where UnitPrice > 600 or instr(Description, "POSTAGE") >=1
""").show(3, False)

+---------+---------+-----------+
|InvoiceNo|UnitPrice|Description|
+---------+---------+-----------+
|536370   |18.0     |POSTAGE    |
|536403   |15.0     |POSTAGE    |
|536527   |18.0     |POSTAGE    |
+---------+---------+-----------+
only showing top 3 rows



In [52]:
# Add a boolean flag: StockCode is "DOT" and the row matches the price or the
# description filter; then keep only the flagged rows.
(
    df.withColumn(
        "isexpensive",
        (col("StockCode") == "DOT") & (priceFilter | descFilter),
    )
    .where("isexpensive")
    .select("UnitPrice", "isexpensive")
    .show(2)
)

+---------+-----------+
|UnitPrice|isexpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [54]:
spark.sql("""
    with isexpensive as (
          select *, (StockCode="DOT") AND ((UnitPrice > 600) or (Description like '%POSTAGE%')) as isexpensive
          from retail_data
    )
    select *
    from isexpensive
    where isexpensive
""").show(2)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isexpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



#### Working With Numbers

In [79]:
from pyspark.sql.functions import pow, round, bround, corr

In [57]:
fabricatedQuantity = (pow(col("Quantity") * col("UnitPrice"), 2) + 5).alias("realQuantity")
df.select("CustomerID", fabricatedQuantity).show(2)

+----------+------------------+
|CustomerID|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [58]:
spark.sql("""
    select CustomerID, pow(Quantity * UnitPrice, 2) + 5 as realQuantity
    from retail_data 
""").show(2)

+----------+------------------+
|CustomerID|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [69]:
# Show round and bround side by side on UnitPrice, plus bround on two literals
# (note that bround(2.5) comes back as 2.0).
df.select(
    "UnitPrice",
    round("UnitPrice"),
    round("UnitPrice", 2),
    bround("UnitPrice"),
    bround(lit(2.5)),
    bround(lit(2.6)),
).show(2)

+---------+-------------------+-------------------+--------------------+--------------+--------------+
|UnitPrice|round(UnitPrice, 0)|round(UnitPrice, 2)|bround(UnitPrice, 0)|bround(2.5, 0)|bround(2.6, 0)|
+---------+-------------------+-------------------+--------------------+--------------+--------------+
|     2.55|                3.0|               2.55|                 3.0|           2.0|           3.0|
|     3.39|                3.0|               3.39|                 3.0|           2.0|           3.0|
+---------+-------------------+-------------------+--------------------+--------------+--------------+
only showing top 2 rows



In [64]:
spark.sql("""
    select round(UnitPrice), round(UnitPrice, 2), bround(UnitPrice)
    from retail_data
""").show(2)

+-------------------+-------------------+--------------------+
|round(UnitPrice, 0)|round(UnitPrice, 2)|bround(UnitPrice, 0)|
+-------------------+-------------------+--------------------+
|                3.0|               2.55|                 3.0|
|                3.0|               3.39|                 3.0|
+-------------------+-------------------+--------------------+
only showing top 2 rows



In [70]:
# Pearson correlation coefficient

In [78]:
df.stat.corr("UnitPrice", "Quantity")

-0.04112314436835552

In [80]:
df.select(corr("UnitPrice", "Quantity")).show()

+-------------------------+
|corr(UnitPrice, Quantity)|
+-------------------------+
|     -0.04112314436835552|
+-------------------------+



In [84]:
df.describe().show()

[Stage 60:>                                                         (0 + 1) / 1]

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                NULL| 8.627413127413128| 4.151946589446603|15661.388719512195|          NULL|
| stddev|72.89447869788873|17407.897548583845|                NULL|26.371821677029203|15.638659854603892|1854.4496996893627|          NULL|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

                                                                                

In [90]:
from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id().alias("sl_no"), "InvoiceNo").show(5)

+-----+---------+
|sl_no|InvoiceNo|
+-----+---------+
|    0|   536365|
|    1|   536365|
|    2|   536365|
|    3|   536365|
|    4|   536365|
+-----+---------+
only showing top 5 rows



In [96]:
spark.sql("""
    select row_number() over(order by CustomerID desc) as row_no, CustomerID
    from retail_data
""").show(5)

25/05/12 11:13:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/12 11:13:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/12 11:13:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+----------+
|row_no|CustomerID|
+------+----------+
|     1|   18229.0|
|     2|   18229.0|
|     3|   18229.0|
|     4|   18229.0|
|     5|   18229.0|
+------+----------+
only showing top 5 rows



#### Working With Strings

In [105]:
from pyspark.sql.functions import initcap, lower, upper
from pyspark.sql.functions import trim, ltrim, rtrim, lpad, rpad

In [103]:
# Case conversions: capitalise each word of Description, and lower/upper-case StockCode.
df.select(
    "CustomerID",
    initcap("Description"),
    lower("StockCode"),
    upper("StockCode"),
).show(2, False)

+----------+----------------------------------+----------------+----------------+
|CustomerID|initcap(Description)              |lower(StockCode)|upper(StockCode)|
+----------+----------------------------------+----------------+----------------+
|17850.0   |White Hanging Heart T-light Holder|85123a          |85123A          |
|17850.0   |White Metal Lantern               |71053           |71053           |
+----------+----------------------------------+----------------+----------------+
only showing top 2 rows



In [108]:
df.select(
    trim(lit("     Hello  coco    ")),
    ltrim(lit("     Hello  coco    ")),
    rtrim(lit("     Hello  coco    ")),
    lpad(lit("HELLO"), 10, "*"),
    rpad(lit("HELLO"), 10 ,"*")
).show(2)

+--------------------------+---------------------------+---------------------------+------------------+------------------+
|trim(     Hello  coco    )|ltrim(     Hello  coco    )|rtrim(     Hello  coco    )|lpad(HELLO, 10, *)|rpad(HELLO, 10, *)|
+--------------------------+---------------------------+---------------------------+------------------+------------------+
|               Hello  coco|            Hello  coco    |                Hello  coco|        *****HELLO|        HELLO*****|
|               Hello  coco|            Hello  coco    |                Hello  coco|        *****HELLO|        HELLO*****|
+--------------------------+---------------------------+---------------------------+------------------+------------------+
only showing top 2 rows



In [114]:
spark.sql("""
    select 
        trim("     Hello  coco    "),
        ltrim("     Hello  coco    "),
        rtrim("     Hello  coco    "),
        lpad("HELLO", 10, "*"),
        rpad("HELLO", 10 ,"*")
""").show()

+--------------------------+---------------------------+---------------------------+------------------+------------------+
|trim(     Hello  coco    )|ltrim(     Hello  coco    )|rtrim(     Hello  coco    )|lpad(HELLO, 10, *)|rpad(HELLO, 10, *)|
+--------------------------+---------------------------+---------------------------+------------------+------------------+
|               Hello  coco|            Hello  coco    |                Hello  coco|        *****HELLO|        HELLO*****|
+--------------------------+---------------------------+---------------------------+------------------+------------------+



#### Regular Expression

In [134]:
from pyspark.sql.functions import regexp_replace, regexp_extract, regexp_substr

In [None]:
df.select("CustomerID", regexp_replace("Description", "BLACK|WHITE|RED|BLUE|GREEN", "COLOR").alias("Description")).show(2, False)

+----------+----------------------------------+
|CustomerID|Description                       |
+----------+----------------------------------+
|17850.0   |COLOR HANGING HEART T-LIGHT HOLDER|
|17850.0   |COLOR METAL LANTERN               |
+----------+----------------------------------+
only showing top 2 rows



In [None]:
spark.sql("""
    select CustomerID, regexp_replace(Description, "BLACK|WHITE|RED|BLUE|GREEN", "COLOR") as Description
    from retail_data
""").show(2, False)

+----------+----------------------------------+
|CustomerID|Description                       |
+----------+----------------------------------+
|17850.0   |COLOR HANGING HEART T-LIGHT HOLDER|
|17850.0   |COLOR METAL LANTERN               |
+----------+----------------------------------+
only showing top 2 rows



In [None]:
# character level replacement
from pyspark.sql.functions import translate
df.select("Description", translate("Description", "LET", "*$@")).show(2, False)

# here L -> *, E -> $ , T-> @

+----------------------------------+----------------------------------+
|Description                       |translate(Description, LET, *$@)  |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|WHI@$ HANGING H$AR@ @-*IGH@ HO*D$R|
|WHITE METAL LANTERN               |WHI@$ M$@A* *AN@$RN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



In [128]:
spark.sql("""
    select Description, translate(Description, "LET", "*$@")
    from retail_data 
""").show(2, False)

+----------------------------------+----------------------------------+
|Description                       |translate(Description, LET, *$@)  |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|WHI@$ HANGING H$AR@ @-*IGH@ HO*D$R|
|WHITE METAL LANTERN               |WHI@$ M$@A* *AN@$RN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



In [143]:
# regexp_extract

df.select("Description", regexp_extract("Description", "(BLACK|WHITE|RED|BLUE|GREEN)", 1).alias("color")).show(5, False)

+-----------------------------------+-----+
|Description                        |color|
+-----------------------------------+-----+
|WHITE HANGING HEART T-LIGHT HOLDER |WHITE|
|WHITE METAL LANTERN                |WHITE|
|CREAM CUPID HEARTS COAT HANGER     |     |
|KNITTED UNION FLAG HOT WATER BOTTLE|     |
|RED WOOLLY HOTTIE WHITE HEART.     |RED  |
+-----------------------------------+-----+
only showing top 5 rows



In [141]:
# regexp_substr
df.select("Description", regexp_substr("Description", lit("BLACK|WHITE|RED|BLUE|GREEN")).alias("color")).show(5, False)

+-----------------------------------+-----+
|Description                        |color|
+-----------------------------------+-----+
|WHITE HANGING HEART T-LIGHT HOLDER |WHITE|
|WHITE METAL LANTERN                |WHITE|
|CREAM CUPID HEARTS COAT HANGER     |NULL |
|KNITTED UNION FLAG HOT WATER BOTTLE|NULL |
|RED WOOLLY HOTTIE WHITE HEART.     |RED  |
+-----------------------------------+-----+
only showing top 5 rows



#### Dates and Timestamps

In [149]:
from pyspark.sql.functions import current_date, current_timestamp, unix_timestamp
from pyspark.sql.functions import date_diff, date_add, date_sub, now

In [165]:
datedf = spark.range(1).withColumn("today", current_date()).withColumn("now", current_timestamp())
datedf.createOrReplaceTempView("date_table")
datedf.show(1, False)

+---+----------+--------------------------+
|id |today     |now                       |
+---+----------+--------------------------+
|0  |2025-05-12|2025-05-12 14:55:54.764549|
+---+----------+--------------------------+



In [166]:
spark.sql("""
    select current_date(), current_timestamp()
""").show(1, False)

+--------------+--------------------------+
|current_date()|current_timestamp()       |
+--------------+--------------------------+
|2025-05-12    |2025-05-12 14:56:04.641913|
+--------------+--------------------------+



In [167]:
datedf.select(date_sub("today", 2), date_add("today", 2)).show()

+------------------+------------------+
|date_sub(today, 2)|date_add(today, 2)|
+------------------+------------------+
|        2025-05-10|        2025-05-14|
+------------------+------------------+



In [177]:
spark.sql("""
    select date_sub(today, 2), date_add(today, 2), today - INTERVAL 2 DAY, today + INTERVAL 2 DAY
    from date_table
""").show()

+------------------+------------------+--------------------------------------------------------------+----------------------------------------------------------+
|date_sub(today, 2)|date_add(today, 2)|date_add(today, (- extractansiintervaldays(INTERVAL '2' DAY)))|date_add(today, extractansiintervaldays(INTERVAL '2' DAY))|
+------------------+------------------+--------------------------------------------------------------+----------------------------------------------------------+
|        2025-05-10|        2025-05-14|                                                    2025-05-10|                                                2025-05-14|
+------------------+------------------+--------------------------------------------------------------+----------------------------------------------------------+



In [None]:
from pyspark.sql.functions import datediff, to_date, to_timestamp, months_between

In [182]:
datedf.select(to_date(lit("2025-02-05")).alias("start"), 
            to_date(lit("2025-05-11")).alias("end")) \
            .select(months_between("end", "start"), date_diff("end", "start")).show()  

+--------------------------------+---------------------+
|months_between(end, start, true)|date_diff(end, start)|
+--------------------------------+---------------------+
|                      3.19354839|                   95|
+--------------------------------+---------------------+



In [None]:
# to_date expects the default format yyyy-MM-dd; a string that does not match it is returned as NULL
datedf.select(to_date(lit("2025-23-05")), to_date(lit("2025-05-23"))).show()

+-------------------+-------------------+
|to_date(2025-23-05)|to_date(2025-05-23)|
+-------------------+-------------------+
|               NULL|         2025-05-23|
+-------------------+-------------------+



In [188]:
# To parse a date written in another layout, pass the matching format string explicitly
datedf.select(to_date(lit("2015-24-05"), "yyyy-dd-MM")).show()

+-------------------------------+
|to_date(2015-24-05, yyyy-dd-MM)|
+-------------------------------+
|                     2015-05-24|
+-------------------------------+



In [195]:
spark.sql("""
    select TO_DATE("2025-04-23"), TO_DATE("2025-23-04"), TO_DATE("2025-23-04", "yyyy-dd-MM")
""").show()

+-------------------+-------------------+-------------------------------+
|to_date(2025-04-23)|to_date(2025-23-04)|to_date(2025-23-04, yyyy-dd-MM)|
+-------------------+-------------------+-------------------------------+
|         2025-04-23|               NULL|                     2025-04-23|
+-------------------+-------------------+-------------------------------+



#### Working With NULL

In [199]:
from pyspark.sql.functions import coalesce, ifnull, nullif, nvl, nvl2

In [216]:
df.select(coalesce(lit(None),  lit(None), lit(2.5)),
          ifnull(lit(None), lit(24)),
          nullif(lit(2), lit(2)),
          nullif(lit(2), lit(3)),
          nvl(lit(None), lit("hello")),
          nvl2(lit("first"), lit("second"), lit("third")),
          nvl2(lit(None), lit("second"), lit("third"))
    ).show(1)

+-------------------------+----------------+------------+------------+----------------+--------------------------+-------------------------+
|coalesce(NULL, NULL, 2.5)|ifnull(NULL, 24)|nullif(2, 2)|nullif(2, 3)|nvl(NULL, hello)|nvl2(first, second, third)|nvl2(NULL, second, third)|
+-------------------------+----------------+------------+------------+----------------+--------------------------+-------------------------+
|                      2.5|              24|        NULL|           2|           hello|                    second|                    third|
+-------------------------+----------------+------------+------------+----------------+--------------------------+-------------------------+
only showing top 1 row



In [219]:
spark.sql("""
    select coalesce(null, null, 4, null),
          ifnull(null, 3),
          nullif(2, 2),
          nullif(2 ,3),
          nvl(null, "hello"),
          nvl2("first", "second", "third"),
          nvl2(null, "second", "third")

""").show()

+-----------------------------+---------------+------------+------------+----------------+--------------------------+-------------------------+
|coalesce(NULL, NULL, 4, NULL)|ifnull(NULL, 3)|nullif(2, 2)|nullif(2, 3)|nvl(NULL, hello)|nvl2(first, second, third)|nvl2(NULL, second, third)|
+-----------------------------+---------------+------------+------------+----------------+--------------------------+-------------------------+
|                            4|              3|        NULL|           2|           hello|                    second|                    third|
+-----------------------------+---------------+------------+------------+----------------+--------------------------+-------------------------+



In [None]:
# drop
df.na.drop("any").count()   # if any value in row is null then the row is dropped

1968

In [224]:
df.na.drop("all").count()   # if all value of row is null then drop the row

3108

In [226]:
df.na.drop("any", subset=["StockCode", "InvoiceNo"]).count()

3108

In [None]:
# fill
df.na.fill("All null Value").where("Description = 'All null Value'").show(2)
# fill("All null Value") only applies to columns of type string. thats why CustomerID is still NULL

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|All null Value|      56|2010-12-01 11:52:00|      0.0|      NULL|United Kingdom|
|   536545|    21134|All null Value|       1|2010-12-01 14:32:00|      0.0|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [232]:
df.na.fill({"Description": "All null value", "CustomerID": 0}) \
    .where("Description = 'All null value'").show(2)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|All null value|      56|2010-12-01 11:52:00|      0.0|       0.0|United Kingdom|
|   536545|    21134|All null value|       1|2010-12-01 14:32:00|      0.0|       0.0|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [None]:
spark.sql("""
    with fillnull as (
          select ifnull(Description, "All null value") as Description
    from retail_data
    )
    
    select *
    from fillnull
          where Description='All null value'
""") .show(2)

+--------------+
|   Description|
+--------------+
|All null value|
|All null value|
+--------------+
only showing top 2 rows



In [237]:
# replace
df.na.replace("", "UNKNOWN", "Description").show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [243]:
# null ordering
from pyspark.sql.functions import desc, asc 
from pyspark.sql.functions import desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

In [253]:
# Plain sort on Description: as the output shows, NULL descriptions come first.
df.sort("Description").show(5)

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|       NULL|      56|2010-12-01 11:52:00|      0.0|      NULL|United Kingdom|
|   536550|    85044|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1|2010-12-01 14:32:00|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536547|    37509|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [254]:
df.sort(asc_nulls_first("Description")).show(5)

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|       NULL|      56|2010-12-01 11:52:00|      0.0|      NULL|United Kingdom|
|   536550|    85044|       NULL|       1|2010-12-01 14:34:00|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1|2010-12-01 14:32:00|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
|   536547|    37509|       NULL|       1|2010-12-01 14:33:00|      0.0|      NULL|United Kingdom|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [256]:
df.orderBy(desc("Description")).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536592|    84832|ZINC WILLIE WINKI...|       1|2010-12-01 17:06:00|     1.66|      NULL|United Kingdom|
|   536381|    84832|ZINC WILLIE WINKI...|       1|2010-12-01 09:41:00|     0.85|   15311.0|United Kingdom|
|   536544|    84832|ZINC WILLIE WINKI...|       3|2010-12-01 14:32:00|     1.66|      NULL|United Kingdom|
|   536522|    84836|ZINC METAL HEART ...|       1|2010-12-01 12:49:00|     1.25|   15012.0|United Kingdom|
|   536446|    84836|ZINC METAL HEART ...|      12|2010-12-01 12:15:00|     1.25|   15983.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



#### Complex types

In [265]:
from pyspark.sql.functions import struct, array, create_map


In [275]:
#struct

complex_df = df.select(struct("Description", "InvoiceNo").alias("complex_type"))
complex_df.show(2, False)

+--------------------------------------------+
|complex_type                                |
+--------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365}|
|{WHITE METAL LANTERN, 536365}               |
+--------------------------------------------+
only showing top 2 rows



In [274]:
complex_df.selectExpr("complex_type.Description", "complex_type.InvoiceNo").show(2, False)

+----------------------------------+---------+
|Description                       |InvoiceNo|
+----------------------------------+---------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |
|WHITE METAL LANTERN               |536365   |
+----------------------------------+---------+
only showing top 2 rows



In [287]:
#Array
from pyspark.sql.functions import split, size, array_contains, explode


In [None]:
# split
df.select(split("Description", " ")).show(2, False)

+----------------------------------------+
|split(Description,  , -1)               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows



In [None]:
# size
df.select("Description", size(split("Description", " ")).alias("lenght")).show(2, False)

+----------------------------------+------+
|Description                       |lenght|
+----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER|5     |
|WHITE METAL LANTERN               |3     |
+----------------------------------+------+
only showing top 2 rows



In [None]:
#array_contains
df.select(split("Description", " "), array_contains(split("Description", " "), "WHITE")).show(2, False)

+----------------------------------------+------------------------------------------------+
|split(Description,  , -1)               |array_contains(split(Description,  , -1), WHITE)|
+----------------------------------------+------------------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|true                                            |
|[WHITE, METAL, LANTERN]                 |true                                            |
+----------------------------------------+------------------------------------------------+
only showing top 2 rows



In [289]:
# explode
df.select("CustomerID", explode(split("Description", " "))).show(5)

+----------+-------+
|CustomerID|    col|
+----------+-------+
|   17850.0|  WHITE|
|   17850.0|HANGING|
|   17850.0|  HEART|
|   17850.0|T-LIGHT|
|   17850.0| HOLDER|
+----------+-------+
only showing top 5 rows



In [304]:
# map
from pyspark.sql.functions import create_map

In [305]:
# Map keys may not be NULL, and CustomerID is missing on many rows (describe() counts
# 1968 non-null values out of 3108). Drop those rows before building the map;
# otherwise any action that reaches one of them fails with a null-map-key error.
complex_map = (
    df.where(col("CustomerID").isNotNull())
    .select(create_map(col("CustomerID").cast("int"), "Description").alias("complex_map"))
)
complex_map.show(2, False)

+---------------------------------------------+
|complex_map                                  |
+---------------------------------------------+
|{17850 -> WHITE HANGING HEART T-LIGHT HOLDER}|
|{17850 -> WHITE METAL LANTERN}               |
+---------------------------------------------+
only showing top 2 rows



In [306]:
# The map is keyed by CustomerID cast to int, so look it up with an integer key
# instead of depending on an implicit string-to-int coercion of '17850'.
complex_map.selectExpr("complex_map[17850]").show(2)

+--------------------+
|  complex_map[17850]|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [309]:
# Use an integer key that matches the map's int key type rather than the string "17850".
complex_map.select(col("complex_map").getItem(17850)).show(5)

+--------------------+
|  complex_map[17850]|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
+--------------------+
only showing top 5 rows



#### User Defined Function

In [384]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [None]:
numdf = spark.range(5).toDF("num")
numdf.show()

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [324]:
numdf.createOrReplaceTempView("number")

In [385]:
def power3(num: int) -> int:
    """Return ``num`` raised to the third power.

    When this function runs as a Spark UDF, a SQL NULL arrives as ``None``.
    ``None ** 3`` would raise ``TypeError`` and fail the job, so ``None`` is
    passed through unchanged (NULL in, NULL out).

    Any other object that supports ``**`` is cubed as before; the notebook
    also calls this function directly on a Column.
    """
    if num is None:
        # Propagate NULL instead of crashing on None ** 3.
        return None
    return num**3

power3udf = udf(power3, IntegerType())
# power3udf can now be used as a DataFrame function

In [386]:
numdf.select(power3udf("num")).show()

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+



In [387]:
numdf.select(power3(col("num"))).show()

+-------------+
|POWER(num, 3)|
+-------------+
|          0.0|
|          1.0|
|          8.0|
|         27.0|
|         64.0|
+-------------+



In [388]:
# Register power3 with Spark under the name "power3" so it can be called from SQL expressions
from pyspark.sql.types import DoubleType, IntegerType

spark.udf.register("power3", power3, IntegerType())

25/05/13 09:35:14 WARN SimpleFunctionRegistry: The function power3 replaced a previously registered function.


<function __main__.power3(num: int) -> int>

In [389]:
numdf.selectExpr("power3(num)").show()

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+



In [390]:
spark.sql("""
    select power3(num)
    from number
""").show()

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+



In [391]:
spark.stop()