In [11]:
import glob
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkContext, SparkConf

In [12]:
spark = SparkSession.builder\
                    .appName('Chapter4')\
                    .getOrCreate()

In [13]:
spark = SparkSession.builder\
                    .master("local")\
                    .appName('abc')\
                    .getOrCreate()

In [16]:
df = spark.read.format("csv")\
        .option("header", "True")\
        .option("inferSchema", "true")\
        .load("Spark-The-Definitive-Guide/data/retail-data/all/online-retail-dataset.csv")

## Working with booleans

In [17]:
from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT"))\
        .where(priceFilter | descripFilter)\
        .show(5)

+---------+---------+--------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+---------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|12/1/2010 14:32|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|12/1/2010 17:06|   607.49|      null|United Kingdom|
|   536862|      DOT|DOTCOM POSTAGE|       1|12/3/2010 11:13|   254.43|      null|United Kingdom|
|   536864|      DOT|DOTCOM POSTAGE|       1|12/3/2010 11:27|   121.06|      null|United Kingdom|
|   536865|      DOT|DOTCOM POSTAGE|       1|12/3/2010 11:28|   498.47|      null|United Kingdom|
+---------+---------+--------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [18]:
# Adding a boolean column to a df based on logical conditions
from pyspark.sql.functions import instr
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
            .where("isExpensive")\
            .select("unitPrice", "isExpensive")\
            .show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
|   254.43|       true|
|   121.06|       true|
|   498.47|       true|
+---------+-----------+
only showing top 5 rows



## Working with numerical values

In [19]:
from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2)
df.select(
        expr("CustomerId"),
        fabricatedQuantity.alias("realQuantity"))\
        .show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|     17850|234.08999999999997|
|     17850|          413.7156|
+----------+------------------+
only showing top 2 rows



In [20]:
fabricatedQuantity

Column<b'POWER((Quantity * UnitPrice), 2.0)'>

In [22]:
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|     17850|239.08999999999997|
|     17850|          418.7156|
+----------+------------------+
only showing top 2 rows



In [24]:
# Pearson correlation Coefficient
# import org.apache.spark.sql.functions.{corr}
# df.stat.corr("Quantity", "UnitPrice")
%time df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.00123492454487...|
+-------------------------+

CPU times: user 1.64 ms, sys: 778 µs, total: 2.42 ms
Wall time: 293 ms


In [25]:
df.describe().select(col("summary"),col("Quantity"),col("UnitPrice")).show()

+-------+-----------------+-----------------+
|summary|         Quantity|        UnitPrice|
+-------+-----------------+-----------------+
|  count|           541909|           541909|
|   mean| 9.55224954743324| 4.61111362608971|
| stddev|218.0811578502344|96.75985306117953|
|    min|           -80995|        -11062.06|
|    max|            80995|          38970.0|
+-------+-----------------+-----------------+



In [26]:
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice", quantileProbs, relError)

[2.1]

## Working with strings

In [29]:
# from pyspark.sql.functions import initcap
df.select(
col("Description"),
lower(col("Description")),
upper(lower(col("Description")))).show(5)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|     CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|     KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|     RED WOOLLY HOTTIE...|
+--------------------+--------------------+-------------------------+
only showing top 5 rows



In [28]:
df.select(
ltrim(lit(" HELLO   ")).alias("ltrim"),
rtrim(lit("   HELLO       ")).alias("rtrim"),
trim(lit("       HELLO        ")).alias("trim"),
lpad(lit("HELLO"), 3, " ").alias("lp"),
rpad(lit("HELLO"), 10, " ").alias("rp"))\
.show(2)

+--------+--------+-----+---+----------+
|   ltrim|   rtrim| trim| lp|        rp|
+--------+--------+-----+---+----------+
|HELLO   |   HELLO|HELLO|HEL|HELLO     |
|HELLO   |   HELLO|HELLO|HEL|HELLO     |
+--------+--------+-----+---+----------+
only showing top 2 rows



### Regular Expressions

In [30]:
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(regexp_replace(col("Description"), regex_string, "COLOR")
.alias("color_cleaned"),
col("Description"))\
.show(2)

+--------------------+--------------------+
|       color_cleaned|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [31]:
df.select(
translate(col("Description"), "LEET", "1337"),
col("Description"))\
.show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [32]:
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
    .filter("hasSimpleColor")\
    .select("Description")\
    .show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [45]:
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    """This function creates a column declaring whether or
    not a given pySpark column contains the UPPERCASED
    color.
    Returns a new column type that can be used
    in a select statement.
    """
    return locate(color_string.upper(), column)\
                                    .cast("boolean")\
                                    .alias("is_" + c)

# selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
print([c for c in simpleColors])
# selectedColumns.append(expr("*")) # has to a be Column type
df\
.select(*selectedColumns)\
.where(expr("is_white OR is_red"))\
.select("Description")\
.show(3, False)

NameError: name 'c' is not defined

## Dates and timestamps

In [46]:
dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [48]:
dateDF\
.select(col("today"),
date_sub(col("today"), 5),
date_add(col("today"), 5))\
.show(1)

+----------+------------------+------------------+
|     today|date_sub(today, 5)|date_add(today, 5)|
+----------+------------------+------------------+
|2019-01-20|        2019-01-15|        2019-01-25|
+----------+------------------+------------------+
only showing top 1 row



In [50]:
dateDF\
.withColumn("week_ago", date_sub(col("today"), 7))\
.select(datediff(col("week_ago"), col("today")))\
.show(1)
dateDF\
.select(
to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))\
.select(months_between(col("start"), col("end")))\
.show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [54]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(to_date(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp").alias("date")),
                        to_date(unix_timestamp(lit("2017-20-12"), dateFormat).cast("timestamp").alias("date2")))

cleanDateDF.createOrReplaceTempView("dateTable2")

In [55]:
cleanDateDF.show()

+--------------------------------------------------------------------------------+---------------------------------------------------------------------------------+
|to_date(CAST(unix_timestamp('2017-12-11', 'yyyy-dd-MM') AS TIMESTAMP) AS `date`)|to_date(CAST(unix_timestamp('2017-20-12', 'yyyy-dd-MM') AS TIMESTAMP) AS `date2`)|
+--------------------------------------------------------------------------------+---------------------------------------------------------------------------------+
|                                                                      2017-11-12|                                                                       2017-12-20|
+--------------------------------------------------------------------------------+---------------------------------------------------------------------------------+



## Working with nulls 

In [58]:
fill_cols_vals = {
"StockCode": 5,
"Description" : "No Value"
}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [57]:
df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

## Working with complex types

In [59]:
# structs

In [60]:
complexDF = df\
.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [63]:
df.select(split(col("Description"), " ")).show(2)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows



In [62]:
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]")\
.show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [65]:
# look if an array contains

df.select(array_contains(split(col("Description"), " "), "WHITE")).show()

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
|                                       false|
|                                       false|
|                                        true|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|                                       false|
|            

In [66]:
# explode

# The explode function takes a column that consists of arrays and creates one
# row (with the rest of the values duplicated) per value in the array. The
# following figure illustrates the process.

df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "exploded").show()



+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
|WHITE HANGING HEA...|   536365|   HEART|
|WHITE HANGING HEA...|   536365| T-LIGHT|
|WHITE HANGING HEA...|   536365|  HOLDER|
| WHITE METAL LANTERN|   536365|   WHITE|
| WHITE METAL LANTERN|   536365|   METAL|
| WHITE METAL LANTERN|   536365| LANTERN|
|CREAM CUPID HEART...|   536365|   CREAM|
|CREAM CUPID HEART...|   536365|   CUPID|
|CREAM CUPID HEART...|   536365|  HEARTS|
|CREAM CUPID HEART...|   536365|    COAT|
|CREAM CUPID HEART...|   536365|  HANGER|
|KNITTED UNION FLA...|   536365| KNITTED|
|KNITTED UNION FLA...|   536365|   UNION|
|KNITTED UNION FLA...|   536365|    FLAG|
|KNITTED UNION FLA...|   536365|     HOT|
|KNITTED UNION FLA...|   536365|   WATER|
|KNITTED UNION FLA...|   536365|  BOTTLE|
|RED WOOLLY HOTTIE...|   536365|     RED|
+--------------------+---------+--

## user defined functions

In [67]:
# Jsons

udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
    return double_value ** 3
power3(2.0)

8.0

In [68]:
power3udf = udf(power3)

In [69]:
udfExampleDF.select(power3udf(col("num"))).show()

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+



In [70]:
df.select(power3udf(col("Quantity"))).show()

+----------------+
|power3(Quantity)|
+----------------+
|             216|
|             216|
|             512|
|             216|
|             216|
|               8|
|             216|
|             216|
|             216|
|           32768|
|             216|
|             216|
|             512|
|             216|
|             216|
|              27|
|               8|
|              27|
|              27|
|              64|
+----------------+
only showing top 20 rows



In [72]:
udfExampleDF.selectExpr("power3(num)").show()

AnalysisException: "Undefined function: 'power3'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0"