In [None]:
// Load a single day of the retail dataset. The first CSV row supplies the column
// names and the column types are inferred from the data.
val retailCsvPath = "/home/koushik/git/spark/input/Spark-The-Definitive-Guide/retail-data/by-day/2010-12-01.csv"
val df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(retailCsvPath))

In [None]:
// Print the column names and the inferred column types of df.
df.printSchema()

In [None]:
// Expose df under the view name "dfTable" so it can be referenced from SQL.
df.createOrReplaceTempView("dfTable")

#### Booleans

In [None]:
// Keep only the rows of invoice 536365 and print two of their columns untruncated.
// Column.equalTo is the method spelling of the === operator.
import org.apache.spark.sql.functions.col
val isInvoice536365 = col("InvoiceNo").equalTo(536365)
df.where(isInvoice536365).select("InvoiceNo", "Description").show(5, false)

In [None]:
// Filter on invoice 536365 using the === operator.
(df
  .where(col("InvoiceNo") === 536365)
  .select("InvoiceNo", "Description")
  .show(5, false))

In [None]:
// Boolean Column expressions defined once so they can be combined in later filters.
val priceFilter = col("UnitPrice").gt(600)                  // unit price above 600
val descripFilter = col("Description").contains("POSTAGE")  // description mentions POSTAGE


In [None]:
// Rows whose StockCode is DOT or COM and that satisfy priceFilter or descripFilter.
val isDotOrCom = col("StockCode").isin("DOT", "COM")
df.where(isDotOrCom).where(priceFilter || descripFilter).show()

In [None]:
// The three boolean expressions that make up the isExpensive flag.
val DOTCodeFilter = col("StockCode").equalTo("DOT")
val priceFilter = col("UnitPrice").gt(600)
val descripFilter = col("Description").contains("POSTAGE")

In [None]:
// Add a boolean column isExpensive = DOT code AND (high price OR postage) and show
// the first five rows, whatever the value of the flag.
val isExpensiveFlag = DOTCodeFilter && (priceFilter || descripFilter)
df.withColumn("isExpensive", isExpensiveFlag).show(5)

In [None]:
// Add the isExpensive flag and keep only the rows where it is true.
(df
  .withColumn("isExpensive", DOTCodeFilter.and(priceFilter.or(descripFilter)))
  .where("isExpensive")
  .show(5))

In [None]:
// Flag the rows, keep the flagged ones, and show only the price next to the flag.
val flaggedRows = df.withColumn("isExpensive", DOTCodeFilter.and(priceFilter.or(descripFilter)))
flaggedRows.where("isExpensive").select("unitPrice", "isExpensive").show(5)

#### Numbers:

In [None]:
// Numbers: square of (Quantity * UnitPrice), built with the pow function.
import org.apache.spark.sql.functions.{expr, pow}
val lineTotal = col("Quantity") * col("UnitPrice")
val fabricatedQuantity = pow(lineTotal, 2)

In [1]:
// Show the customer id next to the squared line total, renamed to realQuantity.
val realQuantity = fabricatedQuantity.alias("realQuantity")
df.select(expr("CustomerId"), realQuantity).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|234.08999999999997|
|   17850.0|          413.7156|
+----------+------------------+
only showing top 2 rows



In [2]:
// The squared line total written as a SQL expression, with 5 added to the result.
df.selectExpr("CustomerId","(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [3]:
// Round UnitPrice to one decimal place and show it beside the original value.
import org.apache.spark.sql.functions.{round, bround}
val unitPrice = col("UnitPrice")
df.select(round(unitPrice, 1).alias("rounded"), unitPrice).show(5)

+-------+---------+
|rounded|UnitPrice|
+-------+---------+
|    2.6|     2.55|
|    3.4|     3.39|
|    2.8|     2.75|
|    3.4|     3.39|
|    3.4|     3.39|
+-------+---------+
only showing top 5 rows



In [4]:
// describe() reports count, mean, stddev, min and max per column. As the printed
// table shows, string columns such as Description and Country are included as well,
// with null for mean and stddev.
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

#### Strings

In [5]:
// initcap capitalizes the first letter of every word and lower-cases the rest
// (e.g. "White Hanging Heart T-light Holder" in the output).
import org.apache.spark.sql.functions.{initcap}
df.select(initcap(col("Description"))).show(2, false)

+----------------------------------+
|initcap(Description)              |
+----------------------------------+
|White Hanging Heart T-light Holder|
|White Metal Lantern               |
+----------------------------------+
only showing top 2 rows



In [6]:
// Show the description as stored, lower-cased, and upper-cased again.
import org.apache.spark.sql.functions.{lower, upper}
val descriptionCol = col("Description")
df.select(descriptionCol, lower(descriptionCol), upper(lower(descriptionCol))).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



In [7]:
// Trimming and padding of literal strings. In the output, lpad to length 3 cuts
// "HELLO" down to "HEL", while rpad to length 10 appends spaces.
import org.apache.spark.sql.functions.{lit, ltrim, rtrim, rpad,trim,lpad}
val spacedHello = lit(" HELLO ")
val plainHello = lit("HELLO")
df.select(
  ltrim(spacedHello).as("ltrim"),
  rtrim(spacedHello).as("rtrim"),
  trim(spacedHello).as("trim"),
  lpad(plainHello, 3, " ").as("lp"),
  rpad(plainHello, 10, " ").as("rp")
).show(2)

+------+------+-----+---+----------+
| ltrim| rtrim| trim| lp|        rp|
+------+------+-----+---+----------+
|HELLO | HELLO|HELLO|HEL|HELLO     |
|HELLO | HELLO|HELLO|HEL|HELLO     |
+------+------+-----+---+----------+
only showing top 2 rows



#### Regular Expressions

In [None]:
import org.apache.spark.sql.functions.regexp_replace
// Colour names to look for in the (upper-case) descriptions.
val simpleColors = Seq("black", "white", "red", "green", "blue")
// Alternation pattern "BLACK|WHITE|...". Locale.ROOT keeps the upper-casing independent
// of the JVM default locale (a Turkish default would turn "white" into "WHİTE").
val regexString = simpleColors.map(_.toUpperCase(java.util.Locale.ROOT)).mkString("|")

In [8]:
// Replace every match of regexString in Description with the word COLOR.
val colorCleaned = regexp_replace(col("Description"), regexString, "COLOR").alias("color_cleaned")
df.select(colorCleaned, col("Description")).show(2)

+--------------------+--------------------+
|       color_cleaned|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [9]:
// translate substitutes single characters; in the output L becomes 1, E becomes 3
// and T becomes 7.
import org.apache.spark.sql.functions.translate
val leetDescription = translate(col("Description"), "LEET", "1337")
df.select(leetDescription, col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [None]:
import org.apache.spark.sql.functions.regexp_extract
// Same alternation wrapped in one capture group: "(BLACK|WHITE|...)"; the | signifies
// OR in regular expression syntax. Locale.ROOT keeps the upper-casing independent of
// the JVM default locale.
val regexString = simpleColors.map(_.toUpperCase(java.util.Locale.ROOT)).mkString("(", "|", ")")

In [10]:
// Extract capture group 1 of regexString from each description.
val firstColor = regexp_extract(col("Description"), regexString, 1).alias("color_cleaned")
df.select(firstColor, col("Description")).show(2)

+-------------+--------------------+
|color_cleaned|         Description|
+-------------+--------------------+
|        WHITE|WHITE HANGING HEA...|
|        WHITE| WHITE METAL LANTERN|
+-------------+--------------------+
only showing top 2 rows



In [None]:
// contains: boolean expressions testing for a substring in Description. Both use the
// column name exactly as it appears in the schema, so they do not depend on Spark
// resolving column names case-insensitively.
val containsBlack = col("Description").contains("BLACK")
val containsWhite = col("Description").contains("WHITE")

In [11]:
// Flag descriptions containing BLACK or WHITE, keep the flagged rows, print them in full.
val blackOrWhite = containsBlack.or(containsWhite)
(df
  .withColumn("hasSimpleColor", blackOrWhite)
  .filter("hasSimpleColor")
  .select("Description")
  .show(3, false))

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [None]:
// contains for a whole list of values: one boolean column is_<color> per colour, with
// every original column ("*") appended at the end. Locale.ROOT keeps the upper-casing
// independent of the JVM default locale.
val simpleColors = Seq("black", "white", "red", "green", "blue")
val selectedColumns = simpleColors.map { color =>
  col("Description").contains(color.toUpperCase(java.util.Locale.ROOT)).alias(s"is_$color")
} :+ expr("*")

In [12]:
// Pass the column list as varargs, then keep rows flagged as white or red.
val whiteOrRed = col("is_white") || col("is_red")
df.select(selectedColumns: _*).where(whiteOrRed).select("Description").show(3, false)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



#### Dates

In [None]:
// Ten rows (id 0..9), each carrying the current date and the current timestamp;
// also registered as the view "dateTable".
import org.apache.spark.sql.functions.{current_date, current_timestamp}
val dateDF = (spark.range(10)
  .withColumn("today", current_date())
  .withColumn("now", current_timestamp()))
dateDF.createOrReplaceTempView("dateTable")

In [13]:
// Show the first two rows of dateDF.
dateDF.show(2)

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2019-02-07|2019-02-07 02:58:...|
|  1|2019-02-07|2019-02-07 02:58:...|
+---+----------+--------------------+
only showing top 2 rows



In [14]:
// Subtract five days from, and add five days to, the "today" column.
import org.apache.spark.sql.functions.{date_add, date_sub}
val todayCol = col("today")
dateDF.select(date_sub(todayCol, 5), date_add(todayCol, 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2019-02-02|        2019-02-12|
+------------------+------------------+
only showing top 1 row



In [15]:
// datediff: number of days between two dates. week_ago is today minus 7 days, and
// datediff(week_ago, today) prints -7.
import org.apache.spark.sql.functions.{datediff, months_between,to_date}
val withWeekAgo = dateDF.withColumn("week_ago", date_sub(col("today"), 7))
withWeekAgo.show(1)
withWeekAgo.select(datediff(col("week_ago"), col("today"))).show(1)

+---+----------+--------------------+----------+
| id|     today|                 now|  week_ago|
+---+----------+--------------------+----------+
|  0|2019-02-07|2019-02-07 02:58:...|2019-01-31|
+---+----------+--------------------+----------+
only showing top 1 row

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



In [None]:
// to_date: parse two literal strings into date columns named "start" and "end".
val startDate = to_date(lit("2016-01-01")).alias("start")
val endDate = to_date(lit("2017-05-22")).alias("end")
val start_end = dateDF.select(startDate, endDate)

In [16]:
// Show the two parsed dates, then months_between(start, end). The printed value is
// negative because "start" is the earlier of the two dates.
start_end.show(1)
start_end.select(months_between(col("start"), col("end"))).show(1)

+----------+----------+
|     start|       end|
+----------+----------+
|2016-01-01|2017-05-22|
+----------+----------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [17]:
// Convert a string column holding "2017-01-01" into a date column.
import org.apache.spark.sql.functions.{to_date, lit}
val withDateString = spark.range(5).withColumn("date", lit("2017-01-01"))
withDateString.select(to_date(col("date"))).show(1)

+---------------+
|to_date(`date`)|
+---------------+
|     2017-01-01|
+---------------+
only showing top 1 row



In [18]:
// Without an explicit format the string has to be year-month-day (yyyy-MM-dd).
// "2016-20-12" would need a month 20, so the output shows null for it instead of an error.
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

+---------------------+---------------------+
|to_date('2016-20-12')|to_date('2017-12-11')|
+---------------------+---------------------+
|                 null|           2017-12-11|
+---------------------+---------------------+
only showing top 1 row



In [None]:
// to_date accepts an optional format argument. With "yyyy-dd-MM" the input is read as
// year-day-month, so "2017-12-11" is parsed as 2017-11-12 and "2017-20-12" as 2017-12-20.
import org.apache.spark.sql.functions.to_date
val dateFormat = "yyyy-dd-MM"
val cleanDateDF = spark.range(1).select(
  to_date(lit("2017-12-11"), dateFormat).alias("date"),
  to_date(lit("2017-20-12"), dateFormat).alias("date2")
)

In [None]:
// Expose cleanDateDF under the view name "dateTable2" for SQL queries.
cleanDateDF.createOrReplaceTempView("dateTable2")

In [19]:
// Show the two dates parsed with the explicit format.
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [20]:
// to_timestamp converts a column to a timestamp; here the format defined earlier is
// passed explicitly.
// NOTE(review): Spark also offers a to_timestamp overload without a format, so the
// format is presumably not mandatory - confirm against the Spark version in use.
import org.apache.spark.sql.functions.to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat).alias("date")).show()

+-------------------+
|               date|
+-------------------+
|2017-11-12 00:00:00|
+-------------------+



In [21]:
// Dates and timestamps can be compared once they have the correct format and type.
// Comparison against an explicit literal column:
cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()
// Comparison against a plain Scala string, which Spark wraps in a literal itself.
// The string must not carry SQL-style quotes of its own: with "'2017-12-12'" the
// comparison would be made against the text including the quote characters.
cleanDateDF.filter(col("date2") > "2017-12-12").show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



#### Null

In [22]:
// coalesce returns, per row, the first non-null value among the given columns.
// In the two rows shown Description is not null, so its value is returned. The
// describe() counts indicate that other rows do have a null Description; for those
// rows the CustomerId value would be used instead.
import org.apache.spark.sql.functions.coalesce
df.select(col("Description"), col("CustomerId")).show(2)
df.select(coalesce(col("Description"), col("CustomerId"))).show(2)

+--------------------+----------+
|         Description|CustomerId|
+--------------------+----------+
|WHITE HANGING HEA...|   17850.0|
| WHITE METAL LANTERN|   17850.0|
+--------------------+----------+
only showing top 2 rows

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
+---------------------------------+
only showing top 2 rows



In [None]:
// drop: removes rows that contain nulls. The results are not assigned here, so these
// lines only illustrate the available call forms.
df.na.drop()
df.na.drop("all")// drop a row only if all of its values are null
df.na.drop("any")// drop a row if any of its values is null
df.na.drop("all", Seq("StockCode", "InvoiceNo"))// same "all" rule, restricted to the listed columns

In [None]:
// fill: replaces nulls in one or more columns with a given value. The results are
// not assigned here, so these lines only illustrate the available call forms.
df.na.fill("All Null values become this string")
df.na.fill(5)// replace null values of any numeric column with 5
df.na.fill(5, Seq("StockCode", "InvoiceNo"))// replace null values with 5, restricted to the listed columns

In [None]:
// Per-column replacement values for na.fill: column name -> value to use for nulls.
// The values mix Int and String, so the value type is spelled out as Any.
val fillColValues: Map[String, Any] = Map(
  "StockCode" -> 5,
  "Description" -> "No Value"
)

In [None]:

// Fill nulls column by column using the name -> value map defined in fillColValues.
df.na.fill(fillColValues)

#### Replace

In [None]:
// replace works on actual values rather than nulls: in Description, the empty
// string "" is replaced with "UNKNOWN".
df.na.replace("Description", Map("" -> "UNKNOWN"))

#### Ordering

In [None]:
//use asc_nulls_first, desc_nulls_first, asc_nulls_last, or desc_nulls_last 
//to specify where you would like your null values to appear in an ordered DataFrame.

### Complex Types

#### Structs

In [23]:
// Two SQL spellings that build a struct column named "complex" from Description and
// InvoiceNo: a parenthesised column list, and the explicit struct(...) function.
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2)
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2)

+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|[WHITE HANGING HE...|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|[WHITE METAL LANT...|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

+--------------------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|

In [None]:
// A DataFrame with a single struct column "complex" whose fields are Description and
// InvoiceNo - loosely, a small table nested inside each row.
import org.apache.spark.sql.functions.struct
val complexColumn = struct("Description", "InvoiceNo").alias("complex")
val complexDF = df.select(complexColumn)

In [24]:
// Register the struct DataFrame as a view, then read the struct's fields three ways.
complexDF.createOrReplaceTempView("complexDFSchema")
complexDF.select("complex.Description").show(2)// one field via dot notation
complexDF.select(col("complex").getField("Description")).show(2)// the same field via getField
complexDF.select("complex.*").show(2)// all fields of the struct as top-level columns

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



#### Arrays

In [25]:
// split: break each description into an array of words at single spaces.
import org.apache.spark.sql.functions.split
val descriptionWords = split(col("Description"), " ")
df.select(descriptionWords).show(2)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows



In [26]:
// Name the word array "array_col" and pick its first element with SQL index syntax.
val wordArray = split(col("Description"), " ").alias("array_col")
df.select(wordArray).selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [27]:
// size: number of elements in the word array of each description.
import org.apache.spark.sql.functions.size
val wordCount = size(split(col("Description"), " "))
df.select(wordCount).show(2)

+---------------------------+
|size(split(Description,  ))|
+---------------------------+
|                          5|
|                          3|
+---------------------------+
only showing top 2 rows



In [28]:
// array_contains: does the word array of the description include the word WHITE?
import org.apache.spark.sql.functions.array_contains
val hasWhiteWord = array_contains(split(col("Description"), " "), "WHITE")
df.select(hasWhiteWord).show(2)

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
+--------------------------------------------+
only showing top 2 rows



In [29]:
// explode: emit one output row per element of the "splitted" array; the other
// columns are repeated on every generated row.
import org.apache.spark.sql.functions.{split, explode}
(df
  .withColumn("splitted", split(col("Description"), " "))
  .withColumn("exploded", explode(col("splitted")))
  .select("InvoiceNo", "Description", "splitted", "exploded")
  .show(5))

+---------+--------------------+--------------------+--------+
|InvoiceNo|         Description|            splitted|exploded|
+---------+--------------------+--------------------+--------+
|   536365|WHITE HANGING HEA...|[WHITE, HANGING, ...|   WHITE|
|   536365|WHITE HANGING HEA...|[WHITE, HANGING, ...| HANGING|
|   536365|WHITE HANGING HEA...|[WHITE, HANGING, ...|   HEART|
|   536365|WHITE HANGING HEA...|[WHITE, HANGING, ...| T-LIGHT|
|   536365|WHITE HANGING HEA...|[WHITE, HANGING, ...|  HOLDER|
+---------+--------------------+--------------------+--------+
only showing top 5 rows



#### Maps

In [30]:
// Build a map column with Description as key and InvoiceNo as value.
import org.apache.spark.sql.functions.map
val descriptionToInvoice = map(col("Description"), col("InvoiceNo")).alias("complex_map")
df.select(descriptionToInvoice).show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [31]:
// Build a map keyed by InvoiceNo and explode it into separate key and value columns.
val invoiceToDescription = map(col("InvoiceNo"), col("Description")).alias("complex_map")
df.select(invoiceToDescription).selectExpr("explode(complex_map)").show(2)

+------+--------------------+
|   key|               value|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
+------+--------------------+
only showing top 2 rows



#### JSON

In [None]:
// One-row DataFrame whose only column, jsonString, holds a small nested JSON document.
val jsonDF = spark.range(1).selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

In [32]:
// get_json_object follows a JSON path (here to array element 1, printed as 2);
// json_tuple pulls out the value stored under a top-level key.
import org.apache.spark.sql.functions.{get_json_object, json_tuple}
val jsonStringCol = col("jsonString")
jsonDF.select(
  get_json_object(jsonStringCol, "$.myJSONKey.myJSONValue[1]").as("column"),
  json_tuple(jsonStringCol, "myJSONKey").as("tuple")
).show(2)

+------+--------------------+
|column|               tuple|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



In [33]:
// to_json: serialise a struct column into a JSON string.
import org.apache.spark.sql.functions.to_json
val invoiceStructDF = df.selectExpr("(InvoiceNo, Description) as myStruct")
invoiceStructDF.select(to_json(col("myStruct"))).show(2)

+-----------------------+
|structstojson(myStruct)|
+-----------------------+
|   {"InvoiceNo":"536...|
|   {"InvoiceNo":"536...|
+-----------------------+
only showing top 2 rows



In [None]:
// from_json needs the target schema: two nullable string fields.
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types._
val parseSchema = StructType(Seq(
  StructField("InvoiceNo", StringType, nullable = true),
  StructField("Description", StringType, nullable = true)
))

In [35]:
// Round trip: struct -> JSON string (newJSON) -> struct parsed back with parseSchema.
val invoiceJsonDF = df
  .selectExpr("(InvoiceNo, Description) as myStruct")
  .select(to_json(col("myStruct")).alias("newJSON"))
invoiceJsonDF.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+----------------------+--------------------+
|jsontostructs(newJSON)|             newJSON|
+----------------------+--------------------+
|  [536365, WHITE HA...|{"InvoiceNo":"536...|
|  [536365, WHITE ME...|{"InvoiceNo":"536...|
+----------------------+--------------------+
only showing top 2 rows



#### UDF

In [None]:
import org.apache.spark.sql.functions.udf
/** Returns `number` raised to the third power. A plain Scala function at this point;
  * it is wrapped as a Spark UDF separately.
  */
def power3(number: Double): Double = {
  val squared = number * number
  squared * number
}

In [36]:
// Call the plain Scala function directly; no DataFrame is involved yet.
println(power3(2.0))

8.0


In [None]:
// Register the function under the name "power3" so that SQL expressions can call it.
spark.udf.register("power3", power3(_:Double):Double)

In [38]:
// Input for the SQL call: a single column "num" holding 3 and 4. It is built here
// because the cell that otherwise defines udfExampleDF appears further down in the
// notebook, so a top-to-bottom run would stop on an undefined name.
val udfExampleDF = spark.range(3,5).toDF("num")
// Call the function registered as "power3" from a SQL expression.
udfExampleDF.selectExpr("power3(num) as cube").show(2)

+----+
|cube|
+----+
|27.0|
|64.0|
+----+



In [None]:
// Wrap power3 as a DataFrame UDF and build a two-row input (num = 3 and 4).
val power3udf = udf((value: Double) => power3(value))
val udfExampleDF = spark.range(3, 5).toDF("num")

In [37]:
// Apply the UDF to the "num" column like any built-in column function.
udfExampleDF.select(power3udf(col("num"))).show(2)

+--------+
|UDF(num)|
+--------+
|    27.0|
|    64.0|
+--------+

