## Working with booleans

In [1]:
import org.apache.spark.sql.functions.col
import org.apache.spark.{SparkContext, SparkConf}

In [2]:
import org.apache.spark.sql.functions._

In [3]:
// Build the Spark configuration (application name plus a 16g driver-memory setting)
// and create a SparkContext from it.
// NOTE(review): per the Spark configuration docs, spark.driver.memory set through SparkConf
// may have no effect in client mode because the driver JVM is already running — confirm
// how this notebook is launched.
val conf = new SparkConf()
                .setAppName("SparkLearning_Charper_4")
                .set("spark.driver.memory","16g")
                    
// NOTE(review): the cells below use a lowercase `spark` that this file never defines
// (presumably supplied by the notebook kernel); this capitalised `Spark` is not referenced
// by any of the visible cells.
val Spark = new SparkContext(conf)

conf = org.apache.spark.SparkConf@51870c49
Spark = org.apache.spark.SparkContext@1450ce70


org.apache.spark.SparkContext@1450ce70

In [None]:
// Load the online-retail CSV with the header and inferSchema options set.
val df = spark.read
  .format("csv")
  .options(Map("header" -> "True", "inferSchema" -> "true"))
  .load("Spark-The-Definitive-Guide/data/retail-data/all/online-retail-dataset.csv")

In [None]:
// Print up to five untruncated rows of invoice 536365, keeping only InvoiceNo and Description.
df.where(col("InvoiceNo") === 536365)
  .select("InvoiceNo", "Description")
  .show(5, truncate = false)

In [None]:
// Reusable predicates: a price threshold and a description substring match.
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")

// Rows whose StockCode is "DOT" and that satisfy at least one of the two predicates.
df.filter(col("StockCode").isin("DOT"))
  .filter(priceFilter || descripFilter)
  .show(5)

In [None]:
 // Derive a boolean column from combined predicates, then keep only the rows where it is true.
val DOTCodeFilter = col("StockCode") === "DOT"
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
// NOTE(review): all three predicates are AND-ed here, whereas the preceding cell OR-s the
// price and description predicates — confirm which combination is intended.
df.withColumn("isExpensive", DOTCodeFilter && (priceFilter && descripFilter))
  .filter("isExpensive")
  // .select("unitPrice", "isExpensive")
  .show(5)

## Working with numerical values

In [None]:
// Squared line total plus 5. This is the Column-API counterpart of the SQL expression
// "(POWER((Quantity * UnitPrice), 2.0) + 5)" that the selectExpr cell below also
// aliases as realQuantity, so both versions now compute the same value.
val fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity"))
  .show(2)

In [None]:
// Bare reference to the Column built in the previous cell; presumably left so the
// notebook echoes the expression.
fabricatedQuantity

In [None]:
// SQL-expression form: CustomerId plus the squared line total offset by 5.
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity")
  .show(numRows = 2)

In [None]:
// Correlation between the Quantity and UnitPrice columns.
df.select(corr(col("Quantity"), col("UnitPrice")))
  .show()

In [None]:
// Summary statistics for Quantity and UnitPrice only. Passing the column names to
// describe avoids computing statistics for every column just to discard most of them;
// the result still has the columns summary, Quantity and UnitPrice.
df.describe("Quantity", "UnitPrice").show()

In [None]:
// More stats functions: approximate quantiles of one column.
// colName is now actually passed to approxQuantile instead of repeating the literal.
val colName = "UnitPrice"
val quantileProbs = Array(0.25, 0.5, 0.75, 0.95)
val relError = 0.05
df.stat.approxQuantile(colName, quantileProbs, relError)

In [None]:
// Frequent items for the StockCode and Quantity columns, printed as a DataFrame.
df.stat.freqItems(Seq("StockCode", "Quantity")).show()

## Working with Strings

In [None]:
// import org.apache.spark.sql.functions.{initcap}
// Apply initcap to Description and print two untruncated rows.
df.select(initcap(col("Description")))
  .show(2, truncate = false)

In [None]:
// Show Description next to its lower-cased and re-upper-cased forms.
{
  val description = col("Description")
  df.select(description, lower(description), upper(lower(description)))
    .show(5)
}

In [None]:
// Adding or removing whitespace around a string is done with
// lpad, ltrim, rpad, rtrim and trim.
{
  val hello = lit("HELLO")
  df.select(
      ltrim(lit(" HELLO ")).alias("ltrim"),
      rtrim(lit(" HELLO    ")).alias("rtrim"),
      trim(lit("   HELLO    ")).alias("trim"),
      lpad(hello, 3, " ").alias("lp"), // target length 3 is shorter than the 5-character input
      rpad(hello, 10, " ").alias("rp"))
    .show(2)
}

### Regular Expressions

In [None]:
import org.apache.spark.sql.functions.regexp_replace

// Upper-case every colour name and join them with "|", which means OR in regex syntax.
val simpleColors = Seq("black", "white", "red", "green", "blue")
val regexString = simpleColors.map(color => color.toUpperCase).mkString("|")

In [None]:
// Replace every match of regexString in Description with the literal "COLOR".
{
  val description = col("Description")
  df.select(regexp_replace(description, regexString, "COLOR").as("color_cleaned"), description)
    .show(2)
}

In [None]:
// Character-level substitution on Description using the "LEET" -> "1337" mapping strings.
{
  val description = col("Description")
  df.select(translate(description, "LEET", "1337"), description)
    .show(2)
}

In [None]:
import org.apache.spark.sql.functions.regexp_extract
// Same alternation as before, wrapped in parentheses so it forms capture group 1.
val regexString = s"(${simpleColors.map(_.toUpperCase).mkString("|")})"
// Extract group 1 (the matched colour) from Description.
{
  val description = col("Description")
  df.select(regexp_extract(description, regexString, 1).as("color_cleaned"), description)
    .show(2)
}

In [None]:
// Substring predicates on the Description column. Both use the same column-name casing
// so they keep resolving if spark.sql.caseSensitive is enabled.
val containsBlack = col("Description").contains("BLACK")
val containsWhite = col("Description").contains("WHITE")

In [None]:
// Flag rows whose description mentions BLACK or WHITE, keep only flagged rows,
// and print three of them untruncated.
df.withColumn("hasSimpleColor", containsBlack || containsWhite)
  .where("hasSimpleColor")
  .select("Description", "hasSimpleColor")
  .show(3, truncate = false)

In [None]:
val simpleColors = Seq("black", "white", "red", "green", "blue")
// One boolean column per colour (is_black, is_white, ...), followed by every original column.
val selectedColumns = {
  val colorFlags =
    for (color <- simpleColors)
      yield col("Description").contains(color.toUpperCase).alias(s"is_$color")
  colorFlags :+ expr("*") // "*" is appended after the generated flags
}

In [None]:
// Keep descriptions flagged as white or red and print three of them untruncated.
df.select(selectedColumns: _*)
  .filter(col("is_white") || col("is_red"))
  .select("Description")
  .show(3, truncate = false)

## Dates and timestamps

In [None]:
// Print the schema of df.
df.printSchema()

In [None]:
// Ten rows carrying the current date ("today") and the current timestamp ("now"),
// also exposed to SQL as the temp view "dateTable".
val dateDF = spark.range(10)
  .select(col("*"), current_date().as("today"), current_timestamp().as("now"))
dateDF.createOrReplaceTempView("dateTable")

In [None]:
// Print the schema of dateDF.
dateDF.printSchema()

In [None]:
// Show today next to the dates five days before and five days after it.
{
  val today = col("today")
  dateDF.select(today, date_sub(today, 5), date_add(today, 5))
    .show(1)
}

In [None]:
// Day difference between "week_ago" (today minus 7 days) and today.
// NOTE(review): the earlier date is passed first here — confirm against the
// datediff(end, start) signature that the resulting sign is the one intended.
{
  val withWeekAgo = dateDF.withColumn("week_ago", date_sub(col("today"), 7))
  withWeekAgo.select(datediff(col("week_ago"), col("today")))
    .show(1)
}
// Month difference between two fixed dates parsed with to_date.
// NOTE(review): same argument-order question applies to months_between(end, start).
{
  val startDate = to_date(lit("2017-06-25")).alias("start")
  val endDate = to_date(lit("2018-11-21")).alias("end")
  dateDF.select(startDate, endDate)
    .select(months_between(col("start"), col("end")))
    .show(1)
}

In [None]:
// WARNING: Spark coerces date strings, and whenever it cannot parse one it puts NULL
// in the result instead of failing (see the first column, built from "2016-20-12").
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(5)

In [None]:
// Pattern is year-day-month, so "2017-12-11" is read as day 12 of month 11.
val dateFormat = "yyyy-dd-MM"
// Bind the DataFrame itself; show() returns Unit, so chaining it into the assignment
// would leave cleanDateDF unusable.
val cleanDateDF = spark.range(1)
  .select(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp"))
cleanDateDF.show()

## Working with NULLs

In [None]:
// Replace nulls column by column: each key is a column name, each value its replacement.
// The values have different types, hence the explicit Map[String, Any].
val fillColValues: Map[String, Any] = Map("StockCode" -> 5, "Description" -> "No Value")
df.na.fill(fillColValues)

In [None]:
// In the Description column, replace empty strings with "UNKNOWN".
df.na.replace("Description", Map("" -> "UNKNOWN"))

## Working with complex types

In [None]:
//  structs

In [None]:
// import org.apache.spark.sql.functions.struct
// Pack Description and InvoiceNo into a single struct column named "complex"
// and expose the result to SQL as the temp view "complexDF".
val complexDF = df.select(struct(col("Description"), col("InvoiceNo")).as("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [None]:
// Print complexDF with show's default row count.
complexDF.show()

In [None]:
// Split Description on single spaces into an array column and print two rows.
df.select(split(col("Description"), " ")).show(2)

In [None]:
// Split a string column of the DataFrame into an array, then select one element
// (index 2) from that array.
df.select(split(col("Description"), " ").alias("array_col"))
.selectExpr("array_col[2]")
.show(2)

In [None]:
// For each row, check whether the space-split Description contains the element "WHITE".
df.select(array_contains(split(col("Description"), " "), "WHITE")).show()

In [None]:
// explode

// The explode function takes a column that consists of arrays and creates one
// row (with the rest of the values duplicated) per value in the array. The
// following figure illustrates the process.

In [None]:
// Split Description into words ("splitted"), then emit one row per word ("exploded").
{
  val words = split(col("Description"), " ")
  df.withColumn("splitted", words)
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "splitted", "exploded")
    .show()
}

In [None]:
// maps
// Build a map column keyed by Description with InvoiceNo as the value, aliased "complex_map".

df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map")).show()

In [None]:
// DataFrame built from spark.range(1) with a single string column, jsonString,
// holding a JSON literal (an object whose nested value is the array [1, 2, 3]).
val jsonDF = spark.range(1)
.selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")

In [None]:
// Query the JSON string two ways: a JSON-path lookup of element [1] of the nested
// array, and json_tuple extraction of the top-level "myJSONKey" field.
{
  val json = col("jsonString")
  jsonDF
    .select(get_json_object(json, "$.myJSONKey.myJSONValue[1]"), json_tuple(json, "myJSONKey"))
    .show()
}

## User-defined functions

In [None]:
// Small DataFrame built from spark.range(5) with its single column renamed to "num".
val udfExampleDF = spark.range(5).toDF("num")

In [None]:
/** Returns `number` raised to the third power.
  *
  * @param number the value to cube
  * @return `number * number * number`
  */
def power3(number: Double): Double = number * number * number
// Quick check of the plain Scala function: 2.0 * 2.0 * 2.0.
power3(2.0)

In [None]:
// Wrap power3 as a Spark UDF that takes a Double and returns a Double.
val power3udf = udf((number: Double) => power3(number))

In [None]:
// Apply the power3udf UDF to the Quantity column of df and print the result.
df.select(power3udf(col("Quantity"))).show()

In [None]:
// Register the cube function under the SQL name "power3" so it can be called from
// expression strings, then use it on the "num" column.
spark.udf.register("power3", (number: Double) => power3(number))
udfExampleDF.selectExpr("power3(num)")
  .show()