In [1]:
# Initialise findspark before pyspark is imported.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a session named "001" on the "local" master, with the
# SQL warehouse directory set explicitly.
spark = (
    SparkSession.builder
    .appName("001")
    .master("local")
    .config("spark.sql.warehouse.dir", "file:///C:/tmp/hive")
    .getOrCreate()
)

# Set the number of shuffle partitions to 5 for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")
print("SparkSession created with name as 'spark'")

SparkSession created with name as 'spark'


In [2]:
# Load one day of the retail data set from CSV, with the "header" and
# "inferSchema" reader options switched on.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../../../data/SparkTheDefinitiveGuide/retail-data/by-day/2010-12-01.csv")
)
df.printSchema()

# Register the DataFrame under the name "dfTable" so SQL can reference it.
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



### Structs

#### DataFrames within DataFrames

In [5]:
from pyspark.sql.functions import struct

# Three ways to build a struct column called "complex" from
# Description and InvoiceNo.

# 1) SQL expression: parenthesised column list, alongside all original columns.
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2)

# 2) SQL expression: explicit struct(...) call, alongside all original columns.
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2)

# 3) DataFrame API: only the struct column is selected here.
df.select(
    struct("Description", "InvoiceNo").alias("complex")
).show(2)

+--------------------+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|[WHITE HANGING HE...|   536365|   85123A|WHITE HANGING HEA...|       6|01-12-2010 08:26|     2.55|     17850|United Kingdom|
|          [, 536365]|   536365|    71053|                null|       6|01-12-2010 08:26|     3.39|     17850|United Kingdom|
+--------------------+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
only showing top 2 rows

+--------------------+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|             complex|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|Unit

### Array

In [19]:
from pyspark.sql.functions import col, split

# Split every Description on a single space and preview two rows.
df.select(
    split(col("Description"), " ")
).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [KNITTED, UNION, ...|
+-------------------------+
only showing top 2 rows



In [20]:
# Contains: array_contains tests whether an array holds a given element; instr offers a comparable substring check on a plain string column
from pyspark.sql.functions import array_contains

# For each row, report whether the space-split Description contains "WHITE".
df.select(
    array_contains(split(col("Description"), " "), "WHITE")
).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                           false|
+------------------------------------------------+
only showing top 2 rows



#### Explode 

In [21]:
from pyspark.sql.functions import explode, split

# Add the split array as "splitted", then explode it into "exploded",
# and show the source columns next to both derived columns.
(
    df
    .withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "splitted", "exploded")
    .show(2)
)

+--------------------+---------+--------------------+--------+
|         Description|InvoiceNo|            splitted|exploded|
+--------------------+---------+--------------------+--------+
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...|   WHITE|
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...| HANGING|
+--------------------+---------+--------------------+--------+
only showing top 2 rows



#### Maps

In [22]:
from pyspark.sql.functions import create_map

# Build a map column "complex_map" from Description and InvoiceNo.
df.select(
    create_map(col("Description"), col("InvoiceNo")).alias("complex_map")
).show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[KNITTED UNION FL...|
+--------------------+
only showing top 2 rows



In [None]:
# Access a map value by its key.
# The map column must be built with pyspark's create_map; Python's builtin
# map() is not a Spark function and fails on Column arguments.
from pyspark.sql.functions import col, create_map, split

(
    df
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    # Look up the entry keyed by 'WHITE METAL LANTERN' in each row's map.
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(2)
)

In [None]:
# Explode the map column into rows.
# create_map (not Python's builtin map) is required to build the map column;
# it is imported here so the cell does not depend on an earlier cell's import.
from pyspark.sql.functions import col, create_map

(
    df
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .selectExpr("explode(complex_map)")
    .show(2)
)

### JSON

In [41]:
# One-row DataFrame whose single column "jsonString" holds a JSON literal.
jsonDF = (
    spark.range(1)
    .selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")
)

In [51]:
from pyspark.sql.functions import get_json_object, json_tuple

# Query the JSON string two ways:
#   "Column" - get_json_object with the path $.myJSONKey.myJSONValue[1]
#   "Tuple"  - json_tuple for the top-level key "myJSONKey"
jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias('Column'),
    json_tuple(col("jsonString"), "myJSONKey").alias('Tuple'),
).show(2)

+------+--------------------+
|Column|               Tuple|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



In [57]:
from pyspark.sql.functions import to_json

# to_json: turn a struct column into a JSON string.
# First pack InvoiceNo and Description into the struct "myStruct",
# then serialise that struct.
(
    df
    .selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")))
    .show(2)
)

DataFrame[myStruct: struct<InvoiceNo:string,Description:string>]

In [58]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

# from_json: parse a JSON string back into a struct using an explicit schema.
# The schema has two nullable string fields, matching the struct serialised below.
parseSchema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True),
))

# Round trip: struct -> JSON string ("newJSON") -> struct parsed with parseSchema,
# shown next to the JSON it was parsed from.
(
    df
    .selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")).alias("newJSON"))
    .select(from_json(col("newJSON"), parseSchema), col("newJSON"))
    .show(2)
)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|[536365, WHITE HA...|{"InvoiceNo":"536...|
|[536365, KNITTED ...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 2 rows



### UDF

In [60]:
# DataFrame built from spark.range(5), with its column renamed to "num".
udfExampleDF = (
    spark.range(5)
    .toDF("num")
)
def power3(double_value):
    """Return ``double_value`` raised to the third power."""
    return pow(double_value, 3)


# Call the plain Python function directly (no Spark involved yet).
power3(2.0)

8.0

### When you use the function, one of two different things happens, depending on the language it is written in.
If the function is written in Scala or Java, you can use it within the Java Virtual Machine (JVM). This means that there will be little performance penalty aside from the fact that you can’t take advantage of the code generation capabilities that Spark has for built-in functions. There can be performance issues if you create or use a lot of objects.

If the function is written in Python, something quite different happens. Spark starts a Python process on the worker, serializes all of the data to a format that Python can understand (remember, it was in the JVM earlier), executes the function row by row on that data in the Python process, and then finally returns the results of the row operations to the JVM and Spark.


### That's why a UDF (especially one written in Python) is not a good choice in Spark.

In [62]:
from pyspark.sql.functions import col, udf

# Wrap the plain Python function so it can be applied to DataFrame columns.
# NOTE(review): no returnType is passed, so Spark's default return type
# applies - confirm that is the intended type for the result column.
power3udf = udf(power3)

# Apply the UDF to the "num" column and preview two rows.
udfExampleDF.select(
    power3udf(col("num"))
).show(2)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows

