# Apache Spark

## Import PySpark

In [1]:
import findspark
findspark.init()

import pyspark

## Initiate Spark Session with YARN Mode

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

conf=SparkConf()
conf.set("spark.driver.memory",      "1g") 
conf.set("spark.executor.memory",    "1g")
conf.set("spark.executor.instances", "3" )
conf.set("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.1.2")

spark = SparkSession.builder.master("yarn").appName("Spark BDP Example 1").enableHiveSupport().config(conf=conf).getOrCreate()

:: loading settings :: url = jar:file:/home/bigdatapedia/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/bigdatapedia/.ivy2/cache
The jars for the packages stored in: /home/bigdatapedia/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1eafdca0-894e-4c23-baf3-313e2a3fe414;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.1.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
:: resolution report :: resolve 600ms :: artifacts dl 6ms
	:: modules in use:
	org.apache.spark#spark-avro_2.12;3.1.2 from central in [default]
	org.spark-project.spark#unused;1.0.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------

In [3]:
spark

## Initiate Spark Session with Local Mode

In [None]:
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local").appName("Spark BDP Example").enableHiveSupport().config(conf=conf).getOrCreate()

In [6]:
spark

### Load Sample data 

In [4]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/parquet

In [5]:
!hdfs dfs -put /home/bigdatapedia/data/customer_parq.parquet /user/bigdatapedia/input/parquet/

In [6]:
!hdfs dfs -ls /user/bigdatapedia/input/parquet

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup     254648 2023-07-23 02:16 /user/bigdatapedia/input/parquet/customer_parq.parquet


In [4]:
df_cust = spark.read.parquet("/user/bigdatapedia/input/parquet")

                                                                                

In [9]:
df_cust.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)



In [23]:
from pyspark.sql import functions as F


df = df_cust.select("customer_id", "customer_fname", "customer_lname", "customer_city", "customer_state", F.current_timestamp().alias("cre_ts"))

In [24]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)



In [25]:
df.show(5, 0)

+-----------+--------------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+--------------+-------------+--------------+-----------------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:24:35.581|
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:24:35.581|
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:24:35.581|
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:24:35.581|
|5          |Robert        |Hudson        |Caguas       |PR            |2023-07-23 02:24:35.581|
+-----------+--------------+--------------+-------------+--------------+-----------------------+
only showing top 5 rows



### Actions

#### 1) show

In [40]:
df.show(5, truncate=True)

+-----------+--------------+--------------+-------------+--------------+--------------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|              cre_ts|
+-----------+--------------+--------------+-------------+--------------+--------------------+
|          1|       Richard|     Hernandez|  Brownsville|            TX|2023-07-23 02:34:...|
|          2|          Mary|       Barrett|    Littleton|            CO|2023-07-23 02:34:...|
|          3|           Ann|         Smith|       Caguas|            PR|2023-07-23 02:34:...|
|          4|          Mary|         Jones|   San Marcos|            CA|2023-07-23 02:34:...|
|          5|        Robert|        Hudson|       Caguas|            PR|2023-07-23 02:34:...|
+-----------+--------------+--------------+-------------+--------------+--------------------+
only showing top 5 rows



In [41]:
df.show(5, truncate=False)

+-----------+--------------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+--------------+-------------+--------------+-----------------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:35:02.768|
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:35:02.768|
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:35:02.768|
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:35:02.768|
|5          |Robert        |Hudson        |Caguas       |PR            |2023-07-23 02:35:02.768|
+-----------+--------------+--------------+-------------+--------------+-----------------------+
only showing top 5 rows



#### 2) Collect

In [42]:
df.limit(5).collect()

[Row(customer_id=1, customer_fname='Richard', customer_lname='Hernandez', customer_city='Brownsville', customer_state='TX', cre_ts=datetime.datetime(2023, 7, 23, 2, 35, 28, 560000)),
 Row(customer_id=2, customer_fname='Mary', customer_lname='Barrett', customer_city='Littleton', customer_state='CO', cre_ts=datetime.datetime(2023, 7, 23, 2, 35, 28, 560000)),
 Row(customer_id=3, customer_fname='Ann', customer_lname='Smith', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 35, 28, 560000)),
 Row(customer_id=4, customer_fname='Mary', customer_lname='Jones', customer_city='San Marcos', customer_state='CA', cre_ts=datetime.datetime(2023, 7, 23, 2, 35, 28, 560000)),
 Row(customer_id=5, customer_fname='Robert', customer_lname='Hudson', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 35, 28, 560000))]

#### 3) count

In [43]:
df.count()

12435

#### 4) head

In [44]:
df.head()

Row(customer_id=1, customer_fname='Richard', customer_lname='Hernandez', customer_city='Brownsville', customer_state='TX', cre_ts=datetime.datetime(2023, 7, 23, 2, 37, 10, 683000))

#### 5) tail

In [46]:
df.tail(4)

[Row(customer_id=12432, customer_fname='Angela', customer_lname='Smith', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 36, 25, 933000)),
 Row(customer_id=12433, customer_fname='Benjamin', customer_lname='Garcia', customer_city='Levittown', customer_state='NY', cre_ts=datetime.datetime(2023, 7, 23, 2, 36, 25, 933000)),
 Row(customer_id=12434, customer_fname='Mary', customer_lname='Mills', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 36, 25, 933000)),
 Row(customer_id=12435, customer_fname='Laura', customer_lname='Horton', customer_city='Summerville', customer_state='SC', cre_ts=datetime.datetime(2023, 7, 23, 2, 36, 25, 933000))]

#### 6) take

In [50]:
df.take(5)

[Row(customer_id=1, customer_fname='Richard', customer_lname='Hernandez', customer_city='Brownsville', customer_state='TX', cre_ts=datetime.datetime(2023, 7, 23, 2, 39, 30, 391000)),
 Row(customer_id=2, customer_fname='Mary', customer_lname='Barrett', customer_city='Littleton', customer_state='CO', cre_ts=datetime.datetime(2023, 7, 23, 2, 39, 30, 391000)),
 Row(customer_id=3, customer_fname='Ann', customer_lname='Smith', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 39, 30, 391000)),
 Row(customer_id=4, customer_fname='Mary', customer_lname='Jones', customer_city='San Marcos', customer_state='CA', cre_ts=datetime.datetime(2023, 7, 23, 2, 39, 30, 391000)),
 Row(customer_id=5, customer_fname='Robert', customer_lname='Hudson', customer_city='Caguas', customer_state='PR', cre_ts=datetime.datetime(2023, 7, 23, 2, 39, 30, 391000))]

#### 7) all write options
    (like parquet, ORC, json, etc...)

### Dataframe Reader

#### 1) Parquet

In [50]:
df_cust = spark.read.parquet("/user/bigdatapedia/input/parquet")

In [51]:
df_cust.show(5)

[Stage 20:>                                                         (0 + 1) / 1]

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|
|          3|           Ann|         Smith|     XXXXXXXXX|        XXXXXXXXX|3422 Blue Pioneer...|       Caguas|            PR|           00725|
|          4|          Mary|         Jones|     XXXXXXXXX|        XXXXXXXXX|  8324 Little Common|   San Marcos|            CA|          

                                                                                

#### 2) ORC

In [53]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/orc

In [54]:
!hdfs dfs -put /home/bigdatapedia/data/new_orders.snappy.orc /user/bigdatapedia/input/orc/

In [55]:
!hdfs dfs -ls /user/bigdatapedia/input/orc

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup     185892 2023-07-23 02:42 /user/bigdatapedia/input/orc/new_orders.snappy.orc


In [56]:
df_order = spark.read.orc("/user/bigdatapedia/input/orc")

In [59]:
df_order.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [58]:
df_order.show(5,0)

+--------+-------------------+-----------------+---------------+
|order_id|order_date         |order_customer_id|order_status   |
+--------+-------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00|8827             |CLOSED         |
|5       |2013-07-25 00:00:00|11318            |COMPLETE       |
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



#### 3) Avro

In [60]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/avro

In [61]:
!hdfs dfs -put /home/bigdatapedia/data/customers_avro.avro /user/bigdatapedia/input/avro/

In [62]:
!hdfs dfs -ls /user/bigdatapedia/input/avro

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup    1032497 2023-07-23 02:44 /user/bigdatapedia/input/avro/customers_avro.avro


In [4]:
df_cust_avro = spark.read.format("avro").load("/user/bigdatapedia/input/avro")

In [6]:
df_cust_avro.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)



In [5]:
df_cust_avro.show(5, 0)

[Stage 0:>                                                          (0 + 1) / 1]

+-----------+--------------+--------------+--------------+-----------------+-----------------------+-------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|customer_street        |customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+-----------------------+-------------+--------------+----------------+
|1          |Richard       |Hernandez     |XXXXXXXXX     |XXXXXXXXX        |6303 Heather Plaza     |Brownsville  |TX            |78521           |
|2          |Mary          |Barrett       |XXXXXXXXX     |XXXXXXXXX        |9526 Noble Embers Ridge|Littleton    |CO            |80126           |
|3          |Ann           |Smith         |XXXXXXXXX     |XXXXXXXXX        |3422 Blue Pioneer Bend |Caguas       |PR            |00725           |
|4          |Mary          |Jones         |XXXXXXXXX     |XXXXXXXXX        |8324 Little Common     |San Marcos   |CA  

                                                                                

#### 4) Json

In [7]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/json

In [8]:
!hdfs dfs -put /home/bigdatapedia/data/complex_JS.json /user/bigdatapedia/input/json/

In [9]:
!hdfs dfs -ls /user/bigdatapedia/input/json

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup        681 2023-07-23 02:54 /user/bigdatapedia/input/json/complex_JS.json


In [13]:
df_json = spark.read.json("/user/bigdatapedia/input/json", multiLine=True)

In [14]:
df_json.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- MainId: long (nullable = true)
 |    |    |-- categories: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- CategoryID: long (nullable = true)
 |    |    |    |    |-- CategoryName: string (nullable = true)
 |    |    |    |    |-- categoryFunction: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- Function1: string (nullable = true)
 |    |    |    |    |    |    |-- Function2: string (nullable = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)
 |    |    |-- location: string (nullable = true)
 |    |    |-- middleName: string (nullable = true)
 |-- messages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- success: boolean (nullable = true)



In [15]:
df_json.show(5,0)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-------+
|data                                                                                                                                                                                                      |messages|success|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-------+
|[{1111, [{1, Example, null}], Sherlock, Homes, null, Homes}, {122, [{2, Example2, [{FuntionData1, FuntionData2}]}], James, Watson, null, null}, {123, [{2, Example2, null}], James, null, NewJersy, null}]|[]      |true   |
+---------------------------------------------------------------------------------------------------------------

In [20]:
df_json.select(df_json.data.MainId.alias("sno"),
              df_json.data.firstName.alias("fname")).show()

+----------------+--------------------+
|             sno|               fname|
+----------------+--------------------+
|[1111, 122, 123]|[Sherlock, James,...|
+----------------+--------------------+



In [26]:
df_json.select(F.explode(df_json.data.MainId).alias("sno"), df_json.data.firstName.alias("fname")).show(5, 0)

+----+------------------------+
|sno |fname                   |
+----+------------------------+
|1111|[Sherlock, James, James]|
|122 |[Sherlock, James, James]|
|123 |[Sherlock, James, James]|
+----+------------------------+



#### 5) csv

In [27]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/csv

In [28]:
!hdfs dfs -put /home/bigdatapedia/data/sample_de30_patient.txt /user/bigdatapedia/input/csv/

In [29]:
!hdfs dfs -ls /user/bigdatapedia/input/csv

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup        298 2023-07-23 03:08 /user/bigdatapedia/input/csv/sample_de30_patient.txt


In [30]:
df_csv = spark.read.csv("/user/bigdatapedia/input/csv", 
                        header = True, 
                        inferSchema=True)

                                                                                

In [31]:
df_csv.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- drug: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)



In [32]:
df_csv.show(5, 0)

+---+---------+-----------+------+---+
|id |name     |drug       |gender|age|
+---+---------+-----------+------+---+
|51 |senthil  |paracetamol|male  |12 |
|32 |saravanan|avil       |male  |44 |
|33 |rajesh   |metacin    |male  |26 |
|34 |usha     |paracetamol|female|20 |
|25 |alex     |paracetamol|male  |48 |
+---+---------+-----------+------+---+
only showing top 5 rows



#### 6) Hive

In [41]:
spark.sql("set hive.enable.vectorization=true;")

df_hive = spark.sql("""
                        select * from de30t.demo
                    """)

23/07/23 03:19:01 WARN command.SetCommand: 'SET hive.enable.vectorization=true' might not work, since Spark doesn't support changing the Hive config dynamically. Please pass the Hive-specific config by adding the prefix spark.hadoop (e.g. spark.hadoop.hive.enable.vectorization) when starting a Spark application. For details, see the link: https://spark.apache.org/docs/latest/configuration.html#dynamically-loading-spark-properties.


In [42]:
df_hive.show(5, 0)

+---+----+
|id |name|
+---+----+
|1  |a   |
|2  |b   |
+---+----+



### Dataframe Writer

#### 1) Parquet

In [52]:
df_cust.write.parquet("/user/bigdatapedia/output/parquet")

                                                                                

#### 2) ORC

In [54]:
df_cust.write.orc("/user/bigdatapedia/output/orc")

                                                                                

#### 3) Avro

In [5]:
df_cust.write.format("avro").save("/user/bigdatapedia/output/avro1")

                                                                                

### Functions

In [114]:
from pyspark.sql.functions import lit, current_timestamp, when, window

from pyspark.sql import functions as F


#### 1) Case when

In [115]:
df_select.show(5,0)

+-----------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+-------------+--------------+-----------------------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 03:27:55.995|
|2          |Mary          |Littleton    |CO            |2023-07-22 03:27:55.995|
|3          |Ann           |Caguas       |PR            |2023-07-22 03:27:55.995|
|4          |Mary          |San Marcos   |CA            |2023-07-22 03:27:55.995|
|5          |Robert        |Caguas       |PR            |2023-07-22 03:27:55.995|
+-----------+--------------+-------------+--------------+-----------------------+
only showing top 5 rows



In [120]:
df_casewhen = df_select.select("*", when(df_select.customer_state == 'TX', 
                                         "Texas").otherwise("Non Texas").alias("New_State"))

In [121]:
df_casewhen.show(5,0)

+-----------+--------------+-------------+--------------+-----------------------+---------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |New_State|
+-----------+--------------+-------------+--------------+-----------------------+---------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 03:31:07.668|Texas    |
|2          |Mary          |Littleton    |CO            |2023-07-22 03:31:07.668|Non Texas|
|3          |Ann           |Caguas       |PR            |2023-07-22 03:31:07.668|Non Texas|
|4          |Mary          |San Marcos   |CA            |2023-07-22 03:31:07.668|Non Texas|
|5          |Robert        |Caguas       |PR            |2023-07-22 03:31:07.668|Non Texas|
+-----------+--------------+-------------+--------------+-----------------------+---------+
only showing top 5 rows



#### 2) Window

In [128]:
from pyspark.sql.functions import row_number, rank, dense_rank

from pyspark.sql.window import Window

In [140]:
WindowSpec = Window.partitionBy("customer_city").orderBy("customer_city")

In [141]:
df_rank = df_select.withColumn("Ranking", rank().over(WindowSpec))

In [142]:
df_rank.show(5, 0)

+-----------+--------------+-------------+--------------+-----------------------+-------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |Ranking|
+-----------+--------------+-------------+--------------+-----------------------+-------+
|147        |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|2544       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|2705       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|4650       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|6108       |Emma          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
+-----------+--------------+-------------+--------------+-----------------------+-------+
only showing top 5 rows



#### 3) Split

In [26]:
df_cast = df.selectExpr("*", 'cast(cre_ts as string) as sys_ts')

df_cast.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)
 |-- sys_ts: string (nullable = false)



In [27]:
df_cast.show(5,0)

+-----------+--------------+--------------+-------------+--------------+-----------------------+-----------------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                 |sys_ts                 |
+-----------+--------------+--------------+-------------+--------------+-----------------------+-----------------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:24:39.585|2023-07-23 02:24:39.585|
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:24:39.585|2023-07-23 02:24:39.585|
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:24:39.585|2023-07-23 02:24:39.585|
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:24:39.585|2023-07-23 02:24:39.585|
|5          |Robert        |Hudson        |Caguas       |PR            |2023-07-23 02:24:39.585|2023-07-23 02:24:39.585|
+-----------+--------------+----

In [28]:
df_split = df_cast.select("*", F.split("sys_ts", " ")[0].alias("Date_Str"), F.split("sys_ts", " ")[1].alias("TS_Str"))

df_split.show(5,0)

+-----------+--------------+--------------+-------------+--------------+----------------------+----------------------+----------+-----------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                |sys_ts                |Date_Str  |TS_Str     |
+-----------+--------------+--------------+-------------+--------------+----------------------+----------------------+----------+-----------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:24:40.18|2023-07-23 02:24:40.18|2023-07-23|02:24:40.18|
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:24:40.18|2023-07-23 02:24:40.18|2023-07-23|02:24:40.18|
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:24:40.18|2023-07-23 02:24:40.18|2023-07-23|02:24:40.18|
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:24:40.18|2023-07-23 02:24:40.18|2023-07-23|02:24:40.18|
|5    

#### 4) Concat

In [32]:
df_concat = df_cast.select("*", F.concat("customer_fname", F.lit(" ") ,"customer_lname").alias("customer_full"))
df_concat.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)
 |-- sys_ts: string (nullable = false)
 |-- customer_full: string (nullable = true)



In [33]:
df_concat.show(5,0)

+-----------+--------------+--------------+-------------+--------------+----------------------+----------------------+-----------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                |sys_ts                |customer_full    |
+-----------+--------------+--------------+-------------+--------------+----------------------+----------------------+-----------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:26:31.75|2023-07-23 02:26:31.75|Richard Hernandez|
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:26:31.75|2023-07-23 02:26:31.75|Mary Barrett     |
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:26:31.75|2023-07-23 02:26:31.75|Ann Smith        |
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:26:31.75|2023-07-23 02:26:31.75|Mary Jones       |
|5          |Robert        |Hudson       

In [37]:
df_concat = df_cast.select("*", F.concat("customer_fname", F.coalesce(F.lit(" "), F.lit("Null") ,"customer_lname")).alias("customer_full"))
df_concat.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)
 |-- sys_ts: string (nullable = false)
 |-- customer_full: string (nullable = true)



#### 5) Month

In [38]:
df_month = df.select("*",F.month("cre_ts").alias("Month"))
                     
df_month.show(5,0)             

+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                 |Month|
+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:31:19.205|7    |
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:31:19.205|7    |
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:31:19.205|7    |
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:31:19.205|7    |
|5          |Robert        |Hudson        |Caguas       |PR            |2023-07-23 02:31:19.205|7    |
+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
only showing top 5 rows



In [39]:
df_year = df.select("*",F.year("cre_ts").alias("Month"))
                     
df_year.show(5,0)             

+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|cre_ts                 |Month|
+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2023-07-23 02:31:38.435|2023 |
|2          |Mary          |Barrett       |Littleton    |CO            |2023-07-23 02:31:38.435|2023 |
|3          |Ann           |Smith         |Caguas       |PR            |2023-07-23 02:31:38.435|2023 |
|4          |Mary          |Jones         |San Marcos   |CA            |2023-07-23 02:31:38.435|2023 |
|5          |Robert        |Hudson        |Caguas       |PR            |2023-07-23 02:31:38.435|2023 |
+-----------+--------------+--------------+-------------+--------------+-----------------------+-----+
only showing top 5 rows



#### 6) Explode

In [19]:
df_json.select(F.explode(df_json.data.MainId).alias("sno")).show()

[Stage 5:>                                                          (0 + 1) / 1]

+----+
| sno|
+----+
|1111|
| 122|
| 123|
+----+



                                                                                