# Apache Spark

## Import PySpark

In [5]:
import findspark
findspark.init()

import pyspark

## Initiate Spark Session with YARN Mode

In [6]:
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

conf=SparkConf()
conf.set("spark.driver.memory",      "1g") 
conf.set("spark.executor.memory",    "1g")
conf.set("spark.executor.instances", "2" )

spark = SparkSession.builder.master("yarn").appName("Spark BDP Example").enableHiveSupport().config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
spark

## Initiate Spark Session with Local Mode

In [5]:
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local").appName("Spark BDP Example").enableHiveSupport().config(conf=conf).getOrCreate()

In [6]:
spark

## Dataframe

In [76]:
df.select("id", "name", "age").show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
| 51|  senthil| 12|
| 32|saravanan| 44|
| 33|   rajesh| 26|
| 34|     usha| 20|
| 25|     alex| 48|
| 16|    nasir| 37|
| 17|    singh| 15|
| 18| santhosh| 12|
| 19|    sarah| 14|
| 40|      raj| 27|
+---+---------+---+



In [80]:
df.rdd.toDebugString()

b'(2) MapPartitionsRDD[64] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[63] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  SQLExecutionRDD[62] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[61] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[50] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[49] at map at SerDeUtil.scala:69 []\n |  MapPartitionsRDD[48] at mapPartitions at SerDeUtil.scala:117 []\n |  PythonRDD[47] at RDD at PythonRDD.scala:53 []\n |  /user/bigdatapedia/spark_txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  /user/bigdatapedia/spark_txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

In [None]:
(2) MapPartitionsRDD[64] at javaToPython at NativeMethodAccessorImpl.java:0 []\n
|  MapPartitionsRDD[63] at javaToPython at NativeMethodAccessorImpl.java:0 []\n
|  SQLExecutionRDD[62] at javaToPython at NativeMethodAccessorImpl.java:0 []\n
|  MapPartitionsRDD[61] at javaToPython at NativeMethodAccessorImpl.java:0 []\n
|  MapPartitionsRDD[50] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0 []\n
|  MapPartitionsRDD[49] at map at SerDeUtil.scala:69 []\n
|  MapPartitionsRDD[48] at mapPartitions at SerDeUtil.scala:117 []\n
|  PythonRDD[47] at RDD at PythonRDD.scala:53 []\n
|  /user/bigdatapedia/spark_txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n
|  /user/bigdatapedia/spark_txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []

### Load Sample data 

In [1]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/parquet

In [2]:
!hdfs dfs -put /home/bigdatapedia/data/customer_parq.parquet /user/bigdatapedia/input/parquet/

In [3]:
!hdfs dfs -ls /user/bigdatapedia/input/parquet

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup     254648 2023-07-22 02:28 /user/bigdatapedia/input/parquet/customer_parq.parquet


### Transformations
    Narrow Transformation
    Wide Transformation (Shuffle)

In [8]:
df_cust = spark.read.parquet("/user/bigdatapedia/input/parquet")

                                                                                

In [9]:
df_cust

DataFrame[customer_id: int, customer_fname: string, customer_lname: string, customer_email: string, customer_password: string, customer_street: string, customer_city: string, customer_state: string, customer_zipcode: string]

In [10]:
from pyspark.sql.functions import lit, current_timestamp

#### 1) select 

In [34]:
df_select = df_cust.select("customer_id", "customer_fname", "customer_city", "customer_state", current_timestamp().alias("cre_ts"))

df_select.show(5, 0)

+-----------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+-------------+--------------+-----------------------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 02:42:20.099|
|2          |Mary          |Littleton    |CO            |2023-07-22 02:42:20.099|
|3          |Ann           |Caguas       |PR            |2023-07-22 02:42:20.099|
|4          |Mary          |San Marcos   |CA            |2023-07-22 02:42:20.099|
|5          |Robert        |Caguas       |PR            |2023-07-22 02:42:20.099|
+-----------+--------------+-------------+--------------+-----------------------+
only showing top 5 rows



In [79]:
df_select.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)



#### 2) withColumn

In [94]:
df_cust_wc = df_cust.withColumn("new_cust_id", df_cust["customer_id"] + 100)

In [95]:
df_cust_wc.show(10)

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-----------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|new_cust_id|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-----------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|        101|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|        102|
|          3|           Ann|         Smith|     XXXXXXXXX|        XXXXXXXXX|3422 Blue Pioneer...|       Caguas|            PR|           00725|        103|
|          4|          Mary|         Jones|     XXXXXXXXX|      

In [97]:
df_cust_wc_2 = df_cust.withColumn("newnull", lit(None))

In [98]:
df_cust_wc_2.show()

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|newnull|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|   null|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|   null|
|          3|           Ann|         Smith|     XXXXXXXXX|        XXXXXXXXX|3422 Blue Pioneer...|       Caguas|            PR|           00725|   null|
|          4|          Mary|         Jones|     XXXXXXXXX|        XXXXXXXXX|  8324 Littl

In [100]:
df_cust_wc_3 = df_cust.withColumn("cre_ts", current_timestamp())

In [104]:
df_cust_wc_3.select("customer_id", "customer_fname", "cre_ts").show(5, False)

+-----------+--------------+-----------------------+
|customer_id|customer_fname|cre_ts                 |
+-----------+--------------+-----------------------+
|1          |Richard       |2023-07-16 03:29:55.108|
|2          |Mary          |2023-07-16 03:29:55.108|
|3          |Ann           |2023-07-16 03:29:55.108|
|4          |Mary          |2023-07-16 03:29:55.108|
|5          |Robert        |2023-07-16 03:29:55.108|
+-----------+--------------+-----------------------+
only showing top 5 rows



#### 3) Filter

In [21]:
df_filter_onlyTx = df_cust.filter("customer_state == 'TX'")

In [22]:
df_filter_onlyTx.show(5, 0)

+-----------+--------------+--------------+--------------+-----------------+--------------------------+-------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|customer_street           |customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+--------------------------+-------------+--------------+----------------+
|1          |Richard       |Hernandez     |XXXXXXXXX     |XXXXXXXXX        |6303 Heather Plaza        |Brownsville  |TX            |78521           |
|12         |Christopher   |Smith         |XXXXXXXXX     |XXXXXXXXX        |5594 Jagged Embers By-pass|San Antonio  |TX            |78227           |
|29         |Mary          |Humphrey      |XXXXXXXXX     |XXXXXXXXX        |2469 Blue Brook Crossing  |Fort Worth   |TX            |76133           |
|82         |Jonathan      |Cook          |XXXXXXXXX     |XXXXXXXXX        |7885 Sleepy Cove        

In [23]:
df_filter_onlyTx.count()

635

In [39]:
df_filter = df_select.filter("customer_state in ('AR', 'TX') ")

In [25]:
df_filter.count()

647

#### 4) Order By *

In [40]:
df_order = df_filter.orderBy("customer_state", ascending=False)

In [41]:
df_order = df_filter.orderBy("customer_state", ascending=True)

In [42]:
df_order.show(20,0)

+-----------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+-------------+--------------+-----------------------+
|1947       |Mary          |Conway       |AR            |2023-07-22 02:43:20.519|
|1338       |Linda         |Conway       |AR            |2023-07-22 02:43:20.519|
|10039      |Mary          |Conway       |AR            |2023-07-22 02:43:20.519|
|12074      |Mary          |Conway       |AR            |2023-07-22 02:43:20.519|
|4561       |Scott         |Jonesboro    |AR            |2023-07-22 02:43:20.519|
|5026       |Christian     |Jonesboro    |AR            |2023-07-22 02:43:20.519|
|965        |Sean          |Jonesboro    |AR            |2023-07-22 02:43:20.519|
|5075       |Sharon        |Jonesboro    |AR            |2023-07-22 02:43:20.519|
|3971       |John          |Jonesboro    |AR            |2023-07-22 02:43:20.519|
|7189       |Pam

#### 5) Distinct *

In [56]:
df_distinct = df_order.select("customer_city", "customer_state").distinct()

In [57]:
df_distinct.count()

                                                                                

40

In [52]:
df_distinct = df_order.select("customer_state").distinct()

In [53]:
df_distinct.count()

                                                                                

2

In [54]:
df_distinct.show()

+--------------+
|customer_state|
+--------------+
|            TX|
|            AR|
+--------------+



#### 6) withColumnRenamed

In [58]:
df_distinct.show()

+--------------------+--------------+
|       customer_city|customer_state|
+--------------------+--------------+
|          San Marcos|            TX|
|             Weslaco|            TX|
|          Round Rock|            TX|
|       Grand Prairie|            TX|
|          San Benito|            TX|
|                Katy|            TX|
|          Richardson|            TX|
|           Arlington|            TX|
|              Laredo|            TX|
|          Lewisville|            TX|
|       New Braunfels|            TX|
|              Dallas|            TX|
|            Edinburg|            TX|
|              Spring|            TX|
|              Austin|            TX|
|North Richland Hills|            TX|
|          Fort Worth|            TX|
|               Pharr|            TX|
|         San Antonio|            TX|
|             Baytown|            TX|
+--------------------+--------------+
only showing top 20 rows



In [59]:
df_wcr = df_distinct.withColumnRenamed("customer_city", "City")

In [60]:
df_wcr.show()

+--------------------+--------------+
|                City|customer_state|
+--------------------+--------------+
|          San Marcos|            TX|
|             Weslaco|            TX|
|          Round Rock|            TX|
|       Grand Prairie|            TX|
|          San Benito|            TX|
|                Katy|            TX|
|          Richardson|            TX|
|           Arlington|            TX|
|              Laredo|            TX|
|          Lewisville|            TX|
|       New Braunfels|            TX|
|              Dallas|            TX|
|            Edinburg|            TX|
|              Spring|            TX|
|              Austin|            TX|
|North Richland Hills|            TX|
|          Fort Worth|            TX|
|               Pharr|            TX|
|         San Antonio|            TX|
|             Baytown|            TX|
+--------------------+--------------+
only showing top 20 rows



#### 7) DropDuplicate *

In [61]:
df_order.count()

647

In [62]:
df_distinct.count()

                                                                                

40

In [69]:
df_1 = df_order.select("customer_city", "customer_state")

In [74]:
df_1.count()

647

In [72]:
df_dropDup = df_1.dropDuplicates()

In [73]:
df_dropDup.count()

                                                                                

40

#### 8) Drop Column

In [75]:
df_1.show(5)

+-------------+--------------+
|customer_city|customer_state|
+-------------+--------------+
|    Jonesboro|            AR|
|    Jonesboro|            AR|
|       Conway|            AR|
|       Conway|            AR|
|    Jonesboro|            AR|
+-------------+--------------+
only showing top 5 rows



In [76]:
df_drop = df_1.drop("customer_state")

In [77]:
df_drop.show(5)

+-------------+
|customer_city|
+-------------+
|    Jonesboro|
|    Jonesboro|
|       Conway|
|       Conway|
|    Jonesboro|
+-------------+
only showing top 5 rows



#### 9) SelectExpr

In [79]:
df_select.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)



In [80]:
df_select.show(3)

+-----------+--------------+-------------+--------------+--------------------+
|customer_id|customer_fname|customer_city|customer_state|              cre_ts|
+-----------+--------------+-------------+--------------+--------------------+
|          1|       Richard|  Brownsville|            TX|2023-07-22 03:03:...|
|          2|          Mary|    Littleton|            CO|2023-07-22 03:03:...|
|          3|           Ann|       Caguas|            PR|2023-07-22 03:03:...|
+-----------+--------------+-------------+--------------+--------------------+
only showing top 3 rows



In [96]:
df_cast = df_select.selectExpr("*", 'cast(cre_ts as string) as sys_ts')

In [99]:
df_cast.show(3, 0)

+-----------+--------------+-------------+--------------+-----------------------+-----------------------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |sys_ts                 |
+-----------+--------------+-------------+--------------+-----------------------+-----------------------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 03:10:17.674|2023-07-22 03:10:17.674|
|2          |Mary          |Littleton    |CO            |2023-07-22 03:10:17.674|2023-07-22 03:10:17.674|
|3          |Ann           |Caguas       |PR            |2023-07-22 03:10:17.674|2023-07-22 03:10:17.674|
+-----------+--------------+-------------+--------------+-----------------------+-----------------------+
only showing top 3 rows



In [98]:
df_cast.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- cre_ts: timestamp (nullable = false)
 |-- sys_ts: string (nullable = false)



#### 10) GroupBy *

In [100]:
df_groupby = df_select.groupby("customer_state").count()

In [101]:
type(df_groupby)

pyspark.sql.dataframe.DataFrame

In [105]:
df_groupby.show(5,0)

+--------------+-----+
|customer_state|count|
+--------------+-----+
|AZ            |213  |
|SC            |41   |
|LA            |63   |
|MN            |39   |
|NJ            |219  |
+--------------+-----+
only showing top 5 rows



In [107]:
df_groupby_2 = df_select.groupby("customer_state").agg({'customer_state':'count'})

In [108]:
df_groupby_2.show(5,0)

+--------------+---------------------+
|customer_state|count(customer_state)|
+--------------+---------------------+
|AZ            |213                  |
|SC            |41                   |
|LA            |63                   |
|MN            |39                   |
|NJ            |219                  |
+--------------+---------------------+
only showing top 5 rows



In [102]:
df_groupby_1 = df_select.groupby("customer_state").count().count()

                                                                                

In [103]:
type(df_groupby_1)

int

In [104]:
df_groupby_1

44

In [111]:
df_groupby.show(3,0)

+--------------+-----+
|customer_state|count|
+--------------+-----+
|AZ            |213  |
|SC            |41   |
|LA            |63   |
+--------------+-----+
only showing top 3 rows



In [112]:
df_groupby.filter("customer_state == 'AZ'").show(3,0)

+--------------+-----+
|customer_state|count|
+--------------+-----+
|AZ            |213  |
+--------------+-----+



#### 11) Limit

In [143]:
df_select.count()

12435

In [144]:
df_limit_10 = df_select.limit(10)

In [145]:
df_limit_10.count()

10

#### 12) Joins *

In [151]:
df_cust.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)



In [146]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/orc

In [147]:
!hdfs dfs -put /home/bigdatapedia/data/new_orders.snappy.orc /user/bigdatapedia/input/orc/

In [148]:
!hdfs dfs -ls /user/bigdatapedia/input/orc

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup     185892 2023-07-22 03:46 /user/bigdatapedia/input/orc/new_orders.snappy.orc


In [160]:
df_cust_sel.count()

12435

In [161]:
df_order.count()

68883

In [149]:
df_order = spark.read.orc("/user/bigdatapedia/input/orc")

In [150]:
df_order.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [152]:
df_cust_sel = df_cust.select("customer_id", "customer_fname", "customer_city", "customer_state")

In [153]:
df_inner = df_cust_sel.join(df_order, df_cust_sel.customer_id == df_order.order_customer_id, "inner")

In [154]:
df_inner.show(5, 0)

[Stage 192:>                                                        (0 + 1) / 1]

+-----------+--------------+-------------+--------------+--------+-------------------+-----------------+------------+
|customer_id|customer_fname|customer_city|customer_state|order_id|order_date         |order_customer_id|order_status|
+-----------+--------------+-------------+--------------+--------+-------------------+-----------------+------------+
|148        |Stephanie     |Caguas       |PR            |15061   |2013-10-28 00:00:00|148              |CLOSED      |
|148        |Stephanie     |Caguas       |PR            |59569   |2013-10-03 00:00:00|148              |COMPLETE    |
|148        |Stephanie     |Caguas       |PR            |61124   |2013-12-02 00:00:00|148              |CLOSED      |
|463        |Harry         |Caguas       |PR            |6857    |2013-09-06 00:00:00|463              |COMPLETE    |
|463        |Harry         |Caguas       |PR            |14181   |2013-10-22 00:00:00|463              |COMPLETE    |
+-----------+--------------+-------------+--------------

                                                                                

In [158]:
df_inner.count()

                                                                                

68883

#### 13) Left Join *

In [155]:
df_left = df_cust_sel.join(df_order, df_cust_sel.customer_id == df_order.order_customer_id, "left")

In [157]:
df_left.show(5, 0)

+-----------+--------------+-------------+--------------+--------+-------------------+-----------------+------------+
|customer_id|customer_fname|customer_city|customer_state|order_id|order_date         |order_customer_id|order_status|
+-----------+--------------+-------------+--------------+--------+-------------------+-----------------+------------+
|148        |Stephanie     |Caguas       |PR            |15061   |2013-10-28 00:00:00|148              |CLOSED      |
|148        |Stephanie     |Caguas       |PR            |59569   |2013-10-03 00:00:00|148              |COMPLETE    |
|148        |Stephanie     |Caguas       |PR            |61124   |2013-12-02 00:00:00|148              |CLOSED      |
|463        |Harry         |Caguas       |PR            |6857    |2013-09-06 00:00:00|463              |COMPLETE    |
|463        |Harry         |Caguas       |PR            |14181   |2013-10-22 00:00:00|463              |COMPLETE    |
+-----------+--------------+-------------+--------------

In [159]:
df_left.count()

                                                                                

68913

#### 14) Right Join *

In [165]:
df_right = df_cust_sel.join(df_order, df_cust_sel.customer_id == df_order.order_customer_id, "right")

In [166]:
df_right.count()

                                                                                

68883

#### 15) Full *

In [167]:
df_full = df_cust_sel.join(df_order, df_cust_sel.customer_id == df_order.order_customer_id, "full")

In [168]:
df_full.count()

                                                                                

68913

#### 16) Cartesian *

In [173]:
df_cartesian = df_cust_sel.crossJoin(df_order)

In [174]:
df_cartesian.count()

                                                                                

856560105

In [175]:
12435 * 68883

856560105

#### 17) Union

In [176]:
df_cust_sel.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [177]:
df_filter_onlyTx = df_cust.filter("customer_state == 'TX'")

In [179]:
df_filter_onlyTx.count()

635

In [178]:
df_filter_onlyAR = df_cust.filter("customer_state == 'AR'")

In [180]:
df_filter_onlyAR.count()

12

In [181]:
df_union = df_filter_onlyTx.union(df_filter_onlyAR)

In [182]:
df_union.count()

647

#### 18) Intersection *

In [184]:
df_intersect = df_filter_onlyTx.intersect(df_filter_onlyAR)

In [185]:
df_intersect.count()

                                                                                

0

### Functions

In [114]:
from pyspark.sql.functions import lit, current_timestamp, when, window

#### 1) Case when

In [115]:
df_select.show(5,0)

+-----------+--------------+-------------+--------------+-----------------------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |
+-----------+--------------+-------------+--------------+-----------------------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 03:27:55.995|
|2          |Mary          |Littleton    |CO            |2023-07-22 03:27:55.995|
|3          |Ann           |Caguas       |PR            |2023-07-22 03:27:55.995|
|4          |Mary          |San Marcos   |CA            |2023-07-22 03:27:55.995|
|5          |Robert        |Caguas       |PR            |2023-07-22 03:27:55.995|
+-----------+--------------+-------------+--------------+-----------------------+
only showing top 5 rows



In [120]:
df_casewhen = df_select.select("*", when(df_select.customer_state == 'TX', 
                                         "Texas").otherwise("Non Texas").alias("New_State"))

In [121]:
df_casewhen.show(5,0)

+-----------+--------------+-------------+--------------+-----------------------+---------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |New_State|
+-----------+--------------+-------------+--------------+-----------------------+---------+
|1          |Richard       |Brownsville  |TX            |2023-07-22 03:31:07.668|Texas    |
|2          |Mary          |Littleton    |CO            |2023-07-22 03:31:07.668|Non Texas|
|3          |Ann           |Caguas       |PR            |2023-07-22 03:31:07.668|Non Texas|
|4          |Mary          |San Marcos   |CA            |2023-07-22 03:31:07.668|Non Texas|
|5          |Robert        |Caguas       |PR            |2023-07-22 03:31:07.668|Non Texas|
+-----------+--------------+-------------+--------------+-----------------------+---------+
only showing top 5 rows



#### 2) Window

In [128]:
from pyspark.sql.functions import row_number, rank, dense_rank

from pyspark.sql.window import Window

In [140]:
WindowSpec = Window.partitionBy("customer_city").orderBy("customer_city")

In [141]:
df_rank = df_select.withColumn("Ranking", rank().over(WindowSpec))

In [142]:
df_rank.show(5, 0)

+-----------+--------------+-------------+--------------+-----------------------+-------+
|customer_id|customer_fname|customer_city|customer_state|cre_ts                 |Ranking|
+-----------+--------------+-------------+--------------+-----------------------+-------+
|147        |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|2544       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|2705       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|4650       |Mary          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
|6108       |Emma          |Hanover      |PA            |2023-07-22 03:40:53.351|1      |
+-----------+--------------+-------------+--------------+-----------------------+-------+
only showing top 5 rows



### Tomorrow
split, concat, month, year, quarter