# Spark Session

In [2]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Obtain a Hive-enabled SparkSession (getOrCreate reuses a session if one already exists)
spark = (
    SparkSession.builder
    .appName("Spark with Hive")
    .enableHiveSupport()
    .getOrCreate()
)

# Sample rows as plain literals: (product, id, date string, timestamp string, price)
raw_rows = [
    ("Product A", 1001, "2023-07-20", "2023-07-20 10:15:30", 29.99),
    ("Product B", 1002, "2023-07-19", "2023-07-19 14:20:45", 49.99),
    ("Product C", 1003, "2023-07-18", "2023-07-18 09:30:15", 39.99),
    ("Product D", 1004, "2023-07-17", "2023-07-17 16:45:00", 19.99),
]

# Parse the date and timestamp strings into datetime objects, one list per row
data = [
    [
        product,
        product_id,
        datetime.strptime(day, "%Y-%m-%d"),
        datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S"),
        price,
    ]
    for product, product_id, day, stamp, price in raw_rows
]

# Column names, data types and nullability for the sample rows
schema = (
    StructType()
    .add("Product", StringType(), True)
    .add("ID", IntegerType(), True)
    .add("Date", DateType(), True)
    .add("Timestamp", TimestampType(), True)
    .add("Price", FloatType(), True)
)

# Build the DataFrame from the in-memory rows using the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Show the resulting structure first, then the rows themselves
df.printSchema()
df.show()

24/07/20 22:05:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


root
 |-- Product: string (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Price: float (nullable = true)



                                                                                

+---------+----+----------+-------------------+-----+
|  Product|  ID|      Date|          Timestamp|Price|
+---------+----+----------+-------------------+-----+
|Product A|1001|2023-07-20|2023-07-20 10:15:30|29.99|
|Product B|1002|2023-07-19|2023-07-19 14:20:45|49.99|
|Product C|1003|2023-07-18|2023-07-18 09:30:15|39.99|
|Product D|1004|2023-07-17|2023-07-17 16:45:00|19.99|
+---------+----+----------+-------------------+-----+



## Read Data From HDFS

In [3]:
# Read 1: no schema inference. The file is read with header=true and
# column names and data types are supplied explicitly through `schema`.

# Explicit column names and data types for the order-items CSV
schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("order_item_id", IntegerType(), True)
    .add("product_id", StringType(), True)
    .add("seller_id", StringType(), True)
    .add("shipping_limit_date", TimestampType(), True)
    .add("price", DoubleType(), True)
    .add("freight_value", DoubleType(), True)
)

hdfs_path = '/spark_input_data/order_items_dataset.csv'

# Configure the CSV reader step by step, then load the file
csv_reader = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'false')
    .schema(schema)
)
df = csv_reader.load(hdfs_path)

df.printSchema()
df.show(5)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+------------

                                                                                

## Schema Inference

In [4]:
# Read 2: same file, read with header=true, but this time the column
# types are inferred (inferSchema=true) instead of being declared up front.

hdfs_path = '/spark_input_data/order_items_dataset.csv'

df2 = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(hdfs_path)
)

df2.printSchema()
df2.show(5)

                                                                                

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48

## Repartition

In [6]:
# Partition count of the DataFrame as it was read from HDFS
print(f"Number of partitions: {df2.rdd.getNumPartitions()}")

# Redistribute the rows into 10 partitions; df2 itself is left unchanged
df3 = df2.repartition(numPartitions=10)

# Partition count of the repartitioned DataFrame
print(f"Number of partitions: {df3.rdd.getNumPartitions()}")


Number of partitions: 2




Number of partitions: 10


## DataFrame Operations

In [8]:
# select columns in different options
from pyspark.sql.functions import *

# A single column, referenced by name
df3.select("order_id").show(5)

# Several columns, passed as a list of names
df3.select(["order_id", "shipping_limit_date"]).show(5)

# Columns renamed in the result via alias()
aliased_columns = [
    col("order_id").alias("oid"),
    col("shipping_limit_date").alias("limit_date"),
]
df3.select(*aliased_columns).show(5)

                                                                                

+--------------------+
|            order_id|
+--------------------+
|a2a04ead650a00874...|
|f6974499820661b92...|
|ae544ab4fdaa9ad26...|
|b5c568b689a9f0c35...|
|ced16c8394f44cb7e...|
+--------------------+
only showing top 5 rows



                                                                                

+--------------------+-------------------+
|            order_id|shipping_limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1...|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca...|2018-06-14 09:52:04|
|51c3d73e0e9052253...|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99...|2018-02-07 20:14:08|
|3a1400b5d4dd3082a...|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows





+--------------------+-------------------+
|                 oid|         limit_date|
+--------------------+-------------------+
|f0b47dadd5f372c41...|2018-05-14 04:54:53|
|bbd319ae8e4b46101...|2017-11-24 02:28:04|
|e16fb24453a306d5d...|2018-08-10 03:24:54|
|d4de6d0debe2df72c...|2017-03-13 03:35:12|
|bdbe8da70dcc6e6a2...|2018-04-22 21:52:25|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

## Derive New Columns using withColumn 

In [9]:
# Derive the calendar year and month from the shipping deadline timestamp
shipping_ts = col("shipping_limit_date")
df4 = (
    df3
    .withColumn("year", year(shipping_ts))
    .withColumn("month", month(shipping_ts))
)

# Preview the source column next to the two derived columns
df4.select(["order_id", "shipping_limit_date", "year", "month"]).show(5)



+--------------------+-------------------+----+-----+
|            order_id|shipping_limit_date|year|month|
+--------------------+-------------------+----+-----+
|f0b47dadd5f372c41...|2018-05-14 04:54:53|2018|    5|
|bbd319ae8e4b46101...|2017-11-24 02:28:04|2017|   11|
|e16fb24453a306d5d...|2018-08-10 03:24:54|2018|    8|
|d4de6d0debe2df72c...|2017-03-13 03:35:12|2017|    3|
|bdbe8da70dcc6e6a2...|2018-04-22 21:52:25|2018|    4|
+--------------------+-------------------+----+-----+
only showing top 5 rows



                                                                                

## Renaming the Column

In [10]:
# withColumnRenamed(existing, new) yields a new DataFrame with the column renamed
df5 = df4.withColumnRenamed(
    existing="shipping_limit_date",
    new="shipping_limit_datetime",
)
df5.select(["order_id", "shipping_limit_datetime"]).show(5)



+--------------------+-----------------------+
|            order_id|shipping_limit_datetime|
+--------------------+-----------------------+
|f0b47dadd5f372c41...|    2018-05-14 04:54:53|
|bbd319ae8e4b46101...|    2017-11-24 02:28:04|
|e16fb24453a306d5d...|    2018-08-10 03:24:54|
|d4de6d0debe2df72c...|    2017-03-13 03:35:12|
|bdbe8da70dcc6e6a2...|    2018-04-22 21:52:25|
+--------------------+-----------------------+
only showing top 5 rows



                                                                                

### Filter Conditions

Filter conditions can be written in two ways:
1. Object-based (Column expressions)
2. SQL-style expression strings

In [13]:
# --- Object-based (Column expression) filters ---

# Exact match on one order id
single_order = col("order_id") == "00010242fe8c5a6d1ba2dd792cb16214"
df5.filter(single_order).show()

# Membership test against a list of order ids
order_id_list = ["00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f0320c557190d7a144bdd3"]
df5.filter(col("order_id").isin(order_id_list)).show()

# Two conditions combined with & (each side must be wrapped in parentheses or named)
low_price = col("price") < 50
low_freight = col("freight_value") < 10
df5.filter(low_price & low_freight).show(5)

# --- The same combined condition written as a SQL expression string ---
df5.filter("price < 50 and freight_value < 10").show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|    2017-09-19 09:45:35| 58.9|        13.29|2017|    9|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+----

#### Drop Columns, Drop duplicates 

In [None]:
# Preview df5 without the 'month' column; drop() returns a new DataFrame
df5.drop("month").show(n=5)

In [14]:
# Remove duplicate rows, comparing only the (order_id, order_item_id) pair
df5.dropDuplicates(subset=["order_id", "order_item_id"]).show(5)


[Stage 51:>                                                         (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|    2017-05-03 11:05:13| 239.9|        19.93|2017|    5|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|    2018-01-18 14:48:30| 199.0|        17.87|2018|    1|
|00048cc3ae777c65d...|            1|ef92defde845ab845...|6426d21aca402a131...|    2017-05-23 03:55:27|  21.9|        12.69|2017|    5|
|0005a1a1728c9d785...|            1|310ae3c140ff94b03...|a416b6a846a117243...|    2018-03-26 18:31:29|145.95|        11.65|2018|    3|
|0005f50442cb953dc...|            1|4535b0e1091c278df..

                                                                                

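Does dropping duplicates cause a shuffle?
    Yes. Duplicate rows can sit in different partitions, so Spark has to shuffle the data to bring matching rows together before it can remove them.
    The same applies to the distinct operation: it also triggers a shuffle.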

In [15]:

# distinct() keeps one copy of each fully identical row
df5.distinct().show(n=5)

# dropDuplicates() with no column list also compares every column,
# so only rows duplicated across all columns are removed
df5.dropDuplicates().show(n=5)


                                                                                

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|3fb11bd2ea68c2502...|            1|f908d3bf313a1308b...|25e6ffe976bd75618...|    2017-12-07 13:11:22| 35.0|        11.85|2017|   12|
|4c792ea6a2a9a0d04...|            1|06f2166336faca73c...|d5ba419e26d246a07...|    2018-08-06 16:35:11|105.0|         9.49|2018|    8|
|329865201ab68a53f...|            1|4a25d757ff72fad75...|92eb0f42c21942b65...|    2018-01-24 02:13:11|12.98|        34.15|2018|    1|
|9da95e5b2fd142848...|            1|689c51a11e9c5daef...|b561927807645834b...|    2018-04-04 20:35:26| 24.0|        18.23|2018|    4|
|0e41154364b2f30bb...|            1|6c3effec7c8ddba46...|37515



+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|f7ba0ab952e0165fe...|            1|96c1f14a0754e0f2c...|ececbfcff9804a2d6...|    2018-05-07 11:55:08|  78.5|         19.4|2018|    5|
|ff0c42884b1cfcd53...|            1|8729e9bc4b7f04224...|2e1a7d075abe038c1...|    2017-12-07 02:38:32|  67.9|        17.73|2017|   12|
|2d99133897553f4c8...|            1|d1c427060a0f73f6b...|a1043bafd471dff53...|    2017-08-10 19:35:19|139.99|        23.78|2017|    8|
|bb5cce57f8d80c481...|            1|6ec26b3516fecd18c...|c70c1b0d8ca86052f...|    2018-06-29 11:57:21|110.32|         8.03|2018|    6|
|c5119bb429cf05b92...|            1|192b332c511e484ea..

24/07/20 23:04:04 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /10.190.0.2:41132 is closed
24/07/20 23:04:04 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 60 from block manager BlockManagerId(6, cluster-spark-w-0.asia-south2-c.c.hadoop-project-428213.internal, 39393, None)
java.io.IOException: Connection from /10.190.0.2:41132 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:305) ~[netty-transport-4.1.100.Final.jar:4.1.100.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:281) ~[netty-trans

### Sorting

In [16]:
# Highest price first
df5.orderBy("price", ascending=False).show(5)

# Lowest price first; rows with equal price are ordered by freight_value, largest first
df5.orderBy(["price", "freight_value"], ascending=[True, False]).show(5)

                                                                                

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|0812eb902a67711a1...|            1|489ae2aa008f02150...|e3b4998c7a498169d...|    2017-02-16 20:37:36|6735.0|       194.31|2017|    2|
|fefacc66af859508b...|            1|69c590f7ffc7bf8db...|80ceebb4ee9b31afb...|    2018-08-02 04:05:13|6729.0|       193.21|2018|    8|
|f5136e38d1a14a4db...|            1|1bdf5e6731585cf01...|ee27a8f15b1dded4d...|    2017-06-15 02:45:17|6499.0|       227.66|2017|    6|
|a96610ab360d42a2e...|            1|a6492cc69376c469a...|59417c56835dd8e2e...|    2017-04-18 13:25:18|4799.0|       151.34|2017|    4|
|199af31afc78c699f...|            1|c3ed642d592594bb6..



+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|c5bdd8ef3c0ec4202...|            2|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-07 02:55:22| 0.85|         22.3|2018|    5|
|3ee6513ae7ea23bdf...|            1|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-04 03:55:26| 0.85|        18.23|2018|    5|
|6e864b3f0ec710311...|            1|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-02 20:30:34| 0.85|        18.23|2018|    5|
|8272b63d03f5f79c5...|            1|270516a3f41dc035a...|2709af9587499e95e...|    2017-07-21 18:25:23|  1.2|         7.89|2017|    7|
|8272b63d03f5f79c5...|            4|05b515fdc76e888aa...|2709a



#### Group By Operations

In [18]:
# Price statistics shared by both groupings below.
# The functions are referenced through the explicit `F` namespace so this cell does not
# depend on the wildcard import having replaced Python's builtin min / max / sum.
price_aggs = [
    F.count("*").alias("total_count"),
    F.min("price").alias("min_price"),
    F.avg("price").alias("avg_price"),
    F.max("price").alias("max_price"),
    F.sum("price").alias("total_price"),
]

# Group by a single column
df5.groupBy("year").agg(*price_aggs).show(5)

# Group by multiple columns, sorted chronologically (ascending is the default order)
(
    df5.groupBy("year", "month")
    .agg(*price_aggs)
    .orderBy("year", "month")
    .show(5)
)

                                                                                

+----+-----------+---------+------------------+---------+-----------------+
|year|total_count|min_price|         avg_price|max_price|      total_price|
+----+-----------+---------+------------------+---------+-----------------+
|2018|      62511|     0.85|120.08515685239729|   6729.0|7506643.240000207|
|2017|      49765|      1.2|121.26732804179925|   6735.0| 6034868.58000014|
|2016|        370|      6.0|134.55654054054054|   1399.0|         49785.92|
|2020|          4|    69.99|             86.49|    99.99|           345.96|
+----+-----------+---------+------------------+---------+-----------------+





+----+-----+-----------+---------+------------------+---------+------------------+
|year|month|total_count|min_price|         avg_price|max_price|       total_price|
+----+-----+-----------+---------+------------------+---------+------------------+
|2016|    9|          4|    44.99| 48.61750000000001|     59.5|194.47000000000003|
|2016|   10|        365|      6.0| 135.8371232876712|   1399.0| 49580.54999999999|
|2016|   12|          1|     10.9|              10.9|     10.9|              10.9|
|2017|    1|        681|      2.9|117.65747430249623|   1999.0| 80124.73999999993|
|2017|    2|       1866|      3.9| 131.8231564844589|   6735.0| 245982.0100000003|
+----+-----+-----------+---------+------------------+---------+------------------+
only showing top 5 rows



                                                                                

### Accumulators
An accumulator is like a global, add-only variable.
It aggregates data across all partitions: tasks add to it, and the driver reads the final value.

In [19]:
# Start from 0.0 (not 0) so the accumulator's type matches the float prices added to it
accum = spark.sparkContext.accumulator(0.0)

# Add each record's price to the accumulator. `price` is a nullable column, so a null
# is counted as 0.0 instead of raising a TypeError inside the task.
df5.foreach(lambda row: accum.add(row["price"] or 0.0))

# The accumulated total is read back on the driver
print(accum.value)



13591643.699999437


                                                                                

#### Case When Statement
A new column is usually derived by writing a CASE WHEN statement.
In Spark, we add a new column with `withColumn`, which takes the column name and the logic used to derive the column.


In [20]:
# CASE WHEN equivalent: branches are tested in order, and rows matching neither
# branch receive the otherwise() value
price_col = col("price")
price_category = (
    when(price_col >= 100, "High")
    .when((price_col < 100) & (price_col >= 50), "Medium")
    .otherwise("low")
)

df5.withColumn("price_category", price_category).show(5)

[Stage 93:>                                                         (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|price_category|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|1e1bb536916a99649...|            2|0288f8dd74b931b4e...|1da3aeb70d7989d1e...|    2017-09-05 12:10:11| 49.99|        21.15|2017|    9|           low|
|62a0e822dd605871a...|            1|31dbb0d1815bdc83c...|6da1992f915d77be9...|    2017-06-08 11:50:18|  29.0|        15.79|2017|    6|           low|
|025c72e88fbf2358b...|            2|bef21943bc2335188...|e49c26c3edfa46d22...|    2017-03-21 21:24:27|  19.9|         20.8|2017|    3|           low|
|23d16dddab46fd3d0...|            1|cca8e09ba6f2d35e4...|43f8c9950d11ecd03...|    2018-01-31 22:17:5

                                                                                