In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [None]:
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType

In [None]:

order_schema=StructType([
    StructField("order_id",StringType(),True),
    StructField("customer_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("category",StringType(),True),
    StructField("product",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("order_date",StringType(),True),
    StructField("status",StringType(),True)
])


In [None]:
order_df=spark.createDataFrame(data=orders_data,schema=order_schema)
order_df.show()
order_df.printSchema()

+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|  ORD001|       C001|   Delhi |Electronics|     Laptop|  45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|    Mobile |  32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet|  30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|       |2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|invalid|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|   NULL|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop|  47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum|  28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Lapto

In [None]:
from pyspark.sql import functions as f
col_list=["order_id","customer_id","city","category","product","amount","order_date","status"]
for i in col_list:
  order_df=order_df.withColumn(i,f.trim(f.col(i)))

order_df.show()

+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop|  45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile|  32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet|  30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|       |2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|invalid|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|   NULL|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop|  47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum|  28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Lapto

In [None]:
check_exp=f.regexp_extract(f.col("amount"),r"(\d+)",0)
check=order_df.withColumn("amount",f.when((check_exp == "")|check_exp.isNull(),f.lit(0) ).otherwise(check_exp.cast(IntegerType())))
check.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|     0|2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|     0|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|20

In [None]:
check.withColumn("order_date",f.coalesce(
    f.col("order_date"),
    f.to_date(f.col("order_date"),"dd-MM-yyyy"),
    f.to_date(f.col("order_date"),"yyyy-MM-dd"),
    f.to_date(f.col("order_date"),"yyyy/MM/dd")
))
check.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|     0|2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|     0|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|20

In [None]:
check.drop_duplicates(["order_id"]).show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|     0|2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|     0|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|20

In [None]:
check.filter(f.col("status")=="Completed").show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024/01/06|Completed|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|     0|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|AirPurifier| 38000|20

In [None]:
check.count()

21

In [None]:
check.filter(f.col("amount").isNotNull()& (f.col("order_id").isNotNull()) & (f.col("order_date").isNotNull()))
check.count()

21

In [None]:
check.printSchema()
check.show()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|     0|2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Complete

In [None]:
check.groupBy("city").agg(f.sum("amount")).alias("total_amount").show()
check.groupBy("category").agg(f.sum("amount")).alias("total_amount").show()
check.groupBy("product").agg(f.sum("amount")).alias("total_amount").show()
check.groupBy("city").agg(f.avg("amount")).alias("total_amount").show()



+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|     271000|
|  Chennai|      62000|
|   Mumbai|     185000|
|    Delhi|     187000|
+---------+-----------+

+-----------+-----------+
|   category|sum(amount)|
+-----------+-----------+
|       Home|     187000|
|Electronics|     518000|
+-----------+-----------+

+-----------+-----------+
|    product|sum(amount)|
+-----------+-----------+
|     Vacuum|      88000|
|AirPurifier|      78000|
|     Laptop|     368000|
|      Mixer|      21000|
|     Mobile|      65000|
|     Tablet|      85000|
+-----------+-----------+

+---------+------------------+
|     city|       avg(amount)|
+---------+------------------+
|Bangalore| 38714.28571428572|
|  Chennai|           31000.0|
|   Mumbai|30833.333333333332|
|    Delhi|31166.666666666668|
+---------+------------------+



In [None]:
s=check.groupBy("product").agg(f.sum("amount").alias("total_amount"))
s.orderBy(f.col("total_amount").desc()).show(3)

+-------+------------+
|product|total_amount|
+-------+------------+
| Laptop|      368000|
| Vacuum|       88000|
| Tablet|       85000|
+-------+------------+
only showing top 3 rows


In [None]:
from pyspark.sql.window import Window
s=check.groupBy("city").agg(f.sum("amount").alias("total_amount"))
w=Window.orderBy(f.col("total_amount").desc())
s.withColumn("rank",f.rank().over(w)).show()


+---------+------------+----+
|     city|total_amount|rank|
+---------+------------+----+
|Bangalore|      271000|   1|
|    Delhi|      187000|   2|
|   Mumbai|      185000|   3|
|  Chennai|       62000|   4|
+---------+------------+----+



In [None]:

s = check.groupBy("product", "category") .agg(f.sum("amount").alias("total_amount"))
w = Window.partitionBy("category").orderBy(f.col("total_amount").desc())
s.withColumn("rank", f.rank().over(w)) .show()



+-----------+-----------+------------+----+
|    product|   category|total_amount|rank|
+-----------+-----------+------------+----+
|     Laptop|Electronics|      368000|   1|
|     Tablet|Electronics|       85000|   2|
|     Mobile|Electronics|       65000|   3|
|     Vacuum|       Home|       88000|   1|
|AirPurifier|       Home|       78000|   2|
|      Mixer|       Home|       21000|   3|
+-----------+-----------+------------+----+



In [None]:
s = check.groupBy("product", "category") .agg(f.sum("amount").alias("total_amount"))
w = Window.partitionBy("category").orderBy(f.col("total_amount").desc())
s.withColumn("rank", f.rank().over(w)).filter(f.col("rank")==1).show()

+-------+-----------+------------+----+
|product|   category|total_amount|rank|
+-------+-----------+------------+----+
| Laptop|Electronics|      368000|   1|
| Vacuum|       Home|       88000|   1|
+-------+-----------+------------+----+



In [None]:

import pyspark.sql.functions as F
import time

t0 = time.time()
check.groupBy("product").sum("amount").show()
check.groupBy("category").sum("amount").show()
check.groupBy("city").sum("amount").show()
t1 = time.time()
print(f"Before caching total time: {t1 - t0:.3f}s")

check2 = check.cache()
check2.count()

t2 = time.time()
check2.groupBy("product").sum("amount").show()
check2.groupBy("category").sum("amount").show()
check2.groupBy("city").sum("amount").show()
t3 = time.time()
print(f"After caching total time: {t3 - t2:.3f}s")



+-----------+-----------+
|    product|sum(amount)|
+-----------+-----------+
|     Vacuum|      88000|
|AirPurifier|      78000|
|     Laptop|     368000|
|      Mixer|      21000|
|     Mobile|      65000|
|     Tablet|      85000|
+-----------+-----------+

+-----------+-----------+
|   category|sum(amount)|
+-----------+-----------+
|       Home|     187000|
|Electronics|     518000|
+-----------+-----------+

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|     271000|
|  Chennai|      62000|
|   Mumbai|     185000|
|    Delhi|     187000|
+---------+-----------+

Before caching total time: 0.529s
+-----------+-----------+
|    product|sum(amount)|
+-----------+-----------+
|     Vacuum|      88000|
|AirPurifier|      78000|
|     Laptop|     368000|
|      Mixer|      21000|
|     Mobile|      65000|
|     Tablet|      85000|
+-----------+-----------+

+-----------+-----------+
|   category|sum(amount)|
+-----------+-----------+
|       Home|   

In [None]:
check2.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(amount, CASE WHEN 'or('`=`('regexp_extract('amount, (\d+), 0), ), 'isNull('regexp_extract('amount, (\d+), 0))) THEN 0 ELSE cast('regexp_extract('amount, (\d+), 0) as int) END, None)]
+- Project [order_id#66, customer_id#67, city#68, category#69, product#70, regexp_extract(amount#71, (\d+), 0) AS amount#99, order_date#72, status#73]
   +- Project [order_id#66, customer_id#67, city#68, category#69, product#70, amount#71, order_date#72, trim(status#40, None) AS status#73]
      +- Project [order_id#66, customer_id#67, city#68, category#69, product#70, amount#71, trim(order_date#39, None) AS order_date#72, status#40]
         +- Project [order_id#66, customer_id#67, city#68, category#69, product#70, trim(amount#38, None) AS amount#71, order_date#39, status#40]
            +- Project [order_id#66, customer_id#67, city#68, category#69, trim(product#37, None) AS product#70, amount#38, order_date#39, status#40]
               +- Pro

In [None]:
repart_df=check.repartition("city")


In [None]:
check.write.mode("overwrite").parquet("check_parquet")

In [None]:
check.write.mode("overwrite").orc("check_orc")

In [None]:
df = spark.read.parquet("check_parquet")
df2=spark.read.orc("check_orc")
df.printSchema()
df2.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



show() is an action that prints rows and returns None, not a DataFrame.
So youâ€™re assigning None to df, which breaks subsequent operations


In [None]:

df = df.filter(df["amount"] > 30000)
df.show()


+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|05/01/2024|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|09-01-2024|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|AirPurifier| 38000|2024-01-10|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|2024-01-11|Completed|
|  ORD015|       C005|  Chennai|Electronics|     Laptop| 62000|2024-01-13|Completed|
|  ORD016|       C006|   Mumbai|       Home|AirPurifier| 40000|2024-01-13|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|20