# COUNT(1) vs COUNT(*) vs COUNT(COL_NAME)

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Get (or create) a session named after this experiment, attached to the
# standalone cluster master.
spark = (
    SparkSession.builder
    .appName("Count(1) vs Count(*)")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/26 11:14:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Let's read the sales dataset (ORC files on HDFS) into a DataFrame and
# preview it to check the data.
df = spark.read.load(
    "hdfs://namenode:9000/input/data/sales_orc",
    format="orc",
)

df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-12-20 20:00:00| 561803551| 2077350195|Walgreen  ccd id:...|  69.66| 350411713|
|2017-12-10 22:00:00| 966498100| 1232435973|Toys R Us   ccd i...|  16.44|1554564545|
|2017-12-19 17:00:00|  40380012| 1898522855|Target   arc id: ...|2854.84| 920189167|
|2017-12-19 19:00:00|1735489522|  847200066|unkn     ccd id: ...| 280.31|2011632272|
|2017-01-11 14:00:00|1513345631| 1953761884|Home Depot   arc ...|  20.51|1528300441|
|2017-12-24 23:00:00| 884145953| 2001148981|Costco  ccd id: 4...|  11.75|2116046074|
|2017-05-14 19:00:00|1003554030| 1903529855|                unkn|  61.76|1710668653|
|2017-12-24 19:00:00| 921164309|  847200066|unkn   ppd id: 95...|  27.83|2074005445|
|2017-12-16 17:00:00| 549217139| 1070485878|          Amazon.com|

                                                                                

In [10]:
# Get count(1) performance
from pyspark.sql import functions as F

# Label the jobs triggered by this cell so they are easy to find in the Spark UI.
spark.sparkContext.setJobDescription("save count(1)")
try:
    # Build the aggregation once, so the plan printed by explain() belongs to
    # the very same query that is executed below.
    count_one_df = df.groupBy("trx_id").agg(F.count(F.lit(1)))

    # Print the parsed, analyzed, optimized and physical plans.
    count_one_df.explain(True)

    # Writing to the "noop" sink runs the full query without persisting output.
    count_one_df.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the cell fails, so later jobs are not mislabelled.
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#205L]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Analyzed Logical Plan ==
trx_id: int, count(1): bigint
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#205L]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Optimized Logical Plan ==
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#205L]
+- Project [trx_id#1]
   +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#1], functions=[count(1)], output=[trx_id#1, count(1)#205L])
   +- Exchange hashpartitioning(trx_id#1, 200), ENSURE_REQUIREMENTS, [plan_id=381]
      +- HashAggregate(keys=[trx_id#1], functions=[partial_count(1)], output=[trx_id#1, count#209L])
         +- FileScan orc [trx_id#1] Batched: true, DataFilters: [], Format: ORC,

In [12]:
# Get count(col_name) performance

# Label the jobs triggered by this cell so they are easy to find in the Spark UI.
spark.sparkContext.setJobDescription("save count(city_id)")
try:
    # Build the aggregation once, so the plan printed by explain() belongs to
    # the very same query that is executed below.
    count_city_df = df.groupBy("trx_id").agg(F.count("city_id"))

    # Print the parsed, analyzed, optimized and physical plans.
    count_city_df.explain(True)

    # Writing to the "noop" sink runs the full query without persisting output.
    count_city_df.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the cell fails, so later jobs are not mislabelled.
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count('city_id) AS count(city_id)#259]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Analyzed Logical Plan ==
trx_id: int, count(city_id): bigint
Aggregate [trx_id#1], [trx_id#1, count(city_id#5) AS count(city_id)#259L]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Optimized Logical Plan ==
Aggregate [trx_id#1], [trx_id#1, count(city_id#5) AS count(city_id)#259L]
+- Project [trx_id#1, city_id#5]
   +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#1], functions=[count(city_id#5)], output=[trx_id#1, count(city_id)#259L])
   +- Exchange hashpartitioning(trx_id#1, 200), ENSURE_REQUIREMENTS, [plan_id=499]
      +- HashAggregate(keys=[trx_id#1], functions=[partial_count(city_id#5)], output=[trx_id#1, count#263L])
 

In [13]:
# Get count(*) performance

# Label the jobs triggered by this cell so they are easy to find in the Spark UI.
spark.sparkContext.setJobDescription("save count(*)")
try:
    # Build the aggregation once, so the plan printed by explain() belongs to
    # the very same query that is executed below.
    count_star_df = df.groupBy("trx_id").agg(F.count("*"))

    # Print the parsed, analyzed, optimized and physical plans.
    count_star_df.explain(True)

    # Writing to the "noop" sink runs the full query without persisting output.
    count_star_df.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the cell fails, so later jobs are not mislabelled.
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#287L]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Analyzed Logical Plan ==
trx_id: int, count(1): bigint
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#287L]
+- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Optimized Logical Plan ==
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#287L]
+- Project [trx_id#1]
   +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#1], functions=[count(1)], output=[trx_id#1, count(1)#287L])
   +- Exchange hashpartitioning(trx_id#1, 200), ENSURE_REQUIREMENTS, [plan_id=558]
      +- HashAggregate(keys=[trx_id#1], functions=[partial_count(1)], output=[trx_id#1, count#291L])
         +- FileScan orc [trx_id#1] Batched: true, DataFilters: [], Format: ORC,

In [17]:
# Get filter + count(*) performance

# Label the jobs triggered by this cell so they are easy to find in the Spark UI.
spark.sparkContext.setJobDescription("save filter-count(*)")
try:
    # Build the query once, so the plan printed by explain() belongs to the
    # very same query that is executed below.
    filtered_count_df = (
        df.filter(F.month(F.col("transacted_at")) == 11)  # keep month 11 only
        .groupBy("trx_id")
        .agg(F.count("*"))
    )

    # Print the parsed, analyzed, optimized and physical plans.
    filtered_count_df.explain(True)

    # Writing to the "noop" sink runs the full query without persisting output.
    filtered_count_df.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the cell fails, so later jobs are not mislabelled.
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#313L]
+- Filter (month(cast(transacted_at#0 as date)) = 11)
   +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Analyzed Logical Plan ==
trx_id: int, count(1): bigint
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#313L]
+- Filter (month(cast(transacted_at#0 as date)) = 11)
   +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Optimized Logical Plan ==
Aggregate [trx_id#1], [trx_id#1, count(1) AS count(1)#313L]
+- Project [trx_id#1]
   +- Filter (isnotnull(transacted_at#0) AND (month(cast(transacted_at#0 as date)) = 11))
      +- Relation [transacted_at#0,trx_id#1,retailer_id#2,description#3,amount#4,city_id#5] orc

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[trx_id#1], functions=[count(1)], output=[trx_id#1, count(1)#313L])
   +- Exchange hashpartitioning(trx_id#1, 200), ENSURE_RE

                                                                                

In [18]:
# Stop the Spark session now that all the count comparisons have been run.
spark.stop()