# COUNT(1) vs COUNT(*) vs COUNT(COL_NAME)

In [None]:
# Build the Spark session that every later cell in this notebook relies on.

from pyspark.sql import SparkSession

session_builder = SparkSession.builder
session_builder = session_builder.appName("Count(1) vs Count(*)")
session_builder = session_builder.master("spark://spark-master:7077")

# getOrCreate() is used so an already-running session is picked up if present.
spark = session_builder.getOrCreate()

# Keep the session as the cell's last expression so the notebook displays it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 16:44:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Let's load the sales dataset from HDFS and preview it to check the data.
sales_path = "hdfs://namenode:9000/input/data/sales.parquet"
df = spark.read.load(sales_path, format="parquet")

df.show()



+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-12-21 22:00:00|1982756348| 1953761884|Home Depot       ...|  11.51|1194163531|
|2017-12-06 12:00:00|2091468861| 2001148981|              Costco|  35.26| 957346984|
|2017-12-18 18:00:00|1043640082|  606497335|unkn      Bishkek...| 101.37|2056066328|
|2017-12-03 21:00:00|1980557911| 1076023740|unkn    ppd id: 5...|    5.8|1223420625|
|2017-11-27 12:00:00|1292471097|  643354906|unkn    ccd id: 6...|  90.79|  77397141|
|2017-12-26 19:00:00| 848534109|  400404203|CVS  arc id: 1070264|   4.33|2074005445|
|2017-11-28 14:00:00|1256298623|  847200066|Wal-Mart     arc ...| 476.57| 720701459|
|2017-10-03 21:00:00|1525949111|  103953879|Rite Aid       Ha...|  30.13| 576817662|
|2017-12-29 23:00:00| 436713004| 1273066548|7-Eleven  ccd id:...|

                                                                                

In [4]:
# Switch off Adaptive Query Execution and its partition coalescing.
# NOTE(review): presumably done so the plans and shuffle partition counts of
# the benchmark cells below stay directly comparable -- confirm intent.
for aqe_option in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(aqe_option, False)

In [None]:
from pyspark.sql import functions as F

# Get count(1) performance
# Build the query once so the plan printed by explain() belongs to the very
# DataFrame that is executed by the write below.
count_one_df = df.groupBy("trx_id").agg(F.count(F.lit(1)))

# Label the job (the description is shown in the Spark UI) for this cell only.
spark.sparkContext.setJobDescription("save count(1)")
try:
    count_one_df.explain(True)
    # NOTE(review): the "noop" sink is assumed to execute the whole query while
    # discarding the rows, which makes it suitable for timing -- confirm.
    count_one_df.write.format("noop").mode("overwrite").save()
finally:
    # Always clear the description, even on failure, so that jobs started by
    # later cells are not mislabelled as "save count(1)".
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#71L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Analyzed Logical Plan ==
trx_id: bigint, count(1): bigint
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#71L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Optimized Logical Plan ==
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#71L]
+- Project [trx_id#1L]
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Physical Plan ==
*(2) HashAggregate(keys=[trx_id#1L], functions=[count(1)], output=[trx_id#1L, count(1)#71L])
+- Exchange hashpartitioning(trx_id#1L, 200), ENSURE_REQUIREMENTS, [plan_id=95]
   +- *(1) HashAggregate(keys=[trx_id#1L], functions=[partial_count(1)], output=[trx_id#1L, count#75L])
      +- *(1) ColumnarToRow
         +- FileScan parquet [trx_id#1L] Batched: true, Dat

                                                                                

In [None]:
# Get count(col_name) performance
# Build the query once so the plan printed by explain() belongs to the very
# DataFrame that is executed by the write below.
count_city_df = df.groupBy("trx_id").agg(F.count("city_id"))

# Label the job (the description is shown in the Spark UI) for this cell only.
spark.sparkContext.setJobDescription("save count(city_id)")
try:
    # Unlike count(1), this aggregate references city_id, so that column has
    # to be part of the plan in addition to the grouping key.
    count_city_df.explain(True)
    # NOTE(review): the "noop" sink is assumed to execute the whole query while
    # discarding the rows, which makes it suitable for timing -- confirm.
    count_city_df.write.format("noop").mode("overwrite").save()
finally:
    # Always clear the description, even on failure, so that jobs started by
    # later cells are not mislabelled as "save count(city_id)".
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count('city_id) AS count(city_id)#97]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Analyzed Logical Plan ==
trx_id: bigint, count(city_id): bigint
Aggregate [trx_id#1L], [trx_id#1L, count(city_id#5L) AS count(city_id)#97L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Optimized Logical Plan ==
Aggregate [trx_id#1L], [trx_id#1L, count(city_id#5L) AS count(city_id)#97L]
+- Project [trx_id#1L, city_id#5L]
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Physical Plan ==
*(2) HashAggregate(keys=[trx_id#1L], functions=[count(city_id#5L)], output=[trx_id#1L, count(city_id)#97L])
+- Exchange hashpartitioning(trx_id#1L, 200), ENSURE_REQUIREMENTS, [plan_id=166]
   +- *(1) HashAggregate(keys=[trx_id#1L], functions=[partial_count(city_id#5L)], output=[trx_id#1L, count#101L])
 

                                                                                

In [7]:
# Get count(*) performance
# Build the query once so the plan printed by explain() belongs to the very
# DataFrame that is executed by the write below.
count_star_df = df.groupBy("trx_id").agg(F.count("*"))

# Label the job (the description is shown in the Spark UI) for this cell only.
spark.sparkContext.setJobDescription("save count(*)")
try:
    # Compare this plan with the count(1) cell: the point of the notebook is
    # to see whether the two forms produce different plans.
    count_star_df.explain(True)
    # NOTE(review): the "noop" sink is assumed to execute the whole query while
    # discarding the rows, which makes it suitable for timing -- confirm.
    count_star_df.write.format("noop").mode("overwrite").save()
finally:
    # Always clear the description, even on failure, so that jobs started by
    # later cells are not mislabelled as "save count(*)".
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#125L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Analyzed Logical Plan ==
trx_id: bigint, count(1): bigint
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#125L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Optimized Logical Plan ==
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#125L]
+- Project [trx_id#1L]
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Physical Plan ==
*(2) HashAggregate(keys=[trx_id#1L], functions=[count(1)], output=[trx_id#1L, count(1)#125L])
+- Exchange hashpartitioning(trx_id#1L, 200), ENSURE_REQUIREMENTS, [plan_id=237]
   +- *(1) HashAggregate(keys=[trx_id#1L], functions=[partial_count(1)], output=[trx_id#1L, count#129L])
      +- *(1) ColumnarToRow
         +- FileScan parquet [trx_id#1L] Batched: tru

                                                                                

In [8]:
# Get count() performance
# Uses the grouped-data count() shortcut instead of an explicit agg().
# Build the query once so the plan printed by explain() belongs to the very
# DataFrame that is executed by the write below.
grouped_count_df = df.groupBy("trx_id").count()

# Label the job (the description is shown in the Spark UI) for this cell only.
spark.sparkContext.setJobDescription("save count()")
try:
    grouped_count_df.explain(True)
    # NOTE(review): the "noop" sink is assumed to execute the whole query while
    # discarding the rows, which makes it suitable for timing -- confirm.
    grouped_count_df.write.format("noop").mode("overwrite").save()
finally:
    # Always clear the description, even on failure, so that jobs started by
    # later cells are not mislabelled as "save count()".
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count#151L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Analyzed Logical Plan ==
trx_id: bigint, count: bigint
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count#151L]
+- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Optimized Logical Plan ==
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count#151L]
+- Project [trx_id#1L]
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Physical Plan ==
*(2) HashAggregate(keys=[trx_id#1L], functions=[count(1)], output=[trx_id#1L, count#151L])
+- Exchange hashpartitioning(trx_id#1L, 200), ENSURE_REQUIREMENTS, [plan_id=308]
   +- *(1) HashAggregate(keys=[trx_id#1L], functions=[partial_count(1)], output=[trx_id#1L, count#155L])
      +- *(1) ColumnarToRow
         +- FileScan parquet [trx_id#1L] Batched: true, DataFilters:

                                                                                

In [None]:
# Get filter + count(*) performance
# Keep only rows whose transacted_at falls in month 11 (November), then count
# rows per trx_id. Build the query once so the plan printed by explain()
# belongs to the very DataFrame that is executed by the write below.
november_rows = df.filter(F.month(F.col("transacted_at")) == 11)
november_count_df = november_rows.groupBy("trx_id").agg(F.count("*"))

# Label the job (the description is shown in the Spark UI) for this cell only.
spark.sparkContext.setJobDescription("save filter-count(*)")
try:
    november_count_df.explain(True)
    # NOTE(review): the "noop" sink is assumed to execute the whole query while
    # discarding the rows, which makes it suitable for timing -- confirm.
    november_count_df.write.format("noop").mode("overwrite").save()
finally:
    # Always clear the description, even on failure, so that jobs started by
    # later cells are not mislabelled as "save filter-count(*)".
    spark.sparkContext.setJobDescription(None)

== Parsed Logical Plan ==
'Aggregate ['trx_id], ['trx_id, count(1) AS count(1)#177L]
+- Filter (month(cast(transacted_at#0 as date)) = 11)
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Analyzed Logical Plan ==
trx_id: bigint, count(1): bigint
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#177L]
+- Filter (month(cast(transacted_at#0 as date)) = 11)
   +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Optimized Logical Plan ==
Aggregate [trx_id#1L], [trx_id#1L, count(1) AS count(1)#177L]
+- Project [trx_id#1L]
   +- Filter (isnotnull(transacted_at#0) AND (month(cast(transacted_at#0 as date)) = 11))
      +- Relation [transacted_at#0,trx_id#1L,retailer_id#2L,description#3,amount#4,city_id#5L] parquet

== Physical Plan ==
*(2) HashAggregate(keys=[trx_id#1L], functions=[count(1)], output=[trx_id#1L, count(1)#177L])
+- Exchange hashpartitioning(trx_id#1L, 200), ENSURE_REQUIRE

                                                                                

In [10]:
# Stop the Spark session now that all benchmark cells have run.
spark.stop()