# COUNT(1) vs COUNT(*) vs COUNT(COL_NAME)

In [None]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build the session (or pick up an already-running one) using the app name
# and master URL configured below.
spark = (
    SparkSession.builder
    .appName("Count(1) vs Count(*)")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Bare expression: as the last statement of the cell, the notebook displays it.
spark

In [None]:
# Let's create a simple Python decorator - {get_time} - to measure execution time
# If you are not familiar with Python decorators, see: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, print how long it took, and return ``func``.

    Intended to be used as ``@get_time`` on a zero-argument function: the
    function is executed immediately at decoration time and the elapsed
    time is printed in milliseconds.

    Args:
        func: A callable taking no arguments. Its return value is discarded.

    Returns:
        The original ``func`` unchanged, so the decorated name stays bound
        to a callable instead of being rebound to ``None``.
    """
    def inner_get_time() -> str:
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Execution time: {(end_time - start_time) * 1000} ms"

    # Execute and report right away; the timing happens at decoration time.
    print(inner_get_time())
    return func

In [None]:
# Let's read the CSV into a DataFrame and take a look at the data
# Read the sales CSV with the "header" option enabled.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dataset/sales.csv")
)

# Show a preview of the loaded rows.
df.show()

In [None]:
# Get count(1) performance
from pyspark.sql.functions import lit, count

@get_time
def x():
    # Aggregate per trx_id, counting a literal 1 for every row.
    counted = df.groupBy("trx_id").agg(count(lit(1)))
    # Write to the "noop" format so the job runs without producing output files.
    counted.write.format("noop").mode("overwrite").save()

In [None]:
# Get count(col_name) performance
@get_time
def x():
    # Aggregate per trx_id, counting the city_id column.
    counted = df.groupBy("trx_id").agg(count("city_id"))
    # Write to the "noop" format so the job runs without producing output files.
    counted.write.format("noop").mode("overwrite").save()

In [None]:
# Get count(*) performance
@get_time
def x():
    # Aggregate per trx_id using count("*").
    counted = df.groupBy("trx_id").agg(count("*"))
    # Write to the "noop" format so the job runs without producing output files.
    counted.write.format("noop").mode("overwrite").save()

In [None]:
# Print the extended explain output for the count(*) aggregation
(
    df.groupBy("trx_id")
    .agg(count("*"))
    .explain(extended=True)
)

In [None]:
# Print the extended explain output for the count(1) aggregation
(
    df.groupBy("trx_id")
    .agg(count(lit(1)))
    .explain(extended=True)
)

In [None]:
# Print the extended explain output for the count(col_name) aggregation
(
    df.groupBy("trx_id")
    .agg(count("city_id"))
    .explain(extended=True)
)