## No. of Cores

In [0]:
# Default parallelism reported by the SparkContext; this notebook reads it as
# the number of cores available to the cluster.
default_parallelism = spark.sparkContext.defaultParallelism
default_parallelism

Out[1]: 8

## max size of Partition

In [0]:
import re

# Multipliers for the unit suffixes that may follow the number in a Spark
# byte-size setting. Spark treats them as binary multiples (1k = 1024 bytes).
_BYTE_UNIT_FACTORS = {
    "": 1, "b": 1,
    "k": 1024, "kb": 1024,
    "m": 1024**2, "mb": 1024**2,
    "g": 1024**3, "gb": 1024**3,
    "t": 1024**4, "tb": 1024**4,
    "p": 1024**5, "pb": 1024**5,
}


def conf_size_to_mib(size_str):
    """Convert a Spark byte-size setting to mebibytes.

    Simply deleting every non-digit character only works when the value is
    expressed in bytes ("134217728b"); a value such as "128m" would be read as
    128 bytes. This helper applies the unit suffix instead.

    Args:
        size_str: The setting as returned by ``spark.conf.get``, e.g.
            "134217728", "134217728b", "128m" or "128MB".

    Returns:
        The size in MiB as a float.

    Raises:
        ValueError: If the value is not a whole number followed by an optional
            known unit suffix.
    """
    match = re.fullmatch(r"\s*([0-9]+)\s*([a-z]*)\s*", str(size_str).lower())
    if match is None or match.group(2) not in _BYTE_UNIT_FACTORS:
        raise ValueError(f"Unrecognised byte size: {size_str!r}")
    digits, unit = match.groups()
    return int(digits) * _BYTE_UNIT_FACTORS[unit] / (1024**2)


# Maximum partition size used when reading files, shown in MB.
parts = spark.conf.get("spark.sql.files.maxPartitionBytes")
parts_size = conf_size_to_mib(parts)
parts_size

Out[2]: 128.0

## Open Cost 

In [0]:
# The setting comes back as a string that may carry a unit suffix (for example
# "4194304", "4194304b" or "4m"). Apply the suffix rather than just stripping
# it, so that a value like "4m" is reported as 4 MB and not as 4 bytes.
buff = spark.conf.get("spark.sql.files.openCostInBytes")
buff_match = re.fullmatch(r"\s*([0-9]+)\s*([a-z]*)\s*", buff.lower())

# Power of 1024 for each accepted suffix (Spark byte sizes are binary multiples).
buff_unit_powers = {
    "": 0, "b": 0,
    "k": 1, "kb": 1,
    "m": 2, "mb": 2,
    "g": 3, "gb": 3,
    "t": 4, "tb": 4,
    "p": 5, "pb": 5,
}
if buff_match is None or buff_match.group(2) not in buff_unit_powers:
    raise ValueError(f"Unrecognised byte size for openCostInBytes: {buff!r}")

buff_bytes = int(buff_match.group(1)) * 1024 ** buff_unit_powers[buff_match.group(2)]
buff_size = buff_bytes / (1024**2)  # open cost in MB
buff_size

Out[3]: 4.0

## As of Spark 3.x, AQE is enabled by default
**AQE (Adaptive Query Execution) automatically handles data skew across partitions, dynamically picks the appropriate join strategy, and dynamically keeps shuffling to a minimum.**

*We might also use the SALTING technique, i.e. manually splitting a larger partition into multiple partitions by creating a pseudo key.*

In [0]:
# Read the adaptive-query-execution flag; the value is returned as a string.
aqe_enabled = spark.conf.get("spark.sql.adaptive.enabled")
aqe_enabled

Out[4]: 'true'

In [0]:
# List the files in the source directory that is loaded further down.
source_dir_listing = dbutils.fs.ls('dbfs:/FileStore/ExploreInternals/')
source_dir_listing

Out[5]: [FileInfo(path='dbfs:/FileStore/ExploreInternals/Calendar.csv', name='Calendar.csv', size=494637, modificationTime=1714129024000),
 FileInfo(path='dbfs:/FileStore/ExploreInternals/Customers.csv', name='Customers.csv', size=3061458, modificationTime=1714129025000),
 FileInfo(path='dbfs:/FileStore/ExploreInternals/Products.csv', name='Products.csv', size=500845, modificationTime=1714129023000),
 FileInfo(path='dbfs:/FileStore/ExploreInternals/Promotions.csv', name='Promotions.csv', size=4347, modificationTime=1714129023000),
 FileInfo(path='dbfs:/FileStore/ExploreInternals/Stores.csv', name='Stores.csv', size=29575, modificationTime=1714129024000)]

## read all csv files in a directory as a single dataframe
***Check for the number of jobs created***

In [0]:
# Load every CSV file in the directory into one DataFrame, treating the first
# line as the header. No schema is supplied here.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('dbfs:/FileStore/ExploreInternals/*.csv')
)

***by default it scans all the records and returns one single row with count of records -----> which is one job. (check in view)***

**if schema is specified in the code, then no jobs are created**

In [0]:
# Render only the first five rows so the preview stays small.
preview_rows = df.limit(5)
display(preview_rows)

Customer_Key,Customer_Name,Birth_Date,Marital_Status,Gender,Yearly_Income,Total_Children,Number_of_Children_at_Home,Education,Occupation,House_Owner,Number_of_Cars_Owned,Date_of_First_Purchase,Customer_Type,Company_Name,Country,State
25,"Xie, Russell",1978-09-17 00:00:00.000,M,M,60000,0,0,Partial College,Skilled Manual,1,2,2003-12-29 00:00:00.000,Person,,United States,California
37,"Russell, Jennifer",1978-12-18 00:00:00.000,M,F,60000,0,0,Partial College,Skilled Manual,1,2,2003-07-26 00:00:00.000,Person,,United States,California
42,"Carter, Amanda",1977-10-16 00:00:00.000,M,F,60000,0,0,Partial College,Skilled Manual,1,2,2003-07-18 00:00:00.000,Person,,United States,California
44,"Simmons, Nathan",1976-02-24 00:00:00.000,M,M,60000,0,0,Partial College,Skilled Manual,1,2,2003-11-09 00:00:00.000,Person,,United States,California
929,"Morris, Isabella",1978-09-07 00:00:00.000,M,F,60000,0,0,Partial College,Skilled Manual,1,2,2003-12-02 00:00:00.000,Person,,United States,California


## Rows X Columns

In [0]:
# Shape of the combined DataFrame, printed as "<rows> X <columns>".
row_total = df.count()
column_total = len(df.columns)
print(row_total, 'X', column_total)

24276 X 17


## How many partitions ?

In [0]:
# Number of partitions backing the DataFrame, read from its underlying RDD.
num_partitions = df.rdd.getNumPartitions()
num_partitions

Out[10]: 5

## How many records in each partition

In [0]:
from pyspark.sql.functions import spark_partition_id,col

# Tag every row with the id of the partition that holds it, then count the
# rows per partition id.
rows_with_partition_id = df.withColumn('partitions', spark_partition_id())
rows_per_partition = rows_with_partition_id.groupBy('partitions').count()

# Keep the partition id and expose the count under a descriptive column name.
parts_count = rows_per_partition.select(
    col('partitions'),
    col('count').alias('Partition_Count'),
)
parts_count.show()

+----------+---------------+
|partitions|Partition_Count|
+----------+---------------+
|         0|          18869|
|         1|           2517|
|         2|           2556|
|         3|            306|
|         4|             28|
+----------+---------------+



***This is SKEWED DATA***

In [0]:
# Import the functions module under an alias instead of `from ... import sum`,
# which would replace Python's built-in sum() for the rest of the notebook.
from pyspark.sql import functions as F

# Adding up the per-partition counts should reproduce the DataFrame's total
# row count shown earlier.
parts_count.select(F.sum(parts_count.Partition_Count).alias('TotalRecs')).show()

+---------+
|TotalRecs|
+---------+
|    24276|
+---------+

