In [0]:
# Echo the SparkSession object to confirm a session is available.
# NOTE(review): `spark` is not defined in any visible cell — presumably it is
# supplied by the notebook runtime; confirm.
spark

In [0]:
# Load the raw employee records: the first line of the file is the header row
# and column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/employee_records.csv")
)

In [0]:
# Print the column names, inferred types and nullability of the loaded data.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- emp_type: string (nullable = true)



#### Partition

In [0]:
# Preview the data before it is written out partitioned by gender.
df.show()

+--------+---+------+---------+--------+
|    name|age|gender|  country|emp_type|
+--------+---+------+---------+--------+
|    John| 34|  Male|   Brazil|   admin|
|    Liam| 46|  Male|    China|employee|
|Isabella| 38|  Male|    Japan|   admin|
| William| 43|  Male|    India|   admin|
|   James| 49|Female|   Brazil| manager|
|     Ava| 35|Female|   Canada|employee|
|    Noah| 57|  Male|   France|   admin|
|  Sophia| 25|Female|  Germany|employee|
|     Ava| 23|  Male|   Canada|employee|
|    Emma| 36|  Male|    Japan| manager|
|    Emma| 27|Female|       UK| manager|
|     Ava| 50|  Male|      USA| manager|
|    Emma| 54|  Male|    China| manager|
|   James| 43|  Male|       UK|employee|
|Isabella| 50|Female|  Germany|employee|
|  Olivia| 52|Female|       UK| manager|
| William| 50|Female|    Japan|   admin|
| William| 21|  Male|   France|employee|
|    Noah| 28|  Male|Australia| manager|
|    Liam| 25|Female|    Japan|   admin|
+--------+---+------+---------+--------+
only showing top 20 rows

In [0]:
# Write the employee data as CSV with one sub-directory per distinct gender
# value (e.g. gender=Female/).
# The save mode has to be set with .mode(): a writer option called "mode" does
# not change the save mode, so the default ("errorifexists") stayed in effect
# and re-running this cell failed once the target directory existed.
# Note that every overwrite produces new part-file names.
(
    df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .partitionBy("gender")
    .save("/FileStore/tables/partitioned_employee_dataset")
)

In [0]:
%fs
ls FileStore/tables/partitioned_employee_dataset/gender=Female/

path,name,size,modificationTime
dbfs:/FileStore/tables/partitioned_employee_dataset/gender=Female/_SUCCESS,_SUCCESS,0,1721047133000
dbfs:/FileStore/tables/partitioned_employee_dataset/gender=Female/_committed_1583854544283189852,_committed_1583854544283189852,111,1721047133000
dbfs:/FileStore/tables/partitioned_employee_dataset/gender=Female/_started_1583854544283189852,_started_1583854544283189852,0,1721047132000
dbfs:/FileStore/tables/partitioned_employee_dataset/gender=Female/part-00000-tid-1583854544283189852-367a5bb8-7318-49cb-aed8-f94a2968167f-3-1.c000.csv,part-00000-tid-1583854544283189852-367a5bb8-7318-49cb-aed8-f94a2968167f-3-1.c000.csv,1287,1721047132000


In [0]:
# Read back only the Female partition.
# Point at the partition directory instead of one part file: part-file names
# embed a per-write id, so a hard-coded file name stops existing as soon as the
# dataset is written again (or the notebook runs in another workspace).
# Spark ignores the "_"-prefixed marker files (_SUCCESS, _started_*, ...) when
# it scans the directory.
path_for_female_dataset = "/FileStore/tables/partitioned_employee_dataset/gender=Female/"

# Use a dedicated name so the full employee DataFrame in `df` is not clobbered.
df_female = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(path_for_female_dataset)
)

# The gender value lives in the directory name, not in the files, so the
# column is absent when the partition directory is read directly.
df_female.show(n=100)

+--------+---+---------+--------+
|    name|age|  country|emp_type|
+--------+---+---------+--------+
|   James| 49|   Brazil| manager|
|     Ava| 35|   Canada|employee|
|  Sophia| 25|  Germany|employee|
|    Emma| 27|       UK| manager|
|Isabella| 50|  Germany|employee|
|  Olivia| 52|       UK| manager|
| William| 50|    Japan|   admin|
|    Liam| 25|    Japan|   admin|
|    John| 34|   Brazil| manager|
| William| 20|   Brazil| manager|
|    John| 40|    China| manager|
| William| 51|    China|employee|
|    Noah| 44|   France|employee|
|   James| 26|    Japan|   admin|
|Isabella| 54|       UK|employee|
|    John| 56|    China|   admin|
|    Liam| 54|   Brazil| manager|
|Isabella| 56|   Brazil| manager|
| William| 41|   Canada|   admin|
|    Emma| 58|Australia| manager|
|     Ava| 60|   France| manager|
|Isabella| 30|   France|employee|
| William| 35|   Canada| manager|
|     Ava| 26|   Brazil|   admin|
|  Sophia| 34|    Japan|   admin|
| William| 40|    China|   admin|
| William| 41|

#### Bucketing

In [0]:
# Reload the original employee CSV (header row, inferred column types) as the
# input for the bucketing section.
df = spark.read.csv(
    "/FileStore/tables/employee_records.csv",
    header="true",
    inferSchema="true",
)

In [0]:
# Display up to 100 rows of the reloaded data.
df.show(n=100)

+--------+---+------+---------+--------+
|    name|age|gender|  country|emp_type|
+--------+---+------+---------+--------+
|    John| 34|  Male|   Brazil|   admin|
|    Liam| 46|  Male|    China|employee|
|Isabella| 38|  Male|    Japan|   admin|
| William| 43|  Male|    India|   admin|
|   James| 49|Female|   Brazil| manager|
|     Ava| 35|Female|   Canada|employee|
|    Noah| 57|  Male|   France|   admin|
|  Sophia| 25|Female|  Germany|employee|
|     Ava| 23|  Male|   Canada|employee|
|    Emma| 36|  Male|    Japan| manager|
|    Emma| 27|Female|       UK| manager|
|     Ava| 50|  Male|      USA| manager|
|    Emma| 54|  Male|    China| manager|
|   James| 43|  Male|       UK|employee|
|Isabella| 50|Female|  Germany|employee|
|  Olivia| 52|Female|       UK| manager|
| William| 50|Female|    Japan|   admin|
| William| 21|  Male|   France|employee|
|    Noah| 28|  Male|Australia| manager|
|    Liam| 25|Female|    Japan|   admin|
|    John| 34|Female|   Brazil| manager|
|    John| 60|  

In [0]:
# Save the data as a CSV-backed table named "bucket_division", stored under
# /FileStore/tables/bucket_by_division/ and split into 2 buckets by emp_type.
# The save mode has to be set with .mode(): a writer option called "mode" does
# not change the save mode, so the default ("errorifexists") stayed in effect
# and re-running this cell failed once the table existed.
# Note that every overwrite produces new part-file names.
(
    df.write
    .format("csv")
    .option("header", "true")
    .option("path", "/FileStore/tables/bucket_by_division/")
    .mode("overwrite")
    .bucketBy(2, "emp_type")
    .saveAsTable("bucket_division")
)

In [0]:
%fs
ls /FileStore/tables/bucket_by_division/

path,name,size,modificationTime
dbfs:/FileStore/tables/bucket_by_division/_SUCCESS,_SUCCESS,0,1721050592000
dbfs:/FileStore/tables/bucket_by_division/_committed_8648901029701791816,_committed_8648901029701791816,212,1721050592000
dbfs:/FileStore/tables/bucket_by_division/_started_8648901029701791816,_started_8648901029701791816,0,1721050592000
dbfs:/FileStore/tables/bucket_by_division/part-00000-tid-8648901029701791816-f57a438f-7dbc-4837-922a-e56ed2616ef1-19-1_00000.c000.csv,part-00000-tid-8648901029701791816-f57a438f-7dbc-4837-922a-e56ed2616ef1-19-1_00000.c000.csv,2003,1721050592000
dbfs:/FileStore/tables/bucket_by_division/part-00000-tid-8648901029701791816-f57a438f-7dbc-4837-922a-e56ed2616ef1-19-2_00001.c000.csv,part-00000-tid-8648901029701791816-f57a438f-7dbc-4837-922a-e56ed2616ef1-19-2_00001.c000.csv,1018,1721050592000


In [0]:
# Load the first bucket (bucket id 00000) of the bucketed output.
# The part files are located by their bucket-id suffix instead of by full name:
# the full name embeds a per-write id, so a hard-coded name breaks whenever the
# table is written again or the notebook runs in another workspace.
# `dbutils` is the Databricks notebook utility object (this notebook already
# relies on the Databricks-only %fs magic).
bucket_dir = "/FileStore/tables/bucket_by_division/"
bucket0_paths = [
    entry.path
    for entry in dbutils.fs.ls(bucket_dir)
    if entry.name.startswith("part-") and "_00000.c" in entry.name
]
df_part1 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(bucket0_paths)
)

In [0]:
# Preview the rows stored in the first bucket file.
df_part1.show()

+--------+---+------+---------+--------+
|    name|age|gender|  country|emp_type|
+--------+---+------+---------+--------+
|    Liam| 46|  Male|    China|employee|
|   James| 49|Female|   Brazil| manager|
|     Ava| 35|Female|   Canada|employee|
|  Sophia| 25|Female|  Germany|employee|
|     Ava| 23|  Male|   Canada|employee|
|    Emma| 36|  Male|    Japan| manager|
|    Emma| 27|Female|       UK| manager|
|     Ava| 50|  Male|      USA| manager|
|    Emma| 54|  Male|    China| manager|
|   James| 43|  Male|       UK|employee|
|Isabella| 50|Female|  Germany|employee|
|  Olivia| 52|Female|       UK| manager|
| William| 21|  Male|   France|employee|
|    Noah| 28|  Male|Australia| manager|
|    John| 34|Female|   Brazil| manager|
| William| 20|Female|   Brazil| manager|
|    John| 40|Female|    China| manager|
|    Noah| 46|  Male|       UK| manager|
|  Olivia| 55|  Male|  Germany| manager|
|    Noah| 38|  Male|    China|employee|
+--------+---+------+---------+--------+
only showing top 20 rows

In [0]:
# Number of rows that landed in the first bucket file.
df_part1.count()

Out[24]: 65

In [0]:
# Load the second bucket (bucket id 00001) of the bucketed output.
# The part files are located by their bucket-id suffix instead of by full name:
# the full name embeds a per-write id, so a hard-coded name breaks whenever the
# table is written again or the notebook runs in another workspace.
# `dbutils` is the Databricks notebook utility object (this notebook already
# relies on the Databricks-only %fs magic).
bucket_dir = "/FileStore/tables/bucket_by_division/"
bucket1_paths = [
    entry.path
    for entry in dbutils.fs.ls(bucket_dir)
    if entry.name.startswith("part-") and "_00001.c" in entry.name
]
df_part2 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(bucket1_paths)
)

In [0]:
# Number of rows that landed in the second bucket file.
df_part2.count()

Out[26]: 35

In [0]:
# Preview the first bucket file again.
# NOTE(review): this repeats an earlier df_part1.show() cell — presumably kept
# so both buckets can be compared side by side; remove if not needed.
df_part1.show()

+--------+---+------+---------+--------+
|    name|age|gender|  country|emp_type|
+--------+---+------+---------+--------+
|    Liam| 46|  Male|    China|employee|
|   James| 49|Female|   Brazil| manager|
|     Ava| 35|Female|   Canada|employee|
|  Sophia| 25|Female|  Germany|employee|
|     Ava| 23|  Male|   Canada|employee|
|    Emma| 36|  Male|    Japan| manager|
|    Emma| 27|Female|       UK| manager|
|     Ava| 50|  Male|      USA| manager|
|    Emma| 54|  Male|    China| manager|
|   James| 43|  Male|       UK|employee|
|Isabella| 50|Female|  Germany|employee|
|  Olivia| 52|Female|       UK| manager|
| William| 21|  Male|   France|employee|
|    Noah| 28|  Male|Australia| manager|
|    John| 34|Female|   Brazil| manager|
| William| 20|Female|   Brazil| manager|
|    John| 40|Female|    China| manager|
|    Noah| 46|  Male|       UK| manager|
|  Olivia| 55|  Male|  Germany| manager|
|    Noah| 38|  Male|    China|employee|
+--------+---+------+---------+--------+
only showing top 20 rows

In [0]:
# Preview the rows stored in the second bucket file.
df_part2.show()

+--------+---+------+---------+--------+
|    name|age|gender|  country|emp_type|
+--------+---+------+---------+--------+
|    John| 34|  Male|   Brazil|   admin|
|Isabella| 38|  Male|    Japan|   admin|
| William| 43|  Male|    India|   admin|
|    Noah| 57|  Male|   France|   admin|
| William| 50|Female|    Japan|   admin|
|    Liam| 25|Female|    Japan|   admin|
|    John| 60|  Male|    Japan|   admin|
|   James| 26|Female|    Japan|   admin|
|    Liam| 32|  Male|    China|   admin|
|    John| 56|Female|    China|   admin|
| William| 30|  Male|    India|   admin|
| William| 41|Female|   Canada|   admin|
|  Sophia| 29|  Male|  Germany|   admin|
|     Ava| 26|Female|   Brazil|   admin|
|  Sophia| 34|Female|    Japan|   admin|
| William| 40|Female|    China|   admin|
|    John| 54|  Male|Australia|   admin|
|Isabella| 21|Female|   Canada|   admin|
|  Olivia| 47|Female|   Brazil|   admin|
|    John| 26|  Male|   France|   admin|
+--------+---+------+---------+--------+
only showing top 20 rows