In [18]:
# Bootstrap step that runs before the pyspark imports in the next cell.
# NOTE(review): presumably findspark locates the local Spark installation
# (e.g. via SPARK_HOME) and makes pyspark importable — confirm for this environment.
import findspark
findspark.init()

In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, column

In [20]:
# Start the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("PartitionManagement")
    .getOrCreate()
)
spark

In [21]:
# Sample records as (id, name, age) tuples; names and ages repeat on purpose
# from id 21 onwards.
# Add more data to simulate large dataset
data = [
    (1, 'Alice', 30), (2, 'Bob', 25), (3, 'Charlie', 35), (4, 'David', 40), (5, 'Eve', 22),
    (6, 'Frank', 28), (7, 'Grace', 32), (8, 'Hannah', 26), (9, 'Ivy', 24), (10, 'Jack', 27),
    (11, 'Kara', 29), (12, 'Lian', 31), (13, 'Mia', 33), (14, 'Nina', 34), (15, 'Oliver', 36),
    (16, 'Paul', 37), (17, 'Quinn', 38), (18, 'Rita', 39), (19, 'Steve', 41), (20, 'Tom', 42),
    (21, 'Nina', 34), (22, 'Oliver', 36), (23, 'Paul', 37), (24, 'Quinn', 38), (25, 'Rita', 39),
    (26, 'Steve', 41), (27, 'Tom', 42), (28, 'Tom', 42), (29, 'Nina', 34), (30, 'Oliver', 36),
    (31, 'Paul', 37), (32, 'Quinn', 38), (33, 'Rita', 39), (34, 'Steve', 41), (35, 'Tom', 42),
]

# Column names, in the same order as the tuple fields
columns = ['id', 'name', 'age']

# Build the DataFrame from the sample records and preview it
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  3|Charlie| 35|
|  4|  David| 40|
|  5|    Eve| 22|
|  6|  Frank| 28|
|  7|  Grace| 32|
|  8| Hannah| 26|
|  9|    Ivy| 24|
| 10|   Jack| 27|
| 11|   Kara| 29|
| 12|   Lian| 31|
| 13|    Mia| 33|
| 14|   Nina| 34|
| 15| Oliver| 36|
| 16|   Paul| 37|
| 17|  Quinn| 38|
| 18|   Rita| 39|
| 19|  Steve| 41|
| 20|    Tom| 42|
+---+-------+---+
only showing top 20 rows



In [22]:
# Redistribute the rows of df across 20 partitions
df_repartitioned = df.repartition(numPartitions=20)
df_repartitioned

DataFrame[id: bigint, name: string, age: bigint]

In [23]:
# Read the partition count back from the underlying RDD to confirm that the
# repartitioned DataFrame has the 20 partitions requested above
num_partitions = df_repartitioned.rdd.getNumPartitions()
num_partitions

20

In [24]:
# Collect the rows of every partition.
# glom() groups the rows of each partition into one list, so the collected
# result is a list of lists whose position is the partition index.

partitions_data = (
    df_repartitioned.rdd
    .glom()
    .collect()
)
partitions_data

[[],
 [Row(id=30, name='Oliver', age=36)],
 [Row(id=34, name='Steve', age=41)],
 [Row(id=29, name='Nina', age=34)],
 [Row(id=8, name='Hannah', age=26), Row(id=33, name='Rita', age=39)],
 [Row(id=6, name='Frank', age=28), Row(id=35, name='Tom', age=42)],
 [Row(id=7, name='Grace', age=32),
  Row(id=21, name='Nina', age=34),
  Row(id=31, name='Paul', age=37)],
 [Row(id=5, name='Eve', age=22),
  Row(id=23, name='Paul', age=37),
  Row(id=32, name='Quinn', age=38)],
 [Row(id=14, name='Nina', age=34), Row(id=24, name='Quinn', age=38)],
 [Row(id=2, name='Bob', age=25),
  Row(id=11, name='Kara', age=29),
  Row(id=13, name='Mia', age=33),
  Row(id=22, name='Oliver', age=36)],
 [Row(id=3, name='Charlie', age=35),
  Row(id=9, name='Ivy', age=24),
  Row(id=15, name='Oliver', age=36)],
 [Row(id=1, name='Alice', age=30),
  Row(id=12, name='Lian', age=31),
  Row(id=16, name='Paul', age=37),
  Row(id=27, name='Tom', age=42)],
 [Row(id=4, name='David', age=40),
  Row(id=10, name='Jack', age=27),
  Row(i

In [25]:
# Print each partition's ID next to the rows it holds.
# partitions_data is ordered by partition, so the enumerate() position is the
# partition ID; empty partitions print as an empty list.
for idx, partition in enumerate(partitions_data):
    print(f"Partition ID: {idx}, Data: {partition}")

Partition ID: 0, Data: []
Partition ID: 1, Data: [Row(id=30, name='Oliver', age=36)]
Partition ID: 2, Data: [Row(id=34, name='Steve', age=41)]
Partition ID: 3, Data: [Row(id=29, name='Nina', age=34)]
Partition ID: 4, Data: [Row(id=8, name='Hannah', age=26), Row(id=33, name='Rita', age=39)]
Partition ID: 5, Data: [Row(id=6, name='Frank', age=28), Row(id=35, name='Tom', age=42)]
Partition ID: 6, Data: [Row(id=7, name='Grace', age=32), Row(id=21, name='Nina', age=34), Row(id=31, name='Paul', age=37)]
Partition ID: 7, Data: [Row(id=5, name='Eve', age=22), Row(id=23, name='Paul', age=37), Row(id=32, name='Quinn', age=38)]
Partition ID: 8, Data: [Row(id=14, name='Nina', age=34), Row(id=24, name='Quinn', age=38)]
Partition ID: 9, Data: [Row(id=2, name='Bob', age=25), Row(id=11, name='Kara', age=29), Row(id=13, name='Mia', age=33), Row(id=22, name='Oliver', age=36)]
Partition ID: 10, Data: [Row(id=3, name='Charlie', age=35), Row(id=9, name='Ivy', age=24), Row(id=15, name='Oliver', age=36)]
Par

In [26]:
# Drop the rows held in the trailing partitions using mapPartitionsWithIndex,
# which passes each partition's index together with an iterator over its rows.
# With 20 partitions the indices run 0-19, so the default cutoff keeps
# partitions 0-15 and empties partitions 16-19.

def filter_partition_data(index, iterator, keep_below=16):
    """Yield a partition's rows only if its index is below ``keep_below``.

    Args:
        index: Zero-based partition index supplied by mapPartitionsWithIndex.
        iterator: Iterator over the rows of that partition.
        keep_below: Exclusive upper bound on the partition indices to keep.
            Defaults to 16, i.e. partitions 0-15 are kept.

    Yields:
        Every row of the partition when ``index < keep_below``; nothing
        otherwise, so a filtered-out partition ends up empty.
    """
    if index < keep_below:
        yield from iterator
        

In [27]:
# Run the filter over every partition of the repartitioned data: rows in
# partitions 0-15 pass through, rows in partitions 16-19 are dropped

filtered_rdd = df_repartitioned.rdd.mapPartitionsWithIndex(
    filter_partition_data
)

In [28]:
# Convert the filtered RDD back to a DataFrame.
# The schema is taken from the DataFrame the RDD was derived from rather than
# from the global `columns` list: `columns` is rebound to a 4-column list in a
# later cell, so re-running this cell afterwards would no longer match the rows.
# An explicit schema also means Spark does not have to infer column types from
# the data, which cannot work when the filter leaves no rows at all.
filtered_df = spark.createDataFrame(filtered_rdd, schema=df_repartitioned.schema)
filtered_df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
| 30| Oliver| 36|
| 34|  Steve| 41|
| 29|   Nina| 34|
|  8| Hannah| 26|
| 33|   Rita| 39|
|  6|  Frank| 28|
| 35|    Tom| 42|
|  7|  Grace| 32|
| 21|   Nina| 34|
| 31|   Paul| 37|
|  5|    Eve| 22|
| 23|   Paul| 37|
| 32|  Quinn| 38|
| 14|   Nina| 34|
| 24|  Quinn| 38|
|  2|    Bob| 25|
| 11|   Kara| 29|
| 13|    Mia| 33|
| 22| Oliver| 36|
|  3|Charlie| 35|
+---+-------+---+
only showing top 20 rows



In [29]:
# Small dataset with deliberate gaps: missing names, ages and salaries (None)
# plus one negative salary, used by the cleaning cells that follow
data = [
    (1, 'Alice', 25, None),
    (2, 'Bob', None, 4000),
    (3, 'Charlie', 30, 4500),
    (4, None, None, 5000),
    (5, 'Eve', 28, -1000),
]
columns = ['id', 'name', 'age', 'salary']
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-------+----+------+
| id|   name| age|salary|
+---+-------+----+------+
|  1|  Alice|  25|  NULL|
|  2|    Bob|NULL|  4000|
|  3|Charlie|  30|  4500|
|  4|   NULL|NULL|  5000|
|  5|    Eve|  28| -1000|
+---+-------+----+------+



In [30]:
# Keep only the rows that are complete: a row is removed as soon as any one
# of its columns is null

df_dropped = df.dropna(how='any')
df_dropped.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  3|Charlie| 30|  4500|
|  5|    Eve| 28| -1000|
+---+-------+---+------+



In [31]:
# Replace nulls with a per-column default instead of dropping the rows

df_filled = df.fillna({
    'age': 0,
    'salary': 3000,
    'name': 'Unknown',
})
df_filled.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  1|  Alice| 25|  3000|
|  2|    Bob|  0|  4000|
|  3|Charlie| 30|  4500|
|  4|Unknown|  0|  5000|
|  5|    Eve| 28| -1000|
+---+-------+---+------+



In [34]:
# Treat a negative salary as invalid: overwrite it with 0 and leave every
# other salary unchanged

df_invallid_replaced = df_filled.withColumn(
    'salary',
    when(col('salary') < 0, 0).otherwise(col('salary')),
)
df_invallid_replaced.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  1|  Alice| 25|  3000|
|  2|    Bob|  0|  4000|
|  3|Charlie| 30|  4500|
|  4|Unknown|  0|  5000|
|  5|    Eve| 28|     0|
+---+-------+---+------+



In [35]:
# Guard against age values that are not usable as integers.
# An age is kept when casting it to int gives a non-null result; otherwise
# (null age, or a failed cast) it is replaced with 0. No rows are removed, and
# because this starts from the raw df, nulls in other columns are left as-is.

df_invalid_age_handled = df.withColumn(
    'age',
    when(col('age').cast('int').isNotNull(), col('age')).otherwise(0),
)
df_invalid_age_handled.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  1|  Alice| 25|  NULL|
|  2|    Bob|  0|  4000|
|  3|Charlie| 30|  4500|
|  4|   NULL|  0|  5000|
|  5|    Eve| 28| -1000|
+---+-------+---+------+



In [36]:
# Fill nulls in the age column only; nulls in every other column stay untouched

df_age_filled = df.fillna(0, subset=['age'])
df_age_filled.show()

+---+-------+---+------+
| id|   name|age|salary|
+---+-------+---+------+
|  1|  Alice| 25|  NULL|
|  2|    Bob|  0|  4000|
|  3|Charlie| 30|  4500|
|  4|   NULL|  0|  5000|
|  5|    Eve| 28| -1000|
+---+-------+---+------+



In [43]:
# Count the nulls in each column of the raw DataFrame (before cleaning).
# when(...) without otherwise() is null for non-matching rows and count()
# skips nulls, so each count is the number of rows where the column is null.

df.select(
    [
        count(when(col(c).isNull(), c)).alias(c)
        for c in df.columns
    ]
).show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  0|   1|  2|     1|
+---+----+---+------+



In [44]:
# Count the nulls in each column again, this time on the cleaned DataFrame
# (nulls filled, negative salary replaced)

df_invallid_replaced.select(
    [
        count(when(col(c).isNull(), c)).alias(c)
        for c in df_invallid_replaced.columns
    ]
).show()

+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  0|   0|  0|     0|
+---+----+---+------+

