### PySpark: Handling Missing Values
- Dropping columns
- Dropping rows
- Parameters of the drop functions (`how`, `thresh`, `subset`)
- Handling missing values with the mean, median, and mode

In [None]:
! pip install python-language-server[all]

In [1]:
from pyspark.sql import SparkSession

# Configure a session builder with the application name "Dataframe", then obtain
# the session from it via getOrCreate().
session_builder = SparkSession.builder.appName("Dataframe")
spark = session_builder.getOrCreate()
spark  # bare last expression: the session object is rendered as the cell output

In [39]:
# Load the sample CSV. header=True takes column names from the first row;
# inferSchema=True asks Spark to infer column types instead of reading everything as strings.
csv_reader = spark.read
df_pyspark = csv_reader.csv(
    "test2.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Print the loaded frame as a text table; cells shown as NULL are the missing values.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



## Drop Value

In [None]:
# Default behaviour (how="any"): a row is dropped as soon as it contains at least one null.
df_pyspark.na.drop().show()

# Same result: DataFrame.dropna() is an alias of DataFrame.na.drop().
df_pyspark.dropna().show()

# Same result: how="any" is the default, spelled out explicitly.
df_pyspark.dropna(how="any").show()

# thresh=N keeps only the rows that have at least N non-null values. When thresh is
# given it overrides `how`, so `how` is deliberately not passed here.
# The frame has 4 columns, so thresh=4 happens to give the same result as how="any".
df_pyspark.na.drop(thresh=4).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+-

In [None]:
# how="all": drop a row only when every one of its columns is null.
df_pyspark.dropna(how="all").show()

# subset limits the null check to the listed columns: a row is dropped only when
# "age" is null, whatever the other columns contain. The column name is written
# exactly as it appears in the schema ("age") so the call does not depend on
# Spark's case-insensitive column resolution.
df_pyspark.na.drop(how="any", subset=["age"]).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



## Fill Value

In [31]:
fill_value = "Missing Values"

# The replacement is a string, so only nulls in string columns are filled;
# nulls in the numeric columns are left as they are.
df_pyspark.fillna(value=fill_value).show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|NULL|      NULL| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      NULL|  NULL|
+--------------+----+----------+------+



In [None]:
# Restrict the fill to specific columns. A subset column is only filled when its
# data type matches the type of the fill value; non-matching columns are ignored.
columns_to_fill = ["Name", "age", "Salary"]
df_pyspark.fillna(value=fill_value, subset=columns_to_fill).show()

+--------------+--------------+----------+--------------+
|          Name|           age|Experience|        Salary|
+--------------+--------------+----------+--------------+
|         Krish|            31|        10|         30000|
|     Sudhanshu|            30|         8|         25000|
|         Sunny|            29|         4|         20000|
|          Paul|            24|         3|         20000|
|        Harsha|            21|         1|         15000|
|       Shubham|            23|         2|         18000|
|        Mahesh|Missing Values|      NULL|         40000|
|Missing Values|            34|        10|         38000|
|Missing Values|            36|      NULL|Missing Values|
+--------------+--------------+----------+--------------+



In [None]:
from pyspark.ml.feature import Imputer

In [54]:
strategy_type: list[str] = ["mean", "median", "mode"]
impute_cols: list[str] = ["age", "Experience", "Salary"]

for strategy in strategy_type:
    # One output column per input column, named "<column>_<strategy>_imputed",
    # so the original columns stay visible next to the imputed ones.
    imputer: Imputer = Imputer(
        strategy=strategy,
        inputCols=impute_cols,
        outputCols=[f"{col_name}_{strategy}_imputed" for col_name in impute_cols],
    )

    print(f"{strategy}_imputed")

    # fit() computes the replacement values from the data; transform() then
    # appends the imputed columns to the frame.
    fitted_imputer = imputer.fit(dataset=df_pyspark)
    fitted_imputer.transform(dataset=df_pyspark).show()

mean_imputed
+---------+----+----------+------+----------------+-----------------------+-------------------+
|     Name| age|Experience|Salary|age_mean_imputed|Experience_mean_imputed|Salary_mean_imputed|
+---------+----+----------+------+----------------+-----------------------+-------------------+
|    Krish|  31|        10| 30000|              31|                     10|              30000|
|Sudhanshu|  30|         8| 25000|              30|                      8|              25000|
|    Sunny|  29|         4| 20000|              29|                      4|              20000|
|     Paul|  24|         3| 20000|              24|                      3|              20000|
|   Harsha|  21|         1| 15000|              21|                      1|              15000|
|  Shubham|  23|         2| 18000|              23|                      2|              18000|
|   Mahesh|NULL|      NULL| 40000|              28|                      5|              40000|
|     NULL|  34|        10|