# PySpark: Handling Missing Values
- Dropping columns
- Dropping rows
- Parameters of the row-dropping functions (`how`, `thresh`, `subset`)
- Handling missing values with a constant, or with the column mean, median, or mode

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) the Spark session that the later cells read data through.
spark = (
    SparkSession.builder
    .appName('Missing values')
    .getOrCreate()
)

In [4]:
# Load the sample CSV: the first row supplies the column names and the
# column types are inferred from the data.
df_p = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)
df_p.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|shambhu|  33|        10| 20000|
| mahesh|  34|         8| 35000|
| natraj|  35|         5| 25000|
| kishor|  28|         3| 30000|
| vishal|  29|         5| 15000|
|vaibhav|  27|         2| 18000|
|      a|NULL|      NULL| 40000|
|   NULL|  30|        10| 80000|
|   NULL|  43|      NULL|  NULL|
+-------+----+----------+------+



In [5]:
# Drop the columns

In [6]:
# drop() returns a new DataFrame without the named column; df_p is not modified
# (later cells still show the Salary column).
df_p.drop('Salary').show()

+-------+----+----------+
|   Name| age|Experience|
+-------+----+----------+
|shambhu|  33|        10|
| mahesh|  34|         8|
| natraj|  35|         5|
| kishor|  28|         3|
| vishal|  29|         5|
|vaibhav|  27|         2|
|      a|NULL|      NULL|
|   NULL|  30|        10|
|   NULL|  43|      NULL|
+-------+----+----------+



In [7]:
# Drop rows: with no arguments, na.drop() removes every row that contains
# at least one null, leaving only the six fully populated rows.
df_p.na.drop().show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| mahesh| 34|         8| 35000|
| natraj| 35|         5| 25000|
| kishor| 28|         3| 30000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [8]:
# how='any' drops a row as soon as any one of its columns is null; the result
# here is the same six rows as the no-argument call.
df_p.na.drop(how='any').show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| mahesh| 34|         8| 35000|
| natraj| 35|         5| 25000|
| kishor| 28|         3| 30000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
+-------+---+----------+------+



In [9]:
# how='all' drops a row only when every column is null; no row in this data is
# entirely null, so all nine rows are kept.
df_p.na.drop(how='all').show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|shambhu|  33|        10| 20000|
| mahesh|  34|         8| 35000|
| natraj|  35|         5| 25000|
| kishor|  28|         3| 30000|
| vishal|  29|         5| 15000|
|vaibhav|  27|         2| 18000|
|      a|NULL|      NULL| 40000|
|   NULL|  30|        10| 80000|
|   NULL|  43|      NULL|  NULL|
+-------+----+----------+------+



In [10]:
## thresholds 
# thresh=N keeps a row only if it has at least N non-null values; rows with fewer are dropped (`how` is ignored when thresh is set).

In [5]:
# thresh=2 keeps only rows that have at least 2 non-null values.
# `how` is deliberately omitted: PySpark ignores it whenever thresh is given,
# so passing how='any' here would only suggest a filter that is not applied.
df_p.na.drop(thresh=2).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|shambhu|  33|        10| 20000|
| mahesh|  34|         8| 35000|
| natraj|  35|         5| 25000|
| kishor|  28|         3| 30000|
| vishal|  29|         5| 15000|
|vaibhav|  27|         2| 18000|
|      a|NULL|      NULL| 40000|
|   NULL|  30|        10| 80000|
+-------+----+----------+------+



In [6]:
# thresh=3 keeps only rows that have at least 3 non-null values, so the row
# with just one non-null value (Salary) and the row with just one (age) are dropped.
# `how` is deliberately omitted: PySpark ignores it whenever thresh is given.
df_p.na.drop(thresh=3).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| mahesh| 34|         8| 35000|
| natraj| 35|         5| 25000|
| kishor| 28|         3| 30000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
|   NULL| 30|        10| 80000|
+-------+---+----------+------+



In [16]:
## subset 
# Drop a record only when one of the columns listed in `subset` is null; nulls in other columns are ignored.

In [17]:
# Only the 'age' column is checked: the row with a null age is dropped, while
# rows whose nulls are in Name, Experience or Salary are kept.
df_p.na.drop(how='any', subset = ['age']).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|shambhu| 33|        10| 20000|
| mahesh| 34|         8| 35000|
| natraj| 35|         5| 25000|
| kishor| 28|         3| 30000|
| vishal| 29|         5| 15000|
|vaibhav| 27|         2| 18000|
|   NULL| 30|        10| 80000|
|   NULL| 43|      NULL|  NULL|
+-------+---+----------+------+



In [18]:
# Filling the Missing values 

In [19]:
# df_p was loaded with inferSchema=True, so only Name is a string column.
# A string fill value is applied to string columns only: Name is filled, while
# the nulls in age, Experience and Salary are left as NULL.
df_p.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|       shambhu|  33|        10| 20000|
|        mahesh|  34|         8| 35000|
|        natraj|  35|         5| 25000|
|        kishor|  28|         3| 30000|
|        vishal|  29|         5| 15000|
|       vaibhav|  27|         2| 18000|
|             a|NULL|      NULL| 40000|
|Missing Values|  30|        10| 80000|
|Missing Values|  43|      NULL|  NULL|
+--------------+----+----------+------+



In [20]:
# Re-read the same file without schema inference; every column is then loaded
# as a string (see the DataFrame repr below).
df = spark.read.csv('test2.csv', header=True, inferSchema=False)
df

DataFrame[Name: string, age: string, Experience: string, Salary: string]

In [21]:
# With every column typed as string, the string fill value now replaces the
# nulls in all four columns.
df.na.fill('Missing Values').show()

+--------------+--------------+--------------+--------------+
|          Name|           age|    Experience|        Salary|
+--------------+--------------+--------------+--------------+
|       shambhu|            33|            10|         20000|
|        mahesh|            34|             8|         35000|
|        natraj|            35|             5|         25000|
|        kishor|            28|             3|         30000|
|        vishal|            29|             5|         15000|
|       vaibhav|            27|             2|         18000|
|             a|Missing Values|Missing Values|         40000|
|Missing Values|            30|            10|         80000|
|Missing Values|            43|Missing Values|Missing Values|
+--------------+--------------+--------------+--------------+



In [22]:
# Restrict the fill to a single column by naming it in `subset`;
# nulls in the other columns are left untouched.
df.na.fill('Missing Values', subset='age').show()

+-------+--------------+----------+------+
|   Name|           age|Experience|Salary|
+-------+--------------+----------+------+
|shambhu|            33|        10| 20000|
| mahesh|            34|         8| 35000|
| natraj|            35|         5| 25000|
| kishor|            28|         3| 30000|
| vishal|            29|         5| 15000|
|vaibhav|            27|         2| 18000|
|      a|Missing Values|      NULL| 40000|
|   NULL|            30|        10| 80000|
|   NULL|            43|      NULL|  NULL|
+-------+--------------+----------+------+



In [23]:
# `subset` also accepts a list, so several columns can be filled in one call.
columns_to_fill = ['age', 'Experience']
df.na.fill('Missing Values', subset=columns_to_fill).show()

+-------+--------------+--------------+------+
|   Name|           age|    Experience|Salary|
+-------+--------------+--------------+------+
|shambhu|            33|            10| 20000|
| mahesh|            34|             8| 35000|
| natraj|            35|             5| 25000|
| kishor|            28|             3| 30000|
| vishal|            29|             5| 15000|
|vaibhav|            27|             2| 18000|
|      a|Missing Values|Missing Values| 40000|
|   NULL|            30|            10| 80000|
|   NULL|            43|Missing Values|  NULL|
+-------+--------------+--------------+------+



In [24]:
# Handling missing values using each column's mean value

In [4]:
# Reload the file with inferSchema=True so age, Experience and Salary are ints.
# Note: this rebinds `df`, replacing the all-string DataFrame created earlier.
df = spark.read.csv('test2.csv', header = True, inferSchema=True)
df

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [5]:
# Preview the typed data, including the rows that still contain nulls.
df.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|shambhu|  33|        10| 20000|
| mahesh|  34|         8| 35000|
| natraj|  35|         5| 25000|
| kishor|  28|         3| 30000|
| vishal|  29|         5| 15000|
|vaibhav|  27|         2| 18000|
|      a|NULL|      NULL| 40000|
|   NULL|  30|        10| 80000|
|   NULL|  43|      NULL|  NULL|
+-------+----+----------+------+



In [6]:
from pyspark.ml.feature import Imputer

# Replace nulls in the numeric columns with that column's mean. The results go
# to new "<column>_imputed" columns, so the original values stay visible.
# Pass strategy='median' instead to impute with the median.
imputer = Imputer(
    strategy='mean',
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[f"{name}_imputed" for name in ['age', 'Experience', 'Salary']],
)

In [7]:
# Fit the imputer on df and apply it to the same data. The *_imputed columns
# carry the original values with nulls replaced (age 32, Experience 6,
# Salary 32875 in the output below).
imputer.fit(df).transform(df).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|shambhu|  33|        10| 20000|         33|                10|         20000|
| mahesh|  34|         8| 35000|         34|                 8|         35000|
| natraj|  35|         5| 25000|         35|                 5|         25000|
| kishor|  28|         3| 30000|         28|                 3|         30000|
| vishal|  29|         5| 15000|         29|                 5|         15000|
|vaibhav|  27|         2| 18000|         27|                 2|         18000|
|      a|NULL|      NULL| 40000|         32|                 6|         40000|
|   NULL|  30|        10| 80000|         30|                10|         80000|
|   NULL|  43|      NULL|  NULL|         43|                 6|         32875|
+-------+----+----------+------+-----------+------------------+--------------+

In [7]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()