In [11]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell works with.
spark = (
    SparkSession.builder
    .appName('Missing_Value')
    .getOrCreate()
)

In [12]:
# Echo the session object as the cell's last expression so the notebook displays it.
spark

In [13]:
# Load the CSV: the first line supplies the column names (header=True) and
# Spark infers each column's type from the data (inferSchema=True).
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Test1.csv')
)
# Bare expression: show the resulting schema summary.
df_pyspark

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [14]:
# Print the loaded rows as a text table to see where the null values are.
df_pyspark.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Illia|  20|        10|  1000|
| Artem|  21|         8|  8000|
|  Vlad|  30|        20| 20000|
| Lesia|  45|        10| 15000|
|Nastya|  20|         2| 18000|
| Vania|null|      null| 30000|
|  null|  34|         5|  3000|
|  Roma|null|         4|  null|
+------+----+----------+------+



In [15]:
### Drop a column: drop() returns a new DataFrame without 'Name'
without_name = df_pyspark.drop('Name')
without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  20|        10|  1000|
|  21|         8|  8000|
|  30|        20| 20000|
|  45|        10| 15000|
|  20|         2| 18000|
|null|      null| 30000|
|  34|         5|  3000|
|null|         4|  null|
+----+----------+------+



In [16]:
# Show the original DataFrame again: it still has the Name column, because
# drop() returned a new DataFrame instead of modifying df_pyspark.
df_pyspark.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Illia|  20|        10|  1000|
| Artem|  21|         8|  8000|
|  Vlad|  30|        20| 20000|
| Lesia|  45|        10| 15000|
|Nastya|  20|         2| 18000|
| Vania|null|      null| 30000|
|  null|  34|         5|  3000|
|  Roma|null|         4|  null|
+------+----+----------+------+



In [17]:
### Drop every row that contains at least one null value
# how='any' is the default; it is spelled out here for clarity.
df_pyspark.na.drop(how='any').show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Illia| 20|        10|  1000|
| Artem| 21|         8|  8000|
|  Vlad| 30|        20| 20000|
| Lesia| 45|        10| 15000|
|Nastya| 20|         2| 18000|
+------+---+----------+------+



In [18]:
##### The `how` argument accepts 'any' or 'all'
# 'any' -> drop a row if at least one of its values is null
# 'all' -> drop a row only if every one of its values is null
# No row in this data is entirely null, so how='all' keeps all of them.
rows_not_all_null = df_pyspark.na.drop(how='all')
rows_not_all_null.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Illia|  20|        10|  1000|
| Artem|  21|         8|  8000|
|  Vlad|  30|        20| 20000|
| Lesia|  45|        10| 15000|
|Nastya|  20|         2| 18000|
| Vania|null|      null| 30000|
|  null|  34|         5|  3000|
|  Roma|null|         4|  null|
+------+----+----------+------+



In [21]:
### thresh == keep only the rows that have at least `thresh` non-null values
# `how` is deliberately not passed: when `thresh` is given it overrides `how`,
# so writing how='any' next to thresh=2 has no effect and only misleads.
# Every row in this data has at least 2 non-null values, so nothing is dropped.
df_pyspark.na.drop(thresh=2).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Illia|  20|        10|  1000|
| Artem|  21|         8|  8000|
|  Vlad|  30|        20| 20000|
| Lesia|  45|        10| 15000|
|Nastya|  20|         2| 18000|
| Vania|null|      null| 30000|
|  null|  34|         5|  3000|
|  Roma|null|         4|  null|
+------+----+----------+------+



In [23]:
### subset == only look for nulls in the listed columns
# Rows with a null in 'Experience' are dropped; nulls in other columns are kept.
# `how` defaults to 'any', so it is not repeated here.
df_pyspark.na.drop(subset=['Experience']).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Illia|  20|        10|  1000|
| Artem|  21|         8|  8000|
|  Vlad|  30|        20| 20000|
| Lesia|  45|        10| 15000|
|Nastya|  20|         2| 18000|
|  null|  34|         5|  3000|
|  Roma|null|         4|  null|
+------+----+----------+------+



In [26]:
##### Fill missing values with a string
# A string replacement is only applied to string columns, so only the null
# Name is filled here; the nulls in the numeric columns are left as they are.
df_pyspark.na.fill(value='Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Illia|  20|        10|  1000|
|         Artem|  21|         8|  8000|
|          Vlad|  30|        20| 20000|
|         Lesia|  45|        10| 15000|
|        Nastya|  20|         2| 18000|
|         Vania|null|      null| 30000|
|Missing Values|  34|         5|  3000|
|          Roma|null|         4|  null|
+--------------+----+----------+------+



In [27]:
##### Fill missing values in specific columns only
# Nulls in the listed numeric columns become 0; Name is not in the subset,
# so its null is left in place.
numeric_cols = ['Age', 'Experience', 'Salary']
df_pyspark.na.fill(0, subset=numeric_cols).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Illia| 20|        10|  1000|
| Artem| 21|         8|  8000|
|  Vlad| 30|        20| 20000|
| Lesia| 45|        10| 15000|
|Nastya| 20|         2| 18000|
| Vania|  0|         0| 30000|
|  null| 34|         5|  3000|
|  Roma|  0|         4|     0|
+------+---+----------+------+



In [28]:
#### Imputer == estimator that fills in missing values using a statistic
# (mean, median or mode) of the column in which the missing value sits.
from pyspark.ml.feature import Imputer

# Each input column gets a matching '<column>_imputed' output column.
impute_cols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    strategy='mean',
    inputCols=impute_cols,
    outputCols=[col_name + '_imputed' for col_name in impute_cols],
)

In [30]:
# Fit the imputer on the data, then append the *_imputed columns to it.
# The original Age/Experience/Salary columns are kept alongside the new ones.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| Illia|  20|        10|  1000|         20|                10|          1000|
| Artem|  21|         8|  8000|         21|                 8|          8000|
|  Vlad|  30|        20| 20000|         30|                20|         20000|
| Lesia|  45|        10| 15000|         45|                10|         15000|
|Nastya|  20|         2| 18000|         20|                 2|         18000|
| Vania|null|      null| 30000|         28|                 8|         30000|
|  null|  34|         5|  3000|         34|                 5|          3000|
|  Roma|null|         4|  null|         28|                 4|         13571|
+------+----+----------+------+-----------+------------------+--------------+

