In [1]:
import pyspark
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse an existing one
# if a session is already active (getOrCreate).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [3]:
# Load the salary data: the first line of the file is used as the header,
# and column types are inferred from the values rather than read as strings.
df = spark.read.csv(
    'Salary.csv',
    header=True,
    inferSchema=True,
)
df.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Harrison|  24|         2| 30000|
|  Edward|  27|         2| 30000|
| Charlie|  25|         4| 25000|
|    Luka|  25|         4| 40000|
| Kirstin|  51|        30| 60000|
|   Harry|  55|        34| 34000|
|   Sunny|  29|         4| 20000|
|  Sophia|null|      null| 76000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



In [4]:
# Drop every row that contains at least one null value.
# Note: na.drop() removes rows, not columns.
df.na.drop().show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Harrison| 24|         2| 30000|
|  Edward| 27|         2| 30000|
| Charlie| 25|         4| 25000|
|    Luka| 25|         4| 40000|
| Kirstin| 51|        30| 60000|
|   Harry| 55|        34| 34000|
|   Sunny| 29|         4| 20000|
+--------+---+----------+------+



In [6]:
# how='any' (the default) drops a row if any of its values is null;
# how='all' would drop a row only when every value is null.
df.na.drop(how='any').show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Harrison| 24|         2| 30000|
|  Edward| 27|         2| 30000|
| Charlie| 25|         4| 25000|
|    Luka| 25|         4| 40000|
| Kirstin| 51|        30| 60000|
|   Harry| 55|        34| 34000|
|   Sunny| 29|         4| 20000|
+--------+---+----------+------+



In [8]:
# thresh=3 keeps only the rows that have at least 3 non-null values.
# When thresh is given it overrides `how`, so `how` is not passed here.
df.na.drop(thresh=3).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Harrison| 24|         2| 30000|
|  Edward| 27|         2| 30000|
| Charlie| 25|         4| 25000|
|    Luka| 25|         4| 40000|
| Kirstin| 51|        30| 60000|
|   Harry| 55|        34| 34000|
|   Sunny| 29|         4| 20000|
|    null| 34|        10| 38000|
+--------+---+----------+------+



In [9]:
# Restrict the null check to the listed column(s): a row is dropped only
# when its 'Experience' value is null; nulls in other columns are ignored.
df.na.drop(subset=['Experience'], how='any').show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Harrison| 24|         2| 30000|
|  Edward| 27|         2| 30000|
| Charlie| 25|         4| 25000|
|    Luka| 25|         4| 40000|
| Kirstin| 51|        30| 60000|
|   Harry| 55|        34| 34000|
|   Sunny| 29|         4| 20000|
|    null| 34|        10| 38000|
+--------+---+----------+------+



In [11]:
# Re-display df: the na.drop() calls above returned new DataFrames, so the
# original still contains every row, including the ones with nulls.
df.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Harrison|  24|         2| 30000|
|  Edward|  27|         2| 30000|
| Charlie|  25|         4| 25000|
|    Luka|  25|         4| 40000|
| Kirstin|  51|        30| 60000|
|   Harry|  55|        34| 34000|
|   Sunny|  29|         4| 20000|
|  Sophia|null|      null| 76000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



In [13]:
# Fill missing values: replace nulls in 'Experience' with 0.
# Columns not listed in `subset` keep their nulls.
df.na.fill(value=0, subset=['Experience']).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Harrison|  24|         2| 30000|
|  Edward|  27|         2| 30000|
| Charlie|  25|         4| 25000|
|    Luka|  25|         4| 40000|
| Kirstin|  51|        30| 60000|
|   Harry|  55|        34| 34000|
|   Sunny|  29|         4| 20000|
|  Sophia|null|         0| 76000|
|    null|  34|        10| 38000|
|    null|  36|         0|  null|
+--------+----+----------+------+



In [14]:
# Re-display df: na.fill() returned a new DataFrame, so the nulls in the
# original 'Experience' column are still present.
df.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Harrison|  24|         2| 30000|
|  Edward|  27|         2| 30000|
| Charlie|  25|         4| 25000|
|    Luka|  25|         4| 40000|
| Kirstin|  51|        30| 60000|
|   Harry|  55|        34| 34000|
|   Sunny|  29|         4| 20000|
|  Sophia|null|      null| 76000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



In [15]:
# Fill null values with the mean of the column.
# The numeric columns are listed once so that the input and output column
# names cannot drift apart; each result is written to '<column>_imputed'.
numeric_cols = ['Age', 'Experience', 'Salary']

imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=['{}_imputed'.format(c) for c in numeric_cols],
    strategy='mean',
)

In [16]:
# Fit the imputer on df, then apply the fitted model to df and display the
# result, which carries the extra *_imputed columns next to the originals.
imputer_model = imputer.fit(df)
imputer_model.transform(df).show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|Harrison|  24|         2| 30000|         24|                 2|         30000|
|  Edward|  27|         2| 30000|         27|                 2|         30000|
| Charlie|  25|         4| 25000|         25|                 4|         25000|
|    Luka|  25|         4| 40000|         25|                 4|         40000|
| Kirstin|  51|        30| 60000|         51|                30|         60000|
|   Harry|  55|        34| 34000|         55|                34|         34000|
|   Sunny|  29|         4| 20000|         29|                 4|         20000|
|  Sophia|null|      null| 76000|         34|                11|         76000|
|    null|  34|        10| 38000|         34|                10|         38000|
|    null|  36|      null|  null|       