In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
# Get a local-mode Spark session named "Data Cleansing"; getOrCreate() hands
# back the already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data Cleansing")
    .getOrCreate()
)

25/09/16 10:35:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Sample student records, one tuple per student, in the column order
# (name, subject, mark, status, attendance). Missing values are written as
# None; the final record is empty in every field.
data_student = [
    ("Michael", "Science",     80,   "P",  90),
    ("Nancy",   "Mathematics", 90,   "P",  None),
    ("David",   "English",     20,   "F",  80),
    ("John",    "Science",     None, "F",  None),
    ("Blessy",  None,          30,   "F",  50),
    ("Martin",  "Mathematics", None, None, 70),
    (None,) * 5,
]


In [5]:
# Sanity check: print the Python type of the raw records container.
print(type(data_student))

<class 'list'>


In [7]:
# Explicit schema for the student records: two integer columns (Mark,
# Attendance) and three string columns, every one of them nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("name", StringType),
        ("Subject", StringType),
        ("Mark", IntegerType),
        ("Status", StringType),
        ("Attendance", IntegerType),
    )
])

In [8]:
# Turn the in-memory records into a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(data_student, schema)

In [9]:
# Print the DataFrame as a text table to inspect where the nulls are.
df.show()

                                                                                

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
|   John|    Science|NULL|     F|      NULL|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|NULL|  NULL|        70|
|   NULL|       NULL|NULL|  NULL|      NULL|
+-------+-----------+----+------+----------+



In [10]:
from pyspark.sql.functions import col

In [13]:
# Show only the rows whose Mark is null, referencing the column with
# attribute-style access (df.Mark).
df.filter(df.Mark.isNull()).show()

+------+-----------+----+------+----------+
|  name|    Subject|Mark|Status|Attendance|
+------+-----------+----+------+----------+
|  John|    Science|NULL|     F|      NULL|
|Martin|Mathematics|NULL|  NULL|        70|
|  NULL|       NULL|NULL|  NULL|      NULL|
+------+-----------+----+------+----------+



In [14]:
# Show only the rows whose Mark is null, this time referencing the column by
# name through col().
df.filter(col("Mark").isNull()).show()

+------+-----------+----+------+----------+
|  name|    Subject|Mark|Status|Attendance|
+------+-----------+----+------+----------+
|  John|    Science|NULL|     F|      NULL|
|Martin|Mathematics|NULL|  NULL|        70|
|  NULL|       NULL|NULL|  NULL|      NULL|
+------+-----------+----+------+----------+



In [21]:
from pyspark.sql.functions import col, isnull
from functools import reduce

In [22]:
# Keep only the rows in which every column is null: build one "is null"
# predicate per column and AND them all together into a single condition.
df_all_null = df.filter(
    reduce(
        lambda combined, predicate: combined & predicate,
        (col(column_name).isNull() for column_name in df.columns),
    )
)
print("Rows where ALL columns are null:")
df_all_null.show()

Rows where ALL columns are null:
+----+-------+----+------+----------+
|name|Subject|Mark|Status|Attendance|
+----+-------+----+------+----------+
|NULL|   NULL|NULL|  NULL|      NULL|
+----+-------+----+------+----------+



In [23]:
# Default null drop: remove every row that has a null in at least one column.
# DataFrame.dropna() is the DataFrame-level alias of df.na.drop().
df_na_drop = df.dropna()
df_na_drop.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
+-------+-------+----+------+----------+



In [24]:
# how="any": drop a row as soon as any one of its columns is null.
# (dropna is an alias of na.drop.)
df_drop_any = df.dropna(how="any")
df_drop_any.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
+-------+-------+----+------+----------+



                                                                                

In [25]:
# how="all": drop a row only when every one of its columns is null, so
# partially filled rows are kept. (dropna is an alias of na.drop.)
df_drop_all = df.dropna(how="all")
df_drop_all.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
|   John|    Science|NULL|     F|      NULL|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|NULL|  NULL|        70|
+-------+-----------+----+------+----------+



In [27]:
# Restrict the null check to the Mark column: rows with a null Mark are
# dropped, nulls in any other column are left alone.
# (dropna is an alias of na.drop.)
demo1 = df.dropna(subset=["Mark"])
demo1.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
| Blessy|       NULL|  30|     F|        50|
+-------+-----------+----+------+----------+



In [29]:
# Restrict the null check to Mark and Attendance: a row is dropped when either
# of those two columns is null; nulls elsewhere (e.g. Subject) are kept.
# (dropna is an alias of na.drop.)
demo2 = df.dropna(subset=["Mark", "Attendance"])
demo2.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
| Blessy|   NULL|  30|     F|        50|
+-------+-------+----+------+----------+



In [30]:
# Replace nulls in the two integer columns with the sentinel -1; the string
# columns are not listed in the subset, so their nulls stay as they are.
# (fillna is an alias of na.fill.)
df_na_fill = df.fillna(-1, subset=["Mark", "Attendance"])
df_na_fill.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|        -1|
|  David|    English|  20|     F|        80|
|   John|    Science|  -1|     F|        -1|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|  -1|  NULL|        70|
|   NULL|       NULL|  -1|  NULL|        -1|
+-------+-----------+----+------+----------+

