In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
# Obtain (or reuse) a local-mode Spark session for the data-cleansing examples.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data Cleansing")
    .getOrCreate()
)

25/09/16 10:35:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Sample student records laid out as (name, Subject, Mark, Status, Attendance).
# None marks a missing value; the final record is missing in every field.
data_student = [
    ("Michael", "Science", 80, "P", 90),
    ("Nancy", "Mathematics", 90, "P", None),
    ("David", "English", 20, "F", 80),
    ("John", "Science", None, "F", None),
    ("Blessy", None, 30, "F", 50),
    ("Martin", "Mathematics", None, None, 70),
    (None,) * 5,
]


In [5]:
# Sanity check: print the Python type of the container holding the raw records.
print(type(data_student))

<class 'list'>


In [7]:
# Explicit schema for the student records: three string columns and two
# integer columns, every one of them nullable (third argument True).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("Subject", StringType()),
        ("Mark", IntegerType()),
        ("Status", StringType()),
        ("Attendance", IntegerType()),
    )
])

In [8]:
# Build the student DataFrame from the raw tuples using the explicit schema.
df = spark.createDataFrame(data_student, schema)

In [9]:
# Print the DataFrame so the positions of the NULL values are visible.
df.show()

                                                                                

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
|   John|    Science|NULL|     F|      NULL|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|NULL|  NULL|        70|
|   NULL|       NULL|NULL|  NULL|      NULL|
+-------+-----------+----+------+----------+



In [10]:
from pyspark.sql.functions import col

In [13]:
# Rows with a missing Mark, referencing the column through the DataFrame itself.
df.where(df["Mark"].isNull()).show()

+------+-----------+----+------+----------+
|  name|    Subject|Mark|Status|Attendance|
+------+-----------+----+------+----------+
|  John|    Science|NULL|     F|      NULL|
|Martin|Mathematics|NULL|  NULL|        70|
|  NULL|       NULL|NULL|  NULL|      NULL|
+------+-----------+----+------+----------+



In [14]:
# Same null check on Mark, this time naming the column with col().
df.where(col("Mark").isNull()).show()

+------+-----------+----+------+----------+
|  name|    Subject|Mark|Status|Attendance|
+------+-----------+----+------+----------+
|  John|    Science|NULL|     F|      NULL|
|Martin|Mathematics|NULL|  NULL|        70|
|  NULL|       NULL|NULL|  NULL|      NULL|
+------+-----------+----+------+----------+



In [21]:
from pyspark.sql.functions import col, isnull
from functools import reduce

In [22]:
# Keep only the rows in which every single column is null: build one
# isnull() test per column and AND them together into a single predicate.
df_all_null = df.filter(
    reduce(
        lambda combined, null_test: combined & null_test,
        [isnull(col(column_name)) for column_name in df.columns],
    )
)
print("Rows where ALL columns are null:")
df_all_null.show()

Rows where ALL columns are null:
+----+-------+----+------+----------+
|name|Subject|Mark|Status|Attendance|
+----+-------+----+------+----------+
|NULL|   NULL|NULL|  NULL|      NULL|
+----+-------+----+------+----------+



In [23]:
# Drop every row containing at least one null (dropna() is the alias of na.drop()).
df_na_drop = df.dropna()
df_na_drop.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
+-------+-------+----+------+----------+



In [24]:
# how="any" spelled out explicitly: a row is removed if any of its columns is null.
df_drop_any = df.dropna(how="any")
df_drop_any.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
+-------+-------+----+------+----------+



                                                                                

In [25]:
# how="all": a row is removed only when every one of its columns is null.
df_drop_all = df.dropna(how="all")
df_drop_all.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
|   John|    Science|NULL|     F|      NULL|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|NULL|  NULL|        70|
+-------+-----------+----+------+----------+



In [27]:
# Restrict the null check to the Mark column only.
demo1 = df.dropna(subset=["Mark"])
demo1.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
| Blessy|       NULL|  30|     F|        50|
+-------+-----------+----+------+----------+



In [29]:
# Remove rows that have a null in either Mark or Attendance.
demo2 = df.dropna(subset=["Mark", "Attendance"])
demo2.show()

+-------+-------+----+------+----------+
|   name|Subject|Mark|Status|Attendance|
+-------+-------+----+------+----------+
|Michael|Science|  80|     P|        90|
|  David|English|  20|     F|        80|
| Blessy|   NULL|  30|     F|        50|
+-------+-------+----+------+----------+



In [30]:
# Per-column replacement values: nulls in Mark and Attendance become -1,
# the remaining columns are left as they are.
df_na_fill = df.fillna({"Mark": -1, "Attendance": -1})
df_na_fill.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|        -1|
|  David|    English|  20|     F|        80|
|   John|    Science|  -1|     F|        -1|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|  -1|  NULL|        70|
|   NULL|       NULL|  -1|  NULL|        -1|
+-------+-----------+----+------+----------+



In [31]:
# An integer fill value is only applied to the numeric columns
# (the string columns keep their nulls, as the output shows).
df_na_fill = df.fillna(0)
df_na_fill.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|         0|
|  David|    English|  20|     F|        80|
|   John|    Science|   0|     F|         0|
| Blessy|       NULL|  30|     F|        50|
| Martin|Mathematics|   0|  NULL|        70|
|   NULL|       NULL|   0|  NULL|         0|
+-------+-----------+----+------+----------+



In [32]:
# A string fill value is only applied to the string columns
# (the integer columns keep their nulls, as the output shows).
df_na_fill = df.fillna("NA")
df_na_fill.show()

+-------+-----------+----+------+----------+
|   name|    Subject|Mark|Status|Attendance|
+-------+-----------+----+------+----------+
|Michael|    Science|  80|     P|        90|
|  Nancy|Mathematics|  90|     P|      NULL|
|  David|    English|  20|     F|        80|
|   John|    Science|NULL|     F|      NULL|
| Blessy|         NA|  30|     F|        50|
| Martin|Mathematics|NULL|    NA|        70|
|     NA|         NA|NULL|    NA|      NULL|
+-------+-----------+----+------+----------+



                                                                                

In [33]:
# Sample contact records with deliberately inconsistent letter casing.
# Field order: name, city, country, age, email.
data = [
    ("John Doe", "New York", "USA", 25, "john@email.com"),
    ("Jane Smith", "LONDON", "UK", 30, "JANE@EMAIL.COM"),
    ("BOB JOHNSON", "paris", "France", 35, "Bob@Email.Com")
]

In [34]:
# Rebinds `df` to the contact records; column types are inferred from the
# tuples, only the column names are supplied.
df = spark.createDataFrame(
    data,
    schema=["name", "city", "country", "age", "email"],
)
df.show()

+-----------+--------+-------+---+--------------+
|       name|    city|country|age|         email|
+-----------+--------+-------+---+--------------+
|   John Doe|New York|    USA| 25|john@email.com|
| Jane Smith|  LONDON|     UK| 30|JANE@EMAIL.COM|
|BOB JOHNSON|   paris| France| 35| Bob@Email.Com|
+-----------+--------+-------+---+--------------+



In [35]:
from pyspark.sql.functions import upper, lower, col

In [36]:
# Upper-case every string-typed column and pass all other columns through
# unchanged, keeping the original column names and order.
df_upper = df.select(*(
    col(field) if kind != 'string' else upper(col(field)).alias(field)
    for field, kind in df.dtypes
))
print("All string columns to UPPERCASE:")
df_upper.show()

All string columns to UPPERCASE:
+-----------+--------+-------+---+--------------+
|       name|    city|country|age|         email|
+-----------+--------+-------+---+--------------+
|   JOHN DOE|NEW YORK|    USA| 25|JOHN@EMAIL.COM|
| JANE SMITH|  LONDON|     UK| 30|JANE@EMAIL.COM|
|BOB JOHNSON|   PARIS| FRANCE| 35| BOB@EMAIL.COM|
+-----------+--------+-------+---+--------------+



In [37]:
# Choose the casing column by column: name and country upper-cased,
# city and email lower-cased, age passed through untouched.
df_mixed = df.select(
    upper(df["name"]).alias("name"),
    lower(df["city"]).alias("city"),
    upper(df["country"]).alias("country"),
    df["age"],
    lower(df["email"]).alias("email"),
)
print("Mixed case conversion:")
df_mixed.show()

Mixed case conversion:
+-----------+--------+-------+---+--------------+
|       name|    city|country|age|         email|
+-----------+--------+-------+---+--------------+
|   JOHN DOE|new york|    USA| 25|john@email.com|
| JANE SMITH|  london|     UK| 30|jane@email.com|
|BOB JOHNSON|   paris| FRANCE| 35| bob@email.com|
+-----------+--------+-------+---+--------------+



In [38]:
# Second set of sample contacts, again with mixed letter casing.
# Field order: name, city, country, email.
data = [
    ("JOHN DOE", "new york", "USA", "JOHN@EMAIL.COM"),
    ("Jane Smith", "LONDON", "uk", "jane@email.com"),
    ("BOB JOHNSON", "paris", "france", "Bob@Email.Com"),
    ("ALICE BROWN", "BERLIN", "GERMANY", "ALICE@EMAIL.COM"),
    ("charlie wilson", "london", "uk", "charlie@email.com")
]

In [39]:
# Rebinds `df` to the second contact sample; only column names are supplied.
df = spark.createDataFrame(
    data,
    schema=["name", "city", "country", "email"],
)
df.show()

+--------------+--------+-------+-----------------+
|          name|    city|country|            email|
+--------------+--------+-------+-----------------+
|      JOHN DOE|new york|    USA|   JOHN@EMAIL.COM|
|    Jane Smith|  LONDON|     uk|   jane@email.com|
|   BOB JOHNSON|   paris| france|    Bob@Email.Com|
|   ALICE BROWN|  BERLIN|GERMANY|  ALICE@EMAIL.COM|
|charlie wilson|  london|     uk|charlie@email.com|
+--------------+--------+-------+-----------------+



In [40]:
# A city counts as lowercase when lower-casing it leaves the value unchanged.
df_lower_city = df.where(lower(col("city")) == col("city"))
print("Rows where city is lowercase:")
df_lower_city.show()

Rows where city is lowercase:
+--------------+--------+-------+-----------------+
|          name|    city|country|            email|
+--------------+--------+-------+-----------------+
|      JOHN DOE|new york|    USA|   JOHN@EMAIL.COM|
|   BOB JOHNSON|   paris| france|    Bob@Email.Com|
|charlie wilson|  london|     uk|charlie@email.com|
+--------------+--------+-------+-----------------+



In [41]:
# A city counts as uppercase when upper-casing it leaves the value unchanged.
# Stored under its own name so the lowercase result in `df_lower_city` is not
# overwritten by rows that are actually uppercase.
df_upper_city = df.filter(col("city") == upper(col("city")))
print("Rows where city is uppercase:")
df_upper_city.show()

Rows where city is lowercase:
+-----------+------+-------+---------------+
|       name|  city|country|          email|
+-----------+------+-------+---------------+
| Jane Smith|LONDON|     uk| jane@email.com|
|ALICE BROWN|BERLIN|GERMANY|ALICE@EMAIL.COM|
+-----------+------+-------+---------------+



In [43]:
# Keeps rows whose city reads the same lower-cased and upper-cased, i.e. a
# value with no cased letters at all. Every city in the sample data contains
# letters, so this presumably selects nothing.
# NOTE(review): confirm the intent; finding mixed-case cities would instead
# need the value to differ from both its lower() and upper() forms.
df_city = df.filter(lower(col("city")) == upper(col("city")))