# DataFrame Filter (or Where)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [None]:
# Sample rows. Each tuple is laid out as:
#   (firstname, middlename, lastname), [languages], state, gender
arrayStructureData = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
]

In [None]:
# Schema: a nested `name` struct, an array of language strings, and two
# plain string columns. Every field is nullable.
arrayStructureSchema = (
    T.StructType()
    .add(
        'name',
        T.StructType()
        .add('firstname', T.StringType(), True)
        .add('middlename', T.StringType(), True)
        .add('lastname', T.StringType(), True),
    )
    .add('languages', T.ArrayType(T.StringType()), True)
    .add('state', T.StringType(), True)
    .add('gender', T.StringType(), True)
)

In [None]:
# Build the example DataFrame from the sample rows and the explicit schema,
# then print its schema and contents.
df = spark.createDataFrame(arrayStructureData, arrayStructureSchema)
df.printSchema()
df.show()

**REMEMBER:** In every example below, `filter()` can be replaced with `where()`; the two methods are aliases of each other.

Filter by using a column attribute of the DataFrame.

In [None]:
# Keep only the rows whose `state` column equals "OH".
df.filter(df.state == "OH").show(truncate=False)

In [None]:
# Keep only the rows whose `state` column is different from "OH".
df.filter(df.state != "OH").show(truncate=False)

Filter by using `col()` to reference a column.

In [None]:
# Same equality filter, with the column referenced by name through `F.col`.
df.filter(F.col("state") == "OH").show(truncate=False)

Filter by using condition string as argument in `filter()`.

In [None]:
# The condition is given as a SQL expression string: rows where gender is 'M'.
df.filter("gender == 'M'").show(truncate=False)

In [None]:
# `<>` is the SQL "not equal" operator: rows where gender is not 'M'.
df.filter("gender <> 'M'").show(truncate=False)

## Membership

In [None]:
# Candidate values for the membership test.
li = ["OH", "CA", "DE"]

# Keep the rows whose `state` is one of the values in `li`.
df.filter(df.state.isin(li)).show()

In [None]:
# `~` negates the condition: keep the rows whose `state` is NOT in `li`.
df.filter(~df.state.isin(li)).show()

Accessing nested struct property to make a condition.

In [None]:
# `lastname` is a field of the `name` struct column; compare it to "Williams".
df.filter(df.name.lastname == "Williams").show(truncate=False)

## Logical conjunction

Use `&` (and) and `|` (or) to combine two conditional expressions into one. Wrap each condition in parentheses.

In [None]:
# Both conditions must hold. Each comparison is wrapped in parentheses
# before being combined with `&`.
df.filter((df.state == "OH") & (df.gender == "M")).show(truncate=False)

## Logical functions

Both the functions in the `pyspark.sql.functions` package and the methods of `Column` are useful for building conditional expressions.

For the full list of functions in the `pyspark.sql.functions` package, see the [API reference](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html).

In [None]:
# Keep the rows whose `languages` array contains "Java".
# `array_contains` is accessed through the `F` alias of pyspark.sql.functions.
df.filter(F.array_contains(df.languages, "Java")).show()

In [None]:
# Keep the rows whose `state` value starts with "N".
df.filter(df.state.startswith("N")).show()

In [None]:
# Keep the rows whose `state` value ends with "H".
df.filter(df.state.endswith("H")).show()

In [None]:
# SQL LIKE pattern: "N%" matches values that begin with "N".
df.filter(df.state.like("N%")).show()

Examples of filter with multiple steps

In [None]:
# Step 1: derive a lower-cased, space-separated `fullname` from the three
# fields of the `name` struct.
# Step 2: keep only the rows whose `fullname` contains "rose".
df.withColumn(
    "fullname",
    F.lower(
        F.concat_ws(" ", "name.firstname", "name.middlename", "name.lastname")
    ),
).filter(
    F.col("fullname").like("%rose%")
).show()

## Filter Null

In [None]:
# Small data set with missing values (None) in the `state` and `gender` columns.
data = [
    ("James", None, "M"),
    ("Anna", "NY", "F"),
    ("Julia", None, None),
]

columns = ["name", "state", "gender"]
# Rebinds `df` to the new DataFrame used by the null-handling examples below.
df = spark.createDataFrame(data, columns)
df.show()

Using string expression as a condition by using "is NULL" like `SQL`.

In [None]:
# SQL expression string: rows where `state` is null.
df.filter("state is NULL").show()

It is the same result as using `isNull()` from `Column` instance.

In [None]:
# Column-method form of the same null check.
df.filter(df.state.isNull()).show()

In [None]:
# Same null check, with the column referenced through `F.col`.
df.filter(F.col("state").isNull()).show()

Moreover, you can use `and` and `or` in a string expression, just like in `SQL`.

In [None]:
# Rows where both `state` and `gender` are null, written as a SQL string.
df.filter("state is NULL AND gender is NULL").show()

It is the same as this:

In [None]:
# Column-expression form: both null checks combined with `&`.
df.filter(df.state.isNull() & df.gender.isNull()).show()

Negative null checks are easy to write with `not` in a string condition.

In [None]:
# Rows where `state` has a value, using `is not NULL`.
df.filter("state is not NULL").show()

In [None]:
# Same check, written by negating the whole `state is NULL` predicate.
df.filter("NOT state is NULL").show()

In [None]:
# Column-method form of the not-null check.
df.filter(df.state.isNotNull()).show()

In [None]:
# Same not-null check, with the column referenced through `F.col`.
df.filter(F.col("state").isNotNull()).show()

## Drop Null

In [None]:
# Load the zipcode CSV, treating the first line as a header and letting
# Spark infer the column types, then print the schema and rows.
reader = spark.read.option("header", "true").option("inferSchema", "true")
df = reader.csv("s3a://datalake/examples/small_zipcode.csv")

df.printSchema()
df.show(truncate=False)

In [None]:
# With no arguments, drop every row that has a null in any column.
df.na.drop().show(truncate=False)

In [None]:
# Explicit `how="any"`: a row is dropped if at least one of its columns is null.
df.na.drop(how="any").show(truncate=False)

In [None]:
# Only the `population` and `type` columns are checked for nulls.
df.na.drop(subset=["population", "type"]).show()

In [None]:
# `DataFrame.dropna()` is the DataFrame-level counterpart of `df.na.drop()`.
df.dropna().show(truncate=False)

## Fill NA

In [None]:
# Reload the zipcode CSV (header row, inferred column types) so the fill
# examples start from the unmodified data, then print the schema and rows.
reader = spark.read.option("header", "true").option("inferSchema", "true")
df = reader.csv("s3a://datalake/examples/small_zipcode.csv")

df.printSchema()
df.show(truncate=False)

Fill null values with `0` in all columns of type `IntegerType()`.

In [None]:
# Replace nulls with 0. A numeric fill value leaves string columns untouched.
df.fillna(value=0).show()

In [None]:
# Replace nulls with 0 in the `population` column only.
df.fillna(value=0, subset=["population"]).show()

Alternative ways by using `na` property of a dataframe.

In [None]:
# Same zero fill, reached through the `na` accessor.
df.na.fill(value=0).show()

In [None]:
# Zero fill restricted to `population`, through the `na` accessor.
df.na.fill(value=0, subset=["population"]).show()

Fill null values with `""` in all columns of type `StringType()`.

In [None]:
# Display the current rows for comparison with the filled results.
df.show()

In [None]:
# Replace nulls with an empty string. A string fill value leaves
# non-string columns untouched.
df.fillna(value="").show()

In [None]:
# Same empty-string fill, through the `na` accessor.
df.na.fill(value="").show()

Fill with different values for columns.

In [None]:
# Chain two fills: nulls in `city` become "unknown", nulls in `type` become "".
(
df.fillna("unknown", ["city"])
  .fillna("",["type"])
).show()

Or using a dictionary instead.

In [None]:
# The dictionary maps each column name to its own replacement value.
df.fillna({"city": "unknown", "type": ""}).show()

In [None]:
# Chain two `na.fill` calls: nulls in `city` become "unknown" and nulls in
# `type` become "".
(
df.na.fill("unknown", ["city"])
  .na.fill("", ["type"])
).show()

In [None]:
# Per-column replacement values as a dictionary, through the `na` accessor.
df.na.fill({"city": "unknown", "type": ""}).show()