### PySpark Dataframe - Filters

During data pre-processing, you can use filters to retrieve only the rows that satisfy certain conditions.

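- Filter operation
- Combining conditions with `&` (AND) and `|` (OR); comparing values with `==`
- Negating a condition with `~` (NOT)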

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the existing Spark session, or create one for this notebook, with the
# application name "DataframePartTwo".
spark = (
    SparkSession.builder
    .appName("DataframePartTwo")
    .getOrCreate()
)

In [6]:
# Load the country-wise CSV: the first line supplies the column names and the
# column types are inferred from the data.
csv_df = spark.read.csv(
    './notebooks/datasets/country_wise_latest.csv',
    header=True,
    inferSchema=True,
)
csv_df.show()

+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|Eastern Mediterra...|
|            Albania|     4880|   144|     2745|  1991|      117|         6|           63|              Europe|
|            Algeria|    27973|  1163|    18837|  7973|      616|         8|          749|              Africa|
|            Andorra|      907|    52|      803|    52|       10|         0|            0|              Europe|
|             Angola|      950|    41|      242|   667|       18|         1|            0|              Africa|
|Antigua and Barbuda|       86|     3|       65|    18|        4|         0|            5|            Am

### Filter Operation

In [7]:
# Keep only the rows whose Active count is at least 5000, using a SQL-style
# condition string.
(
    csv_df
    .filter("Active>=5000")
    .show()
)

+--------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|      Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+--------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|         Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|Eastern Mediterra...|
|             Algeria|    27973|  1163|    18837|  7973|      616|         8|          749|              Africa|
|           Argentina|   167416|  3059|    72575| 91782|     4890|       120|         2057|            Americas|
|             Armenia|    37390|   711|    26665| 10014|       73|         6|          187|              Europe|
|           Australia|    15303|   167|     9311|  5825|      368|         6|          137|     Western Pacific|
|          Azerbaijan|    30446|   423|    23242|  6781|      396|         6|          558|     

In [10]:
# Apply the same filter, then keep only the country name and Active columns.
(
    csv_df
    .filter("Active>=5000")
    .select(['Country/Region', 'Active'])
    .show()
)

+--------------------+------+
|      Country/Region|Active|
+--------------------+------+
|         Afghanistan|  9796|
|             Algeria|  7973|
|           Argentina| 91782|
|             Armenia| 10014|
|           Australia|  5825|
|          Azerbaijan|  6781|
|          Bangladesh| 97577|
|             Belarus|  6221|
|             Belgium| 39154|
|             Bolivia| 47056|
|Bosnia and Herzeg...|  5274|
|              Brazil|508116|
|              Canada|107514|
|               Chile| 18782|
|            Colombia|117163|
|          Costa Rica| 11902|
|       Cote d'Ivoire|  5198|
|  Dominican Republic| 32869|
|             Ecuador| 40733|
|               Egypt| 52992|
+--------------------+------+
only showing top 20 rows



In [18]:
# Combine two column conditions with & (logical AND): only rows with
# Active strictly between 5000 and 10000 are kept.
is_above_5000 = csv_df['Active'] > 5000
is_below_10000 = csv_df['Active'] < 10000
csv_df.filter(is_above_5000 & is_below_10000).show()

+--------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|      Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+--------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|         Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|Eastern Mediterra...|
|             Algeria|    27973|  1163|    18837|  7973|      616|         8|          749|              Africa|
|           Australia|    15303|   167|     9311|  5825|      368|         6|          137|     Western Pacific|
|          Azerbaijan|    30446|   423|    23242|  6781|      396|         6|          558|              Europe|
|             Belarus|    67251|   538|    60492|  6221|      119|         4|           67|              Europe|
|Bosnia and Herzeg...|    10498|   294|     4930|  5274|      731|        14|          375|     

In [19]:
# Combine two column conditions with | (logical OR): a row is kept when at
# least one of the conditions holds.
# NOTE(review): every number is either > 5000 or < 10000, so this particular
# pair of conditions keeps every row (see the output below). Consider
# non-overlapping ranges, e.g. (Active < 5000) | (Active > 10000), to make the
# effect of OR visible.
is_above_5000 = csv_df['Active'] > 5000
is_below_10000 = csv_df['Active'] < 10000
csv_df.filter(is_above_5000 | is_below_10000).show()

+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|Eastern Mediterra...|
|            Albania|     4880|   144|     2745|  1991|      117|         6|           63|              Europe|
|            Algeria|    27973|  1163|    18837|  7973|      616|         8|          749|              Africa|
|            Andorra|      907|    52|      803|    52|       10|         0|            0|              Europe|
|             Angola|      950|    41|      242|   667|       18|         1|            0|              Africa|
|Antigua and Barbuda|       86|     3|       65|    18|        4|         0|            5|            Am

In [20]:
# NOT / inverse operation (~)
# Fetch the records that do NOT have more than 5000 active cases.
exceeds_5000 = csv_df['Active'] > 5000
csv_df.filter(~exceeds_5000).show()

+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|            Albania|     4880|   144|     2745|  1991|      117|         6|           63|              Europe|
|            Andorra|      907|    52|      803|    52|       10|         0|            0|              Europe|
|             Angola|      950|    41|      242|   667|       18|         1|            0|              Africa|
|Antigua and Barbuda|       86|     3|       65|    18|        4|         0|            5|            Americas|
|            Austria|    20558|   713|    18246|  1599|       86|         1|           37|              Europe|
|            Bahamas|      382|    11|       91|   280|       40|         0|            0|            Am