## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, col, explode

## Build Spark Session object

In [2]:
spark = (SparkSession.builder
         .appName("filter-data")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/05 14:37:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [3]:
df = (spark.read.format("csv")
      .option("header", "true")
      .option("nullValue", "null")
      .option("dateFormat", "LLLL d, y")
      .load("../../data/netflix_titles.csv"))


In [4]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [9]:
df.show(5,truncate=False)

+-------+-------+---------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------+------------+------+---------+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|show_id|type   |title                |director       |cast                                                                                                                                                                                                                                                                                                           |cou

## Get the unique values in the column "year"

In [10]:
df.select("release_year").distinct().show()

+-----------------+
|     release_year|
+-----------------+
|     Ted Ferguson|
|             1987|
|             1956|
|             2016|
|             2020|
|             2012|
|             1958|
|             1943|
|             1972|
| Marquell Manning|
|             1988|
|             2019|
|             2017|
|             1977|
|             2014|
|             1971|
|             1984|
|             2013|
|             1982|
|             2005|
+-----------------+
only showing top 20 rows



## Print unique values as a list

In [12]:
unique_values = [row["release_year"] for row in df.select("release_year").distinct().collect()]
print(unique_values)

[' Ted Ferguson', '1987', '1956', '2016', '2020', '2012', '1958', '1943', '1972', ' Marquell Manning', '1988', '2019', '2017', '1977', '2014', '1971', '1984', '2013', '1982', '2005', '2000', '1965', '1962', '1954', ' Charles Rocket', 'December 15, 2020', '1981', ' Peter Ferriero', '1978', '1974', '2002', '1959', ' Paul Sambo', '2018', '2009', 'United States', '1995', '1964', ' Dr. Dre', '1946', '2006', ' Nick Kroll', '1976', ' Imanol Arias', '1942', '1947', '1967', '1968', '2004', 'January 15, 2021', '2011', '1989', '1992', '1961', '1966', '2008', 'January 13, 2021', '1955', '1999', '1963', '1994', '1997', '1973', '1925', '2007', ' Kristen Johnston', '1996', 'June 12, 2021', '1983', ' Álvaro Cervantes', ' Nse Ikpe-Etim', '1969', ' Jade Eshete', '1980', '1944', '1960', '2021', ' Francis Weddey', '1986', '1985', 'August 13, 2020', '1979', '2015', '1998', '1993', '2001', '2010', '1990', '2003', '1991', '1945', '1975', '1970']


## Removing the non-years from the year column

In [13]:
from pyspark.sql.functions import col, trim

In [14]:
df_years_only = df.filter(trim(col("release_year")).rlike("^[0-9]{4}$"))



In [16]:
df_years_only.select("release_year").show()

+------------+
|release_year|
+------------+
|        2020|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        1993|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2020|
|        2020|
|        2021|
|        2021|
+------------+
only showing top 20 rows



## Filter the dataframe

In [19]:
filtered_df = df_years_only.filter(col("release_year") > 2020)

filtered_df.select("release_year").show()

+------------+
|release_year|
+------------+
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
|        2021|
+------------+
only showing top 20 rows



## Chaining multiple conditions

df_filtered = df.filter(
    (trim(col("your_column")).rlike("^[0-9]{4}$")) & 
    (trim(col("your_column")).cast("int") > 2020)
)

## Filtering with multiple conditions

In [23]:
unique_values = [row["country"] for row in df.select("country").distinct().collect()]
print(unique_values)

['Peru, United States, United Kingdom', 'India, United Kingdom, France, Qatar', 'Japan, Canada, United States', 'United Kingdom, China', 'India, Germany', 'South Africa, United States, Germany', 'Russia', 'United Kingdom, Germany, United States, France', 'Chile, United States, France', 'South Africa, Angola', 'United States, Poland', 'Philippines, Qatar', 'Hong Kong, China, Singapore', 'Germany, Sri Lanka', 'Denmark, France, United States, Sweden', 'United States, France, South Korea, Indonesia', 'United Kingdom, Nigeria', 'Australia, United Arab Emirates', 'Brazil, France, Germany', 'United States, Ireland', 'United States, Greece', 'United States, France, Italy, United Kingdom', 'France, Iran, United States', 'India, Soviet Union', 'United States, Germany, Australia', 'Brazil, United States', 'United Kingdom, India, United States', 'France, United States', 'Turkey, United States', 'Senegal', 'United Kingdom, Canada, United States, Germany', 'Ireland, United Kingdom, Italy, United Sta

In [24]:
df.count()

8806

In [28]:
len(df.columns)

12

In [29]:
filtered_df = (df.filter((col("country") == 'United States') & (col('release_year') >2020)))

filtered_df.count()

137

In [30]:
filtered_df.show()

+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|    s10|  Movie|        The Starling|      Theodore Melfi|Melissa McCarthy,...|United States|September 24, 2021|        2021| PG-13|  104 min|    Comedies, Dramas|A woman adjusting...|
|    s16|TV Show|   Dear White People|                null|Logan Browning, B...|United States|September 22, 2021|        2021| TV-MA|4 Seasons|TV Comedies, TV D...|"Students of colo...|
|    s41|TV Show|He-Man and the Ma...|                null|Yuri Lowent

## Filter based on a list of values

In [31]:
filtered_df = (df.filter(col("country").isin(["United States", "United Kingdom", "India"])))

filtered_df.count()

4196

In [32]:
filtered_df.show()

+-------+-------+--------------------+----------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|        director|                cast|       country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+----------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead| Kirsten Johnson|                null| United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s5|TV Show|        Kota Factory|            null|Mayur More, Jiten...|         India|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|In a city of coac...|
|     s9|TV Show|The Great British...| Andy Devonshire|Mel Giedroyc, Sue...|United Ki

## Filtering on strings

In [34]:
filtered_df = df.filter(col("listed_in").like("Crime%"))

filtered_df.select("listed_in").show()

+--------------------+
|           listed_in|
+--------------------+
|Crime TV Shows, I...|
|Crime TV Shows, D...|
|Crime TV Shows, I...|
|Crime TV Shows, S...|
|Crime TV Shows, D...|
|Crime TV Shows, T...|
|Crime TV Shows, I...|
|Crime TV Shows, T...|
|Crime TV Shows, I...|
|Crime TV Shows, T...|
|Crime TV Shows, D...|
|Crime TV Shows, I...|
|Crime TV Shows, I...|
|Crime TV Shows, I...|
|Crime TV Shows, I...|
|Crime TV Shows, D...|
|Crime TV Shows, T...|
|Crime TV Shows, I...|
|Crime TV Shows, I...|
|Crime TV Shows, I...|
+--------------------+
only showing top 20 rows



## Filtering based on regular expression

In [39]:
filtered_df = df.filter(col("listed_in").rlike("(Crime | Thrillers)"))

filtered_df.select("listed_in").show(truncate=False)

+-----------------------------------------------------------------+
|listed_in                                                        |
+-----------------------------------------------------------------+
|Crime TV Shows, International TV Shows, TV Action & Adventure    |
|Crime TV Shows, Docuseries, International TV Shows               |
|Crime TV Shows, International TV Shows, TV Action & Adventure    |
|British TV Shows, Crime TV Shows, Docuseries                     |
|Crime TV Shows, Spanish-Language TV Shows, TV Dramas             |
|Crime TV Shows, Docuseries, International TV Shows               |
|International TV Shows, TV Dramas, TV Thrillers                  |
|Dramas, International Movies, Thrillers                          |
|Dramas, Horror Movies, Thrillers                                 |
|Action & Adventure, Horror Movies, Thrillers                     |
|Action & Adventure, Horror Movies, Thrillers                     |
|Dramas, Thrillers                              

## Filtering on Date Ranges

In [47]:
from pyspark.sql.functions import to_date, date_format, col

In [48]:
# convert string to date format

df_temp = df.withColumn("date_added", to_date(col("date_added"), "LLLL d, yyyy"))

# conbver the date format to the desired format

df_new = df_temp.withColumn("date_added", date_format(col("date_added"), "yyyy-mm-dd"))





In [49]:
filtered_df = df_new.filter((col("date_added") >= "2011-02-01") & (col("date_added") <= "2021-03-01"))

filtered_df.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                null|       United States|2021-00-25|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                null|Ama Qamata, Khosi...|        South Africa|2021-00-24|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|     Julien Leclercq|Sami Bouajila, T

In [50]:
df.count()

8806

In [52]:
filtered_df.count()

8671

## Get the date range from a column

In [54]:
from pyspark.sql.functions import min, max

# Calculate the minimum and maximum dates in the date_added column
date_range = filtered_df.agg(
    min("date_added").alias("min_date"),
    max("date_added").alias("max_date")
)

date_range.show(truncate=False)

+----------+----------+
|min_date  |max_date  |
+----------+----------+
|2012-00-01|2021-00-31|
+----------+----------+



In [55]:
spark.stop()