In [5]:
# installing java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# downloading apache spark 3.5.3
!wget -q https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz

# to extract the donwloaded file
!tar xf spark-3.5.3-bin-hadoop3.tgz

# Installing findspark so we can locate Spark
!pip install -q findspark


In [6]:
import os

# setting java environment variable
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# setting spark environment variable
os.environ["SPARK_HOME"] = "/content/spark-3.5.3-bin-hadoop3"


In [7]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# creating a soark session
spark = SparkSession.builder \
    .appName("Colab Spark Test") \
    .getOrCreate()

# testing the Spark session by displaying numbers 1-10
spark.range(10).show()


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [8]:
# path of the netflix movies and shows dataset
file_path = '/content/netflix_titles.csv'

In [9]:
# loading the dataset using spark
netflix_df = spark.read.csv(file_path, header=True, inferSchema=True)

# showing the schema
netflix_df.printSchema()

# displaying first 5 rows
netflix_df.show(5)


root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+-------------------

In [10]:
print(f"Number of rows: {netflix_df.count()}, Number of columns: {len(netflix_df.columns)}")


Number of rows: 8809, Number of columns: 12


In [11]:
# some statistics for the columns
netflix_df.describe().show()

+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+-----------------+-----------------+-------------+--------------------+--------------------+
|summary|             show_id|         type|                            title|            director|                cast|         country|     date_added|     release_year|           rating|     duration|           listed_in|         description|
+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+-----------------+-----------------+-------------+--------------------+--------------------+
|  count|                8809|         8808|                             8807|                6173|                7983|            7977|           8796|             8807|             8803|         8804|                8806|                8806|
|   mean|       

In [12]:
from pyspark.sql.functions import col, sum

# counting the number of missing values in each column
missing_values = netflix_df.select(
    [(sum(col(c).isNull().cast("int")).alias(c)) for c in netflix_df.columns]
)
missing_values.show()


+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|      0|   1|    2|    2636| 826|    832|        13|           2|     6|       5|        3|          3|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



In [13]:
# counting by type (movie or tv show)
netflix_df.groupBy("type").count().show()

+-------------+-----+
|         type|count|
+-------------+-----+
|         NULL|    1|
|      TV Show| 2676|
|        Movie| 6131|
|William Wyler|    1|
+-------------+-----+



In [17]:
from pyspark.sql.functions import year, to_date

# extracting year from date_added column. then counting the number of shows added each year
netflix_df = netflix_df.withColumn("year_added", year(to_date(col("date_added"), "MMMM d, yyyy")))
content_by_year = netflix_df.groupBy("year_added").count().orderBy("year_added")
content_by_year.show()

+----------+-----+
|year_added|count|
+----------+-----+
|      NULL|  120|
|      2008|    2|
|      2009|    2|
|      2010|    1|
|      2011|   13|
|      2012|    3|
|      2013|   10|
|      2014|   23|
|      2015|   72|
|      2016|  418|
|      2017| 1162|
|      2018| 1623|
|      2019| 1997|
|      2020| 1872|
|      2021| 1491|
+----------+-----+



In [21]:
# counting number of movies/shows in top 10 genres
from pyspark.sql.functions import explode, split

genres_df = netflix_df.withColumn("genre", explode(split(col("listed_in"), ", ")))
popular_genres = genres_df.groupBy("genre").count().orderBy(col("count").desc())
popular_genres.show(10)

+--------------------+-----+
|               genre|count|
+--------------------+-----+
|International Movies| 2748|
|              Dramas| 2419|
|            Comedies| 1670|
|International TV ...| 1350|
|       Documentaries|  866|
|  Action & Adventure|  857|
|           TV Dramas|  762|
|  Independent Movies|  751|
|Children & Family...|  641|
|     Romantic Movies|  616|
+--------------------+-----+
only showing top 10 rows



In [22]:
# counting the number of movies/shows by rating
ratings_distribution = netflix_df.groupBy("rating").count().orderBy(col("count").desc())
ratings_distribution.show()

+-----------------+-----+
|           rating|count|
+-----------------+-----+
|            TV-MA| 3195|
|            TV-14| 2158|
|            TV-PG|  862|
|                R|  796|
|            PG-13|  489|
|            TV-Y7|  334|
|             TV-Y|  307|
|               PG|  286|
|             TV-G|  220|
|               NR|   80|
|                G|   41|
|             NULL|    6|
|         TV-Y7-FV|    6|
|               UR|    3|
|            NC-17|    3|
|             2021|    2|
| November 1, 2020|    1|
| Shavidee Trotter|    1|
|    Adriane Lenox|    1|
|    Maury Chaykin|    1|
+-----------------+-----+
only showing top 20 rows

