In [1]:
# Importación de librerías necesarias
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import corr, col, count, when

In [2]:
# Creación de la sesión de Spark
spark = SparkSession.builder \
    .appName("Análsis de Títulos en Netflix") \
    .master("local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.files.maxPartitionBytes", "128MB") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .getOrCreate()

In [3]:
# Carga del archivo CSV a Spark
file_path = "netflix_titles.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [4]:
# Mostrar las primeras filas
df.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         NULL|Septem

In [5]:
# Mostrar el esquema del DataFrame
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [6]:
# Mostrar los valores que exiten en type
df.select("type").distinct().show()

+-------------+
|         type|
+-------------+
|      TV Show|
|        Movie|
|William Wyler|
|         NULL|
+-------------+



In [7]:
# Contar la cantidad de registros para cada valor
df.groupBy("type").count().show()

+-------------+-----+
|         type|count|
+-------------+-----+
|         NULL|    1|
|      TV Show| 2676|
|        Movie| 6131|
|William Wyler|    1|
+-------------+-----+



In [8]:
# Filtrar por tipo de contenido "Movie"
movies_df = df.filter(df["type"] == "Movie")
movies_df.show(10)

+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|             country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|     s1|Movie|Dick Johnson Is Dead|     Kirsten Johnson|                NULL|       United States|September 25, 2021|        2020| PG-13|  90 min|       Documentaries|As her father nea...|
|     s7|Movie|My Little Pony: A...|Robert Cullen, Jo...|Vanessa Hudgens, ...|                NULL|September 24, 2021|        2021|    PG|  91 min|Children & Family...|Equestria's divid...|
|     s8|Movie|             Sankofa|        Haile 

In [9]:
# Contar la cantidad de producciones por director
df.groupBy("director").count().orderBy("count", ascending=False).show(10)

+--------------------+-----+
|            director|count|
+--------------------+-----+
|                NULL| 2636|
|       Rajiv Chilaka|   19|
|Raúl Campos, Jan ...|   18|
|        Marcus Raboy|   16|
|         Suhas Kadav|   16|
|           Jay Karas|   14|
| Cathy Garcia-Molina|   13|
|     Youssef Chahine|   12|
|     Martin Scorsese|   12|
|         Jay Chapman|   12|
+--------------------+-----+
only showing top 10 rows



In [10]:
# Filtrar por país
mx_content = df.filter(df["country"] == "Mexico")
mx_content.show(10)

+-------+-------+--------------------+--------------------+--------------------+-------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+-------+------------------+------------+------+---------+--------------------+--------------------+
|    s18|TV Show|     Falsa identidad|                NULL|Luis Ernesto Fran...| Mexico|September 22, 2021|        2020| TV-MA|2 Seasons|Crime TV Shows, S...|Strangers Diego a...|
|   s283|  Movie|La diosa del asfalto|    Julián Hernández|Ximena Romo, Mabe...| Mexico|   August 11, 2021|        2020| TV-MA|  127 min|Dramas, Independe...|A woman from a to...|
|   s312|TV Show|           Control Z|                NULL|Ana Valeria Becer...| Mexico|    August 4

In [11]:
# Obtener lista de directores
df.select("director").distinct().show()

+--------------------+
|            director|
+--------------------+
|           Eric Meza|
|         Tommy Chong|
|          John Wells|
|         Erol Özlevi|
|  Alejandro Hartmann|
|   Jennifer Westcott|
|       Neeraj Pandey|
|   Wash Westmoreland|
|       Patricia Font|
|          Bobby Roth|
|      Aleksey German|
| Rhiannon Bannenberg|
|      Elizabeth Wood|
|          Manish Jha|
|        April Mullen|
|Harry Elfont, Deb...|
|   Abdul Aziz Hashad|
|          Will Gluck|
|Jesús Magaña Vázquez|
|      Gail Willumsen|
+--------------------+
only showing top 20 rows



In [12]:
# Contar valores nulos en la columna director
df.select(count(when(col("director").isNull(), 1)).alias("null_directors")).show()

+--------------+
|null_directors|
+--------------+
|          2636|
+--------------+



In [13]:
df.select("duration").distinct().show()

+-----------------+
|         duration|
+-----------------+
|          100 min|
|          153 min|
|           71 min|
|           56 min|
| Donnell Rawlings|
|           13 min|
|          119 min|
|           33 min|
|          165 min|
|       10 Seasons|
|           12 min|
|          204 min|
|          142 min|
|          173 min|
|           27 min|
|          157 min|
|           30 min|
|           39 min|
|        8 Seasons|
|           82 min|
+-----------------+
only showing top 20 rows



In [14]:
# Ordenar las series de mayor a menor temporadas
series_df = df.filter(df["type"] == "TV Show")
series_df = series_df.withColumn("num_seasons", regexp_extract(col("duration"), r'(\d+)', 1).cast("int"))
series_df.orderBy(col("num_seasons").desc()).select("title", "num_seasons").show()

+--------------------+-----------+
|               title|num_seasons|
+--------------------+-----------+
|      Grey's Anatomy|         17|
|        Supernatural|         15|
|                NCIS|         15|
|       The Challenge|         14|
|           Heartland|         13|
|COMEDIANS of the ...|         13|
|        Red vs. Blue|         13|
|   Trailer Park Boys|         12|
|      Criminal Minds|         12|
|              Cheers|         11|
|             Frasier|         11|
|       Stargate SG-1|         10|
|    The Walking Dead|         10|
|LEGO Ninjago: Mas...|         10|
|    Shameless (U.S.)|         10|
|Danger Mouse: Cla...|         10|
|          Dad's Army|         10|
|             Friends|         10|
|    Call the Midwife|          9|
|American Horror S...|          9|
+--------------------+-----------+
only showing top 20 rows



In [15]:
# Mostrar la duración de películas
movies_df.orderBy(col("duration")).select("title", "duration").distinct().show()

+--------------------+--------+
|               title|duration|
+--------------------+--------+
|     Vampire Academy| 104 min|
|      Bigfoot Family|  88 min|
|           Homefront| 100 min|
|            Wish You| 102 min|
|Just Another Chri...| 101 min|
|StarBeam: Hallowe...|  33 min|
|High & Low The Re...| 111 min|
|            Bheeshma| 138 min|
|         Moms at War|  92 min|
|        Jai Mummy Di| 101 min|
| True: Winter Wishes|  46 min|
|Master Z: The Ip ...| 108 min|
|Mo Gilligan: Mome...|  64 min|
|              Patria|  90 min|
|Edmilson Filho: N...|  80 min|
|  The Debt Collector|  96 min|
|     The Bittersweet|  98 min|
|                1922| 103 min|
|Tini: The New Lif...|  99 min|
|100 Years: One Wo...|  75 min|
+--------------------+--------+
only showing top 20 rows



In [16]:
# Filtrar directores quitando los valores nulos
df.filter(col("director").isNotNull()) \
  .groupBy("director") \
  .count() \
  .orderBy(col("count").desc()) \
  .select("director", "count") \
  .show(10, truncate=False)

+----------------------+-----+
|director              |count|
+----------------------+-----+
|Rajiv Chilaka         |19   |
|Raúl Campos, Jan Suter|18   |
|Marcus Raboy          |16   |
|Suhas Kadav           |16   |
|Jay Karas             |14   |
|Cathy Garcia-Molina   |13   |
|Youssef Chahine       |12   |
|Martin Scorsese       |12   |
|Jay Chapman           |12   |
|Steven Spielberg      |11   |
+----------------------+-----+
only showing top 10 rows

