In [0]:
#import findspark
#findspark.init('/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType



#spark = SparkSession.builder.appName("movielens").getOrCreate()

# Consultas sobre Movielens

## Schema de Movielens

![Schema](movielens_schema.png)

## ¿Cuál es la distribución de la calificación de las películas?

Proceso:

1. Cargar la tabla ratings
1. Cargar la tabla movies
1. Unir ambas tablas
1. Agrupar por calificación
1. Contar las películas


### Cargar las tablas

In [0]:
# Ratings table: explicit schema, then load the CSV and turn the raw
# epoch column into a "yyyyMMdd" date string.
ratings_schema = StructType(
    fields=[
        StructField("userId", IntegerType(), True),
        StructField("movieId", StringType(), True),
        # One decimal digit is enough for the values seen in the sample
        # output (4.0, 4.5, 5.0).
        StructField("rating", DecimalType(precision=2, scale=1), True),
        # Raw value fed to from_unixtime below; dropped afterwards.
        StructField("timestamp", LongType(), True),
    ]
)

ratingsDf = (
    spark.read
    .option("header", True)
    # NOTE(review): the schema declares no DateType column, so this option
    # presumably has no effect on parsing -- confirm before removing it.
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")
    # "date" is a formatted string, not a DateType column.
    # NOTE(review): from_unixtime presumably formats in the Spark session
    # time zone -- confirm if exact day boundaries matter.
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table: every column is read as a plain string; the title and
# genres fields are parsed into structured columns in the next step.
movies_schema = StructType(
    fields=[
        # Kept as a string so it matches the type of ratings.movieId.
        StructField("movieId", StringType(), True),
        StructField("title", StringType(), True),
        StructField("genres", StringType(), True),
    ]
)

moviesDf = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv")
)

# Derive structured columns from the raw movie fields, keeping the column
# order movieId, genres, year, title:
#   genres : pipe-separated string  -> array of genre names
#   year   : digits inside the trailing "(...)" of the title, cast to int;
#            null when the title has no such suffix (try_cast of "")
#   title  : the title with the trailing " (digits)" suffix removed
#
# Regex patterns are raw strings: "\|" and "\(" in a normal string literal
# are invalid escape sequences and trigger a SyntaxWarning on recent Python.
#
# The title is cleaned with regexp_replace rather than regexp_extract:
# regexp_extract yields "" when the pattern does not match, which blanked
# the title of every movie lacking a "(year)" suffix. Stripping the suffix
# leaves such titles unchanged and gives the same result for the rest.
moviesDf = moviesDf.select(
    "movieId",
    split(col("genres"), r"\|").alias("genres"),
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1)
    .try_cast(IntegerType())
    .alias("year"),
    regexp_replace(col("title"), r" \([0-9]+\)$", "").alias("title"),
)

In [0]:
# Sanity check: number of rows loaded into the movies DataFrame.
moviesDf.count()

86537

In [0]:
# Sanity check: number of rows loaded into the ratings DataFrame.
ratingsDf.count()

33832162

### Unir ambas tablas

In [0]:
# Attach movie metadata to every rating. The inner join keeps only ratings
# whose movieId also appears in the movies table; the result has one row
# per rating, not one row per movie.
movie_ratingsDF = ratingsDf.join(
    moviesDf,
    on="movieId",
    how="inner",
)

# Preview the first rows of the joined table.
movie_ratingsDF.show(3)

+-------+------+------+--------+--------------------+----+----------+
|movieId|userId|rating|    date|              genres|year|     title|
+-------+------+------+--------+--------------------+----+----------+
|      1|     1|   4.0|20081103|[Adventure, Anima...|1995| Toy Story|
|    110|     1|   4.0|20081105|[Action, Drama, War]|1995|Braveheart|
|    158|     1|   4.0|20081103|[Adventure, Child...|1995|    Casper|
+-------+------+------+--------+--------------------+----+----------+
only showing top 3 rows


### Mostrar la tabla

In [0]:
# Preview the first two rows of the joined ratings/movies table.
movie_ratingsDF.show(2)

+-------+------+------+--------+--------------------+----+----------+
|movieId|userId|rating|    date|              genres|year|     title|
+-------+------+------+--------+--------------------+----+----------+
|      1|     1|   4.0|20081103|[Adventure, Anima...|1995| Toy Story|
|    110|     1|   4.0|20081105|[Action, Drama, War]|1995|Braveheart|
+-------+------+------+--------+--------------------+----+----------+
only showing top 2 rows


Obtener todas las calificaciones superiores a 4 (cada fila es la calificación de un usuario a una película, con los datos de esa película).

In [0]:
# Register the joined DataFrame as the temp view "movies" so it can be
# queried with Spark SQL. Despite the name, the view holds one row per
# rating, so a movie appears once for every rating it received.
movie_ratingsDF.createOrReplaceTempView("movies")
# Keep rows rated strictly above 4 (the sample output shows 4.5 and 5.0).
high_rated_movies = spark.sql("SELECT * FROM movies WHERE rating > 4")
high_rated_movies.show(5)

+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|    260|     1|   4.5|20081103|[Action, Adventur...|1977|Star Wars: Episod...|
|    356|     1|   5.0|20081103|[Comedy, Drama, R...|1994|        Forrest Gump|
|   1036|     1|   5.0|20081103|[Action, Crime, T...|1988|            Die Hard|
|   1210|     1|   4.5|20081103|[Action, Adventur...|1983|Star Wars: Episod...|
|   1291|     1|   5.0|20081103| [Action, Adventure]|1989|Indiana Jones and...|
+-------+------+------+--------+--------------------+----+--------------------+
only showing top 5 rows


Calcular la calificación promedio por año de estreno de la película (el año extraído del título).

In [0]:
# Average rating grouped by `year`, newest first. `year` is the value
# parsed from the movie title when the movies table was prepared, not the
# year in which the rating was submitted.
avg_ratings_by_year = spark.sql("SELECT year, AVG(rating) as avg_rating FROM movies GROUP BY year ORDER BY year DESC")
avg_ratings_by_year.show(30)

+----+----------+
|year|avg_rating|
+----+----------+
|2023|   3.28833|
|2022|   3.37017|
|2021|   3.33446|
|2020|   3.34064|
|2019|   3.57170|
|2018|   3.49244|
|2017|   3.55593|
|2016|   3.54984|
|2015|   3.57558|
|2014|   3.64666|
|2013|   3.50208|
|2012|   3.55204|
|2011|   3.49184|
|2010|   3.57566|
|2009|   3.52738|
|2008|   3.52775|
|2007|   3.51211|
|2006|   3.56065|
|2005|   3.43242|
|2004|   3.52082|
|2003|   3.48517|
|2002|   3.48609|
|2001|   3.51340|
|2000|   3.43224|
|1999|   3.53721|
|1998|   3.46645|
|1997|   3.42968|
|1996|   3.35992|
|1995|   3.47224|
|1994|   3.54942|
+----+----------+
only showing top 30 rows


Convertir la fecha de la calificación de string a tipo date y extraer el año en que se calificó.

In [0]:
# NOTE(review): this import would normally live in the notebook's first
# import cell; it is kept here so the cell keeps working on its own.
from pyspark.sql.functions import col, year, to_date

# Add two columns derived from the "yyyyMMdd" rating-date string:
#   date2 : the string parsed into a date value
#   year2 : the calendar year of date2 (when the rating was given)
# The "movies" temp view registered earlier is not refreshed by this
# reassignment, so SQL queries against it do not see date2/year2.
movie_ratingsDF = (
    movie_ratingsDF
    .withColumn("date2", to_date(col("date"), "yyyyMMdd"))
    .withColumn("year2", year(col("date2")))
)
movie_ratingsDF.show(20)


+-------+------+------+--------+--------------------+----+--------------------+----------+-----+
|movieId|userId|rating|    date|              genres|year|               title|     date2|year2|
+-------+------+------+--------+--------------------+----+--------------------+----------+-----+
|      1|     1|   4.0|20081103|[Adventure, Anima...|1995|           Toy Story|2008-11-03| 2008|
|    110|     1|   4.0|20081105|[Action, Drama, War]|1995|          Braveheart|2008-11-05| 2008|
|    158|     1|   4.0|20081103|[Adventure, Child...|1995|              Casper|2008-11-03| 2008|
|    260|     1|   4.5|20081103|[Action, Adventur...|1977|Star Wars: Episod...|2008-11-03| 2008|
|    356|     1|   5.0|20081103|[Comedy, Drama, R...|1994|        Forrest Gump|2008-11-03| 2008|
|    381|     1|   3.5|20081103|    [Drama, Romance]|1994|When a Man Loves ...|2008-11-03| 2008|
|    596|     1|   4.0|20081103|[Animation, Child...|1940|           Pinocchio|2008-11-03| 2008|
|   1036|     1|   5.0|2008110

Filtrar películas del género 'Comedy' y ordenarlas por calificación descendente.

In [0]:
# Select every rating row whose movie lists 'Comedy' among its genres
# (genres is an array column, hence array_contains), highest rating first.
# The "movies" view has one row per rating, so the same movie can appear
# many times in the result.
# NOTE(review): rows with equal rating presumably come back in no
# guaranteed order -- add a secondary sort key if a stable listing is needed.
comedy_movies = spark.sql("""
  SELECT * 
  FROM movies 
  WHERE array_contains(genres, 'Comedy') 
  ORDER BY rating DESC
""")
comedy_movies.show()


+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
| 115617|     3|   5.0|20180905|[Action, Animatio...|2014|          Big Hero 6|
|   3114|     3|   5.0|20180905|[Adventure, Anima...|1999|         Toy Story 2|
|      1|     2|   5.0|19960626|[Adventure, Anima...|1995|           Toy Story|
|    356|     2|   5.0|19960626|[Comedy, Drama, R...|1994|        Forrest Gump|
|   4886|     1|   5.0|20081103|[Adventure, Anima...|2001|      Monsters, Inc.|
|    296|     3|   5.0|20180905|[Comedy, Crime, D...|1994|        Pulp Fiction|
|    357|     2|   5.0|19960626|   [Comedy, Romance]|1994|Four Weddings and...|
|     21|     2|   5.0|19960626|[Comedy, Crime, T...|1995|          Get Shorty|
|  49647|     1|   5.0|20081103|[Children, Comedy...|2006|     Charlotte's Web|
|    141|     2|   5.0|19960626|        