In [0]:
#import findspark
#findspark.init('/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains



#spark = SparkSession.builder.appName("movielens").getOrCreate()

# Consultas sobre Movielens

## Schema de Movielens

![Schema](movielens_schema.png)

## ¿Cuál es la distribución de las calificaciones de las películas?

Proceso:

1. Cargar la tabla ratings
1. Cargar la tabla movies
1. Unir ambas tablas
1. Agrupar por calificación
1. Contar las películas


### Cargar las tablas

In [0]:
# Ratings table: the schema is declared explicitly instead of being inferred.
ratings_schema = StructType(
    fields=[
        StructField("userId", IntegerType(), True),
        StructField("movieId", StringType(), True),
        StructField("rating", DecimalType(precision=2, scale=1), True),
        StructField("timestamp", LongType(), True),
    ]
)

# Read the CSV with the declared schema, then replace the epoch-seconds
# `timestamp` column with a `date` string formatted as yyyyMMdd.
ratingsDf = (
    spark.read
    .option("header", True)
    # The schema declares no DateType column; the option is kept to preserve
    # the original reader configuration.
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table: all three columns are read as plain strings; `genres` and
# `title` are parsed further after loading.
movies_schema = StructType(
    fields=[
        StructField("movieId", StringType(), True),
        StructField("title", StringType(), True),
        StructField("genres", StringType(), True),
    ]
)

# Read the CSV (first row is the header) using the declared schema.
moviesDf = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv")
)

# Regex patterns are raw strings so that `\|`, `\(` and `\)` reach Spark
# unchanged without relying on Python's deprecated handling of invalid
# escape sequences (a SyntaxWarning since Python 3.12).
GENRE_SEPARATOR_PATTERN = r"\|"

# Captures the digits inside the trailing "(YYYY)" of a title; trailing
# whitespace after the closing parenthesis is tolerated.
TITLE_YEAR_PATTERN = r"^.+\(([0-9]+)\)\s*$"

# Captures the title without its trailing " (YYYY)". The year suffix is
# optional so that a title with no year keeps its text; with a mandatory
# suffix regexp_extract returns "" for such titles and the title is lost.
TITLE_NAME_PATTERN = r"^(.+?)(?: \([0-9]+\))?\s*$"

# Derive the final movies columns in one projection. The resulting column
# order (movieId, genres, year, title) matches what the previous
# withColumn/drop/rename sequence produced.
#   genres: array of strings obtained by splitting the raw value on "|"
#   year:   integer parsed from the title; NULL when no year is present
#   title:  title text with the trailing " (YYYY)" removed
# NOTE(review): Column.try_cast is only available on recent Spark runtimes --
# confirm the cluster version supports it.
moviesDf = moviesDf.select(
    "movieId",
    split(col("genres"), GENRE_SEPARATOR_PATTERN).alias("genres"),
    regexp_extract(col("title"), TITLE_YEAR_PATTERN, 1)
    .try_cast(IntegerType())
    .alias("year"),
    regexp_extract(col("title"), TITLE_NAME_PATTERN, 1).alias("title"),
)

In [0]:
# Sanity check: number of rows in the movies DataFrame.
moviesDf.count()

In [0]:
# Sanity check: number of rows in the ratings DataFrame.
ratingsDf.count()

### Unir ambas tablas

In [0]:
# Inner join on movieId: each rating row gains the columns of its movie;
# ratings whose movieId has no match in moviesDf are dropped.
movie_ratingsDF = ratingsDf.join(moviesDf,on="movieId",how="inner")
# Preview the first three joined rows.
movie_ratingsDF.show(3)

### Mostrar la tabla

In [0]:
# Print the first two rows of the joined ratings/movies DataFrame.
movie_ratingsDF.show(2)

Obtener todas las películas con una calificación superior a 4.

In [0]:
# Expose the joined DataFrame to Spark SQL. Note that the view is named
# "movies" but holds one row per rating (ratings joined with movie columns).
movie_ratingsDF.createOrReplaceTempView("movies")
# Keep only rows whose rating is strictly greater than 4.
high_rated_movies = spark.sql("SELECT * FROM movies WHERE rating > 4")
high_rated_movies.show(5)

Calcular la calificación promedio por año.

In [0]:
# Average rating per `year`, newest year first. `year` here is the column of
# the "movies" view (the year parsed from the movie title), not the date on
# which the rating was given.
avg_ratings_by_year = spark.sql("SELECT year, AVG(rating) as avg_rating FROM movies GROUP BY year ORDER BY year DESC")
# Show up to 30 rows of the result.
avg_ratings_by_year.show(30)

Convertir la fecha de string a tipo fecha (date) y extraer el año.

In [0]:
from pyspark.sql.functions import col, year, to_date

# `date` holds the rating date as a yyyyMMdd string. Parse it into a date
# column (`date2`) and derive the calendar year of the rating (`year2`).
movie_ratingsDF = (
    movie_ratingsDF
    .withColumn("date2", to_date(col("date"), "yyyyMMdd"))
    .withColumn("year2", year(col("date2")))
)
movie_ratingsDF.show(20)


Filtrar películas del género 'Comedy' y ordenarlas por calificación descendente.

In [0]:
# Select the rows of the "movies" view whose `genres` array contains
# 'Comedy', ordered from highest to lowest rating.
comedy_movies = spark.sql("""
  SELECT * 
  FROM movies 
  WHERE array_contains(genres, 'Comedy') 
  ORDER BY rating DESC
""")
# Print the first rows of the result (show() uses its default row limit).
comedy_movies.show()
