In [0]:
import time
from pyspark import *
from pyspark.errors import AnalysisException
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, struct, collect_list, row_number

# Optimización con Persistencia: cache()



In [0]:
# Load the MovieLens ratings table from the CSV file at the /Volumes path below.
# The schema is declared explicitly instead of being inferred from the file.
ratings_schema = StructType(
    fields=[
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        # precision=2, scale=1 -> one digit on each side of the decimal point.
        StructField("rating", DecimalType(precision=2, scale=1), True),
        StructField("timestamp", LongType(), True),
    ]
)

ratingsDf = (
    spark.read
    .option("header", True)
    # NOTE(review): the schema has no DateType column, so this option presumably
    # has no effect on parsing; it is kept exactly as in the original read.
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")
    # Render the numeric timestamp as a 'yyyyMMdd' string column named "date",
    # then discard the raw timestamp column.
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop("timestamp")
)

In [0]:
%time
# Primera consulta: calcular promedio de calificación por película
start = time.time()
ratingsDf.groupBy("movieId").avg("rating").count()
print("Sin cache:", time.time() - start, "segundos")

CPU times: user 3 μs, sys: 1e+03 ns, total: 4 μs
Wall time: 5.48 μs
Sin cache: 4.341420412063599 segundos


In [0]:
%time
# Aplicar cache
ratingsDf.cache()
print("Tamaño: ", ratingsDf.count())  # "cachear"



CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 6.68 μs


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-6212165207225505>, line 3[0m
[1;32m      1[0m get_ipython()[38;5;241m.[39mrun_line_magic([38;5;124m'[39m[38;5;124mtime[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m'[39m)
[1;32m      2[0m [38;5;66;03m# Aplicar cache[39;00m
[0;32m----> 3[0m ratingsDf[38;5;241m.[39mcache()
[1;32m      4[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mTamaño: [39m[38;5;124m"[39m, ratingsDf[38;5;241m.[39mcount())

File [0;32m/databricks/python/lib/python3.11/site-packages/pyspark/sql/connect/dataframe.py:2163[0m, in [0;36mDataFrame.cache[0;34m(self)[0m
[1;32m   2162[0m [38;5;28;01mdef[39;00m [38;5;21mcache[39m([38;5;28mself[39m) [38;5;241m-[39m[38;5;241m>[39m [38;5;124m"[39m[38;5;124mDataFrame[39m[38;5;124m"[39m:
[0;32m-> 2163[0m     [38;5;28;01mreturn[39

In [0]:
%time
# Probar con el cache
start = time.time()
ratingsDf.groupBy("userId").avg("rating").count()
print("Con cache:", time.time() - start, "segundos")





In [0]:
%time
# Limpiar caché
ratingsDf.unpersist()

