In [0]:
import time
from pyspark import *
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, struct, collect_list, row_number

# Optimización con Persistencia: cache()



In [0]:
# Load the MovieLens ratings table from DBFS
# (file read below: dbfs:/FileStore/tables/ratings_full.csv).

# Explicit schema for the ratings CSV; every column is declared nullable.
ratings_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", DecimalType(2, 1), True),
    StructField("timestamp", LongType(), True),
])

ratingsDf = (
    spark.read
    .option("header", True)
    # NOTE(review): the schema above has no DateType column, so this option
    # appears to have no effect on parsing -- confirm before removing it.
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("dbfs:/FileStore/tables/ratings_full.csv")
    # Derive a "date" string column (yyyyMMdd) from the numeric "timestamp"
    # column, then discard the raw timestamp.
    .withColumn("date", from_unixtime(col("timestamp"), "yyyyMMdd"))
    .drop("timestamp")
)

In [0]:
%time
# Primera consulta: calcular promedio de calificación por película
start = time.time()
ratingsDf.groupBy("movieId").avg("rating").count()
print("Sin cache:", time.time() - start, "segundos")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
Sin cache: 3.757044792175293 segundos


In [0]:
%time
# Aplicar cache
ratingsDf.cache()
print("Tamaño: ", ratingsDf.count())  # "cachear"



CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
Out[4]: 33832162

In [0]:
%time
# Probar con el cache
start = time.time()
ratingsDf.groupBy("userId").avg("rating").count()
print("Con cache:", time.time() - start, "segundos")



CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs
Con cache: 32.33386182785034 segundos


In [0]:
%time
# Limpiar caché
ratingsDf.unpersist()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs
Out[7]: DataFrame[userId: int, movieId: int, rating: decimal(2,1), date: string]