# Buenas prácticas de Ventanas en Spark

In [0]:

from pyspark.sql import Window
from pyspark.sql.functions import col, row_number
from time import time
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, row_number

spark.conf.set('spark.sql.shuffle.partitions', '200') # Fijar el numero de particiones


In [0]:
%fs ls /Volumes/big_data_ii_2025/spark_examples/spark_data/

path,name,size,modificationTime
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/PADRON_COMPLETO.csv,PADRON_COMPLETO.csv,437542254,1750034299000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/distelec.csv,distelec.csv,175692,1750034273000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv,movies.csv,4192335,1750034274000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/por-ciclo-2016-2018.csv,por-ciclo-2016-2018.csv,904931,1750034273000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv,ratings_full.csv,933898879,1750034299000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/tags.csv,tags.csv,85361813,1750034296000


In [0]:
# Tabla Ratings
ratings_schema  = StructType(fields=[
    StructField("userId",IntegerType(),True), 
    StructField("movieId",IntegerType(),True),
    StructField("rating",DecimalType(precision=2,scale=1),True),
    StructField("timestamp",LongType(),True)
])
ratingsDf = spark.read\
    .option("header", True)\
    .option("dateFormat", "yyyyMMdd")\
    .schema(ratings_schema)\
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")\
    .withColumn(\
            "date",\
            from_unixtime("timestamp", "yyyyMMdd"))

# Tabla Movies
movies_schema  = StructType(fields=[
    StructField("movieId",IntegerType(),True), 
    StructField("title",StringType(),True),
    StructField("genres",StringType(),True)
])

moviesDf = spark.read\
    .option("header", True)\
    .schema(movies_schema)\
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv")

moviesDf = moviesDf.withColumn("genresSplit", split(moviesDf["genres"],"\|"))\
                        .drop('genres').withColumnRenamed("genresSplit","genres")\
                            .withColumn(\
                                "year",\
                                regexp_extract(\
                                           moviesDf["title"],\
                                           "^.+\(([0-9]+)\)$",\
                                           1)\
                                .cast(IntegerType()))\
                            .withColumn(\
                            "title_temp",\
                            regexp_extract(\
                                           moviesDf["title"],\
                                           "^(.+?) \([0-9]+\)$",\
                                           1))\
                            .drop('title')\
                        .withColumnRenamed("title_temp","title")
#ratingsDf.cache()
print('Registros totales:', ratingsDf.count())

Registros totales: 33832162


In [0]:
w_bad = Window.partitionBy('userId').orderBy(col('rating').asc())  # cardinalidad baja
t0 = time()
top3_bad = (ratingsDf.withColumn('rn', row_number().over(w_bad))
                    .where('rn <= 3'))
print('Contraejemplo → filas:', top3_bad.count(), ' | tiempo (s):', round(time() - t0, 2))


Contraejemplo → filas: 970126  | tiempo (s): 6.48


In [0]:
w_good = Window.partitionBy('userId').orderBy(col('timestamp').desc())
t0 = time()
top3_good = (ratingsDf.withColumn('rn', row_number().over(w_good))
                     .where('rn <= 3'))
print('Buena práctica → filas:', top3_good.count(), ' | tiempo (s):', round(time() - t0, 2))


Buena práctica → filas: 970126  | tiempo (s): 6.68


# Marco Pequeño

In [0]:
w_big = (Window.partitionBy('userId')
                 .orderBy(col('timestamp').desc())
                 .rowsBetween(Window.unboundedPreceding, Window.currentRow))
t0 = time()
ratingsDf.withColumn('mov_avg', avg('rating').over(w_big)) \
       .write.format('noop').mode('overwrite').save()
print('Ventana UNBOUNDED →', round(time() - t0, 2), 's')


Ventana UNBOUNDED → 10.77 s


In [0]:
w_small = (Window.partitionBy('userId')
                   .orderBy(col('timestamp').desc())
                   .rowsBetween(-30, 0))
t0 = time()
ratingsDf.withColumn('mov_avg', avg('rating').over(w_small)) \
       .write.format('noop').mode('overwrite').save()
print('Ventana 30 filas →', round(time() - t0, 2), 's')


Ventana 30 filas → 10.62 s


# Particiones **wide** vs **tall**

In [0]:


def run_exp(key_col, label):
    """
    key_col : str
        Nombre de la columna que será la clave de partición (wide vs tall).
    label   : str
        Texto descriptivo para la impresión de resultados.
    """
    df = ratingsDf.withColumn("partition_key", col(key_col))  # conserva rating intacto
    t0 = time()
    (df
     .repartition("partition_key")           # reorganiza datos según la clave
     .groupBy("partition_key")
     .agg(avg("rating"))                   # rating sigue existiendo
     .count())
    print(label, "| tiempo (s):", round(time() - t0, 2))

# Experimentos
run_exp("userId",  "Tall (userId)")   # muchas particiones pequeñas
run_exp("rating",  "Wide (rating)")   # pocas particiones grandes


Tall (userId) | tiempo (s): 4.8
Wide (rating) | tiempo (s): 5.11
