In [0]:
from pyspark import *
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, struct, collect_list, row_number

In [0]:
%fs ls /Volumes/big_data_ii_2025/spark_examples/spark_data

path,name,size,modificationTime
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/PADRON_COMPLETO.csv,PADRON_COMPLETO.csv,437542254,1750034299000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/distelec.csv,distelec.csv,175692,1750034273000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv,movies.csv,4192335,1750034274000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/padron_limpio.csv/,padron_limpio.csv/,0,1750039677659
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/por-ciclo-2016-2018.csv,por-ciclo-2016-2018.csv,904931,1750034273000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv,ratings_full.csv,933898879,1750034299000
dbfs:/Volumes/big_data_ii_2025/spark_examples/spark_data/tags.csv,tags.csv,85361813,1750034296000


# Manejo y Optimización de Particiones en Spark

En este notebook aprenderás a:
- Visualizar y modificar el número de particiones de un DataFrame.
- Usar las funciones `repartition` y `coalesce`.
- Guardar datos con particionado por columna y medir el impacto en el rendimiento.
Trabajaremos con el dataset MovieLens.


In [0]:
# Cargar datos MovieLens (ratings.csv) suponiendo que está en /dbfs/FileStore/movielens/ratings.csv
# Tabla Ratings
ratings_schema  = StructType(fields=[
    StructField("userId",IntegerType(),True), 
    StructField("movieId",IntegerType(),True),
    StructField("rating",DecimalType(precision=2,scale=1),True),
    StructField("timestamp",LongType(),True)
])
ratingsDf = spark.read\
    .option("header", True)\
    .option("dateFormat", "yyyyMMdd")\
    .schema(ratings_schema)\
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")\
    .withColumn(\
            "date",\
            from_unixtime("timestamp", "yyyyMMdd"))\
                .drop('timestamp')



In [0]:
ratingsDf.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: decimal(2,1) (nullable = true)
 |-- date: string (nullable = true)



In [0]:
print(f"Ratings count: {ratingsDf.count():,.0f}")



Ratings count: 33,832,162


In [0]:
# Ver número de particiones
print("Particiones iniciales:", ratingsDf.rdd.getNumPartitions())



[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkNotImplementedError[0m                Traceback (most recent call last)
File [0;32m<command-6212165207225491>, line 2[0m
[1;32m      1[0m [38;5;66;03m# Ver número de particiones[39;00m
[0;32m----> 2[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mParticiones iniciales:[39m[38;5;124m"[39m, ratingsDf[38;5;241m.[39mrdd[38;5;241m.[39mgetNumPartitions())

File [0;32m/databricks/python/lib/python3.11/site-packages/pyspark/sql/connect/dataframe.py:2432[0m, in [0;36mDataFrame.rdd[0;34m(self)[0m
[1;32m   2430[0m [38;5;129m@property[39m
[1;32m   2431[0m [38;5;28;01mdef[39;00m [38;5;21mrdd[39m([38;5;28mself[39m) [38;5;241m-[39m[38;5;241m>[39m [38;5;124m"[39m[38;5;124mRDD[Row][39m[38;5;124m"[39m:
[0;32m-> 2432[0m     [38;5;28;01mraise[39;00m PySparkNotImplementedError(
[1;32m   2433[0m         error_class[38;5;241m=[39m[38;5;124m"[39m[38;5;1

In [0]:
%time
# Cambiar el número de particiones a 16
df_repart = ratingsDf.repartition(16)
print("Después de repartition(16):", df_repart.rdd.getNumPartitions())



Después de repartition(16): 16


In [0]:
%time
# Reducir a 2 particiones con coalesce
df_coalesce = df_repart.coalesce(2)
print("Después de coalesce(2):", df_coalesce.rdd.getNumPartitions())



Después de coalesce(2): 2


In [0]:
%time
# Guardar datos particionando por 'rating'
import time
start = time.time()
output_path = "/Volumes/big_data_ii_2025/spark_examples/spark_data/rating_by_rating"
ratingsDf.write.mode("overwrite").partitionBy("rating").parquet(output_path)
print("Tiempo:", time.time() - start, "segundos")


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs
Tiempo: 104.14708518981934 segundos


In [0]:
%%sh
ls -las /Volumes/big_data_ii_2025/spark_examples/spark_data/rating_by_rating/

total 8
4 drwxr-xr-x 2 root root 4096 Jun  8 20:18 .
4 drwxrwxrwt 1 root root 4096 Jun  8 20:18 ..


In [0]:
print("Particiones originales:", ratingsDf.rdd.getNumPartitions())



Particiones originales: 8


In [0]:
def medir_tiempo_particiones(df, num_particiones):
    df_mod = df.repartition(num_particiones)
    print(f"\nUsando {num_particiones} particiones:")
    start = time.time()
    # Operación costosa: groupBy + count
    resultado = df_mod.groupBy("movieId").count().collect()
    print("Tiempo:", round(time.time() - start, 2), "segundos")



In [0]:
for n in [2, 4, 8, 16, 32]:
    medir_tiempo_particiones(ratingsDf, n)


Usando 2 particiones:
Tiempo: 63.93 segundos

Usando 4 particiones:
Tiempo: 63.58 segundos

Usando 8 particiones:
Tiempo: 64.058 segundos

Usando 16 particiones:
Tiempo: 65.2 segundos

Usando 32 particiones:
Tiempo: 66.018 segundos
