In [0]:
from pyspark import *
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, struct, collect_list, row_number
from pyspark.sql.functions import create_map, lit, map_from_entries


# Transformaciones con Tipos de Datos Complejos: Arrays, Structs y Maps

En este notebook:
- Trabajaremos con columnas de tipo array, struct y map a partir del dataset MovieLens.
- Aplicaremos funciones como `explode` y crearemos structs y maps a partir de varias columnas.


In [0]:
# Ratings table
# Explicit schema so Spark does not have to infer the column types from the CSV.
ratings_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DecimalType(2, 1), True)
    .add("timestamp", LongType(), True)
)

# NOTE(review): the schema declares no DateType column, so "dateFormat" looks
# unused here — confirm before removing it.
ratings_reader = (
    spark.read
    .options(header=True, dateFormat="yyyyMMdd")
    .schema(ratings_schema)
)
ratings_csv = ratings_reader.csv(
    "/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv"
)

# Replace the numeric "timestamp" column with a "date" string formatted as yyyyMMdd.
ratingsDf = (
    ratings_csv
    .withColumn("date", from_unixtime(col("timestamp"), "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table
movies_schema = StructType(fields=[
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True),
])

# Raw CSV rows: title still carries the "(year)" suffix and genres is a
# pipe-separated string.
movies_raw = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv")
)

# Derive the final columns in a single projection (same column order as before:
# movieId, genres, year, title). The regex patterns are raw strings so that
# "\|" and "\(" are not treated as (invalid) Python escape sequences.
# NOTE(review): regexp_extract yields "" when the pattern does not match, so a
# title that does not end in "(digits)" becomes an empty title and a NULL year.
moviesDf = movies_raw.select(
    "movieId",
    # "Adventure|Comedy" -> ["Adventure", "Comedy"]
    split(col("genres"), r"\|").alias("genres"),
    # Digits inside the trailing parentheses, e.g. "Toy Story (1995)" -> 1995.
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1)
    .try_cast(IntegerType())
    .alias("year"),
    # Title without the trailing " (year)" suffix.
    regexp_extract(col("title"), r"^(.+?) \([0-9]+\)$", 1).alias("title"),
)




In [0]:
# Group the ratings by user into a single array column of movieIds.
# collect_list gives no ordering guarantee for the elements of the array.
user_movies = ratingsDf.groupBy("userId").agg(
    collect_list("movieId").alias("peliculas_calificadas")
)

# Cap the printed width: a user's array can hold a very large number of
# movieIds, and truncate=False would print each of them in full.
user_movies.show(5, truncate=100)



+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Explode: expand the array so there is one row per (user, movie) pair.
# The original array column is kept alongside the new scalar column.
exploded_movie_id = explode(col("peliculas_calificadas"))
user_movies_exploded = user_movies.withColumn("pelicula_id", exploded_movie_id)
user_movies_exploded.show(n=5)



+------+---------------------+-----------+
|userId|peliculas_calificadas|pelicula_id|
+------+---------------------+-----------+
|  1021|   [215, 1704, 79132]|        215|
|  1021|   [215, 1704, 79132]|       1704|
|  1021|   [215, 1704, 79132]|      79132|
|  2737| [1, 2, 6, 16, 19,...|          1|
|  2737| [1, 2, 6, 16, 19,...|          2|
+------+---------------------+-----------+
only showing top 5 rows


In [0]:
# Struct: pack the movie title and its genre list into one nested "info" column.
movie_info = struct(col("title"), col("genres"))
movies_struct = moviesDf.withColumn("info", movie_info)
movies_struct.select(col("movieId"), col("info")).show(n=5, truncate=False)



+-------+--------------------------------------------------------------+
|movieId|info                                                          |
+-------+--------------------------------------------------------------+
|1      |{Toy Story, [Adventure, Animation, Children, Comedy, Fantasy]}|
|2      |{Jumanji, [Adventure, Children, Fantasy]}                     |
|3      |{Grumpier Old Men, [Comedy, Romance]}                         |
|4      |{Waiting to Exhale, [Comedy, Drama, Romance]}                 |
|5      |{Father of the Bride Part II, [Comedy]}                       |
+-------+--------------------------------------------------------------+
only showing top 5 rows


In [0]:
# Build a map column (key=movieId, value=rating): one single-entry map per row.
# Simple example: take three ratings of user 1 only.
# limit() without an ordering may return any three rows, so sort by movieId
# first to make the cell reproducible across runs.
sample = ratingsDf.filter(col("userId") == 1).orderBy("movieId").limit(3)
sample_map = sample.withColumn("movie_rating_map", create_map(col("movieId"), col("rating")))
sample_map.show()

+------+-------+------+--------+----------------+
|userId|movieId|rating|    date|movie_rating_map|
+------+-------+------+--------+----------------+
|     1|      1|   4.0|20081103|      {1 -> 4.0}|
|     1|    110|   4.0|20081105|    {110 -> 4.0}|
|     1|    158|   4.0|20081103|    {158 -> 4.0}|
+------+-------+------+--------+----------------+



In [0]:
# Keep the first three ratings of each user, for simplicity.
# "date" only has day granularity (yyyyMMdd), so ratings made on the same day
# tie; movieId is added as a tiebreaker so row_number() picks the same rows on
# every run.
w = Window.partitionBy("userId").orderBy("date", "movieId")
ratings_sample = ratingsDf.withColumn("rn", row_number().over(w)).filter(col("rn") <= 3)

# For each user, gather the (movieId, rating) pairs into a list of structs.
user_pairs = ratings_sample.groupBy("userId").agg(
    collect_list(struct("movieId", "rating")).alias("peliculas_ratings")
)

# Turn the list of two-field structs into a Map column (movieId -> rating).
# NOTE(review): assumes a user rates each movie at most once, i.e. no duplicate
# keys per user — confirm against the dataset.
user_map = user_pairs.withColumn("mapa_ratings", map_from_entries(col("peliculas_ratings")))

# Show the result: each user now has one Map movieId -> rating.
user_map.select("userId", "mapa_ratings").show(truncate=False)



+------+---------------------------------------+
|userId|mapa_ratings                           |
+------+---------------------------------------+
|15    |{16 -> 4.5, 50 -> 4.0, 223 -> 3.5}     |
|20    |{256 -> 3.5, 2717 -> 3.0, 88744 -> 3.5}|
|24    |{1 -> 4.5, 9 -> 2.5, 11 -> 4.0}        |
|29    |{1198 -> 4.5, 4025 -> 5.0, 4027 -> 3.0}|
|38    |{356 -> 5.0, 1721 -> 4.0, 5444 -> 2.0} |
|39    |{2 -> 1.5, 29 -> 4.5, 45 -> 4.0}       |
|52    |{239 -> 4.5, 1261 -> 5.0, 2291 -> 4.5} |
|56    |{5 -> 2.0, 31 -> 1.5, 112 -> 3.5}      |
|87    |{1 -> 0.5, 10 -> 1.0, 34 -> 0.5}       |
|88    |{1 -> 5.0, 3 -> 3.0, 5 -> 4.0}         |
|94    |{52 -> 3.5, 147 -> 3.0, 163 -> 3.0}    |
|99    |{50 -> 4.0, 58 -> 4.0, 260 -> 4.0}     |
|102   |{39 -> 4.0, 318 -> 4.0, 356 -> 4.5}    |
|108   |{12 -> 4.5, 267 -> 1.5, 318 -> 5.0}    |
|109   |{1 -> 5.0, 260 -> 5.0, 356 -> 5.0}     |
|118   |{39 -> 3.0, 60 -> 4.0, 146 -> 5.0}     |
|120   |{169 -> 0.5, 277 -> 2.5, 374 -> 2.5}   |
|121   |{2571 -> 3.5

In [0]:
# --- USING THE MAP ---
# Look up key 1 (movieId 1) in each user's map; users whose map lacks that key
# get NULL in the new column.
# The key is passed as a plain Python value: getItem() with a Column key is
# deprecated since Spark 3.0 and emits a warning.
user_map_con_valor = user_map.withColumn("rating_pelicula_1", col("mapa_ratings").getItem(1))
user_map_con_valor.select("userId", "mapa_ratings", "rating_pelicula_1").show()





+------+--------------------+-----------------+
|userId|        mapa_ratings|rating_pelicula_1|
+------+--------------------+-----------------+
|    15|{16 -> 4.5, 50 ->...|             NULL|
|    20|{256 -> 3.5, 2717...|             NULL|
|    24|{1 -> 4.5, 9 -> 2...|              4.5|
|    29|{1198 -> 4.5, 402...|             NULL|
|    38|{356 -> 5.0, 1721...|             NULL|
|    39|{2 -> 1.5, 29 -> ...|             NULL|
|    52|{239 -> 4.5, 1261...|             NULL|
|    56|{5 -> 2.0, 31 -> ...|             NULL|
|    87|{1 -> 0.5, 10 -> ...|              0.5|
|    88|{1 -> 5.0, 3 -> 3...|              5.0|
|    94|{52 -> 3.5, 147 -...|             NULL|
|    99|{50 -> 4.0, 58 ->...|             NULL|
|   102|{39 -> 4.0, 318 -...|             NULL|
|   108|{12 -> 4.5, 267 -...|             NULL|
|   109|{1 -> 5.0, 260 ->...|              5.0|
|   118|{39 -> 3.0, 60 ->...|             NULL|
|   120|{169 -> 0.5, 277 ...|             NULL|
|   121|{2571 -> 3.5, 295...|           

In [0]:
# Keep only the users who rated movie 1 with a rating of 4 or higher.
# Rows where rating_pelicula_1 is NULL do not satisfy the comparison and drop out.
(
    user_map_con_valor
    .where(col("rating_pelicula_1") >= 4)
    .select("userId", "rating_pelicula_1")
    .show()
)

+------+-----------------+
|userId|rating_pelicula_1|
+------+-----------------+
|    24|              4.5|
|    88|              5.0|
|   109|              5.0|
|   130|              5.0|
|   229|              4.0|
|   439|              4.0|
|   446|              4.0|
|   465|              5.0|
|   537|              4.0|
|   581|              4.5|
|   641|              5.0|
|   663|              4.0|
|   670|              5.0|
|   692|              5.0|
|   716|              4.5|
|  1007|              4.5|
|  1015|              4.0|
|  1059|              4.0|
|  1110|              4.0|
|  1187|              4.5|
+------+-----------------+
only showing top 20 rows
