In [0]:
from pyspark import *
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, struct, collect_list, row_number
from pyspark.sql.functions import create_map, lit, map_from_entries


# Transformaciones con Tipos de Datos Complejos: Arrays, Structs y Maps

En este notebook:
- Trabajaremos con columnas tipo array y struct a partir del dataset MovieLens.
- Aplicaremos funciones como `explode` y crearemos structs a partir de varias columnas.
- Como bonus, construiremos columnas de tipo map con `create_map` y `map_from_entries`.


In [0]:
# Ratings table: the schema is declared explicitly instead of being inferred from the CSV.
ratings_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", DecimalType(precision=2, scale=1), True),
    StructField("timestamp", LongType(), True),
])

# NOTE(review): the "dateFormat" reader option is kept as-is; the schema above declares no
# DateType column, so it presumably has no effect here -- confirm before removing it.
# The numeric "timestamp" column is replaced by a "date" string formatted as yyyyMMdd.
ratingsDf = (
    spark.read
    .option("header", True)
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("dbfs:/FileStore/tables/ratings.csv")
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table: explicit schema; "genres" is read as one pipe-delimited string per movie.
movies_schema = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True),
])

movies_raw = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("dbfs:/FileStore/tables/movies.csv")
)

# The patterns are raw strings so that "\|", "\(" and "\)" reach Spark unchanged without
# relying on Python's deprecated tolerance for invalid escape sequences in normal strings.
GENRE_SEPARATOR_PATTERN = r"\|"
# Captures the digits inside the parentheses that close the title, e.g. "Toy Story (1995)".
YEAR_PATTERN = r"^.+\(([0-9]+)\)$"
# Captures the title without its trailing " (year)". The year group is optional so that a
# title with no year keeps its full text; regexp_extract returns an empty string when the
# pattern does not match, which would otherwise blank out such titles.
TITLE_PATTERN = r"^(.+?)(?: \([0-9]+\))?$"

# Single projection from the raw frame; resulting column order: movieId, genres, year, title.
#   genres -> array of genre names
#   year   -> integer parsed from the title (null when the title carries no year)
#   title  -> title text with the trailing " (year)" removed
moviesDf = movies_raw.select(
    "movieId",
    split(col("genres"), GENRE_SEPARATOR_PATTERN).alias("genres"),
    regexp_extract(col("title"), YEAR_PATTERN, 1).cast(IntegerType()).alias("year"),
    regexp_extract(col("title"), TITLE_PATTERN, 1).alias("title"),
)




In [0]:
# Group the ratings by user, gathering the rated movieIds into one array column per user.
user_movies = (
    ratingsDf
    .groupBy("userId")
    .agg(collect_list(col("movieId")).alias("peliculas_calificadas"))
)
user_movies.show(n=5, truncate=False)



+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Explode: expand the array so that there is one row per (user, movie) pair.
# All existing columns are kept and the exploded element is appended as "pelicula_id".
user_movies_exploded = user_movies.select(
    "*",
    explode(col("peliculas_calificadas")).alias("pelicula_id"),
)
user_movies_exploded.show(n=5)



+------+---------------------+-----------+
|userId|peliculas_calificadas|pelicula_id|
+------+---------------------+-----------+
|     1| [1, 3, 6, 47, 50,...|          1|
|     1| [1, 3, 6, 47, 50,...|          3|
|     1| [1, 3, 6, 47, 50,...|          6|
|     1| [1, 3, 6, 47, 50,...|         47|
|     1| [1, 3, 6, 47, 50,...|         50|
+------+---------------------+-----------+
only showing top 5 rows



In [0]:
# Struct: bundle the movie title and its genres into a single "info" column.
movies_struct = moviesDf.select(
    "*",
    struct(col("title"), col("genres")).alias("info"),
)
movies_struct.select(col("movieId"), col("info")).show(n=5, truncate=False)



+-------+--------------------------------------------------------------+
|movieId|info                                                          |
+-------+--------------------------------------------------------------+
|1      |{Toy Story, [Adventure, Animation, Children, Comedy, Fantasy]}|
|2      |{Jumanji, [Adventure, Children, Fantasy]}                     |
|3      |{Grumpier Old Men, [Comedy, Romance]}                         |
|4      |{Waiting to Exhale, [Comedy, Drama, Romance]}                 |
|5      |{Father of the Bride Part II, [Comedy]}                       |
+-------+--------------------------------------------------------------+
only showing top 5 rows



In [0]:
# BONUS: build a map-typed column (key = movieId, value = rating).
# Simple example: take three ratings of user 1 and wrap each row's (movieId, rating) pair
# in a one-entry map.
# The rows are sorted before limit() so the same three rows are selected on every run;
# limit() without an ordering may return any three rows.
sample = (
    ratingsDf
    .filter(col("userId") == 1)
    .orderBy("date", "movieId")
    .limit(3)
)
sample_map = sample.withColumn("movie_rating_map", create_map(col("movieId"), col("rating")))
sample_map.show()

+------+-------+------+--------+----------------+
|userId|movieId|rating|    date|movie_rating_map|
+------+-------+------+--------+----------------+
|     1|      1|   4.0|20000730|      {1 -> 4.0}|
|     1|      3|   4.0|20000730|      {3 -> 4.0}|
|     1|      6|   4.0|20000730|      {6 -> 4.0}|
+------+-------+------+--------+----------------+



In [0]:
# Keep the first three ratings of every user, for simplicity.
# "date" only has day granularity (yyyyMMdd), so several ratings of one user can share the
# same value; "movieId" is added as a tie-breaker so row_number() numbers the rows the same
# way on every run.
w = Window.partitionBy("userId").orderBy("date", "movieId")
ratings_sample = ratingsDf.withColumn("rn", row_number().over(w)).filter(col("rn") <= 3)

# For each user, gather the (movieId, rating) pairs into a list of structs.
user_pairs = ratings_sample.groupBy("userId").agg(
    collect_list(struct("movieId", "rating")).alias("peliculas_ratings")
)

# Turn the list of two-field structs into a map column (first field = key, second = value).
user_map = user_pairs.withColumn("mapa_ratings", map_from_entries(col("peliculas_ratings")))

# Show the result: every user now has a movieId -> rating map.
user_map.select("userId", "mapa_ratings").show(truncate=False)



+------+--------------------------------------+
|userId|mapa_ratings                          |
+------+--------------------------------------+
|1     |{1 -> 4.0, 3 -> 4.0, 6 -> 4.0}        |
|2     |{318 -> 3.0, 333 -> 4.0, 1704 -> 4.5} |
|3     |{31 -> 0.5, 527 -> 0.5, 647 -> 0.5}   |
|4     |{162 -> 5.0, 171 -> 3.0, 190 -> 2.0}  |
|5     |{1 -> 4.0, 21 -> 4.0, 34 -> 4.0}      |
|6     |{2 -> 4.0, 3 -> 5.0, 4 -> 3.0}        |
|7     |{1 -> 4.5, 50 -> 4.5, 58 -> 3.0}      |
|8     |{2 -> 4.0, 10 -> 2.0, 11 -> 4.0}      |
|9     |{41 -> 3.0, 187 -> 3.0, 223 -> 4.0}   |
|10    |{296 -> 1.0, 356 -> 3.5, 588 -> 4.0}  |
|11    |{356 -> 5.0, 1101 -> 5.0, 1840 -> 4.0}|
|12    |{39 -> 4.0, 168 -> 5.0, 222 -> 5.0}   |
|13    |{305 -> 1.0, 597 -> 3.0, 1173 -> 3.0} |
|14    |{4 -> 3.0, 7 -> 3.0, 19 -> 1.0}       |
|15    |{44 -> 1.0, 158 -> 1.0, 172 -> 1.0}   |
|16    |{47 -> 3.5, 50 -> 4.0, 111 -> 4.5}    |
|17    |{1 -> 4.5, 44 -> 3.5, 50 -> 4.5}      |
|18    |{47 -> 4.5, 50 -> 5.0, 110 -> 4.

In [0]:
# --- USING THE MAP ---
# Look up the value stored under key 1 (movieId 1) in each user's map.
# The key is passed as a plain literal: handing a Column such as lit(1) to getItem() is
# documented as deprecated in PySpark and is not needed for a constant key.
user_map_con_valor = user_map.withColumn("rating_pelicula_1", col("mapa_ratings").getItem(1))
user_map_con_valor.select("userId", "mapa_ratings", "rating_pelicula_1").show()



+------+--------------------+-----------------+
|userId|        mapa_ratings|rating_pelicula_1|
+------+--------------------+-----------------+
|     1|{1 -> 4.0, 3 -> 4...|              4.0|
|     2|{318 -> 3.0, 333 ...|             null|
|     3|{31 -> 0.5, 527 -...|             null|
|     4|{162 -> 5.0, 171 ...|             null|
|     5|{1 -> 4.0, 21 -> ...|              4.0|
|     6|{2 -> 4.0, 3 -> 5...|             null|
|     7|{1 -> 4.5, 50 -> ...|              4.5|
|     8|{2 -> 4.0, 10 -> ...|             null|
|     9|{41 -> 3.0, 187 -...|             null|
|    10|{296 -> 1.0, 356 ...|             null|
|    11|{356 -> 5.0, 1101...|             null|
|    12|{39 -> 4.0, 168 -...|             null|
|    13|{305 -> 1.0, 597 ...|             null|
|    14|{4 -> 3.0, 7 -> 3...|             null|
|    15|{44 -> 1.0, 158 -...|             null|
|    16|{47 -> 3.5, 50 ->...|             null|
|    17|{1 -> 4.5, 44 -> ...|              4.5|
|    18|{47 -> 4.5, 50 ->...|           

In [0]:
# Keep only the users whose rating for movie 1 is greater than or equal to 4.
(
    user_map_con_valor
    .where(col("rating_pelicula_1") >= 4)
    .select("userId", "rating_pelicula_1")
    .show()
)

+------+-----------------+
|userId|rating_pelicula_1|
+------+-----------------+
|     1|              4.0|
|     5|              4.0|
|     7|              4.5|
|    17|              4.5|
|    19|              4.0|
|    31|              5.0|
|    40|              5.0|
|    43|              5.0|
|    46|              5.0|
|    57|              5.0|
|    63|              5.0|
|    64|              4.0|
|    71|              5.0|
|    73|              4.5|
|    78|              4.0|
|    91|              4.0|
|    96|              5.0|
|    98|              4.5|
|   103|              4.0|
|   107|              4.0|
+------+-----------------+
only showing top 20 rows

