In [0]:
#import findspark
#findspark.init('/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType

# Sesion

Lo primero siempre es crear la sesión de Spark. La sesión permite a todos los procesos involucrados compartir contexto.

In [0]:
# Create the Spark session, or reuse the one already running: getOrCreate()
# returns the existing session when the platform has provided one, so this is
# safe both on a managed cluster and on a plain local kernel.
spark = SparkSession.builder.appName("movielens").getOrCreate()


# Carga de datos

En la siguiente celda realizamos la carga de los datos del dataset de MovieLens. En este caso vamos a cargar 3 de las tablas:

* ratings
* movies
* tags

Primero leemos los archivos sin schema para inspeccionar su contenido; los schemas se definen más adelante:

In [0]:
# First pass: load the three CSV files with default reader settings (no header
# handling, no schema) just to look at their raw layout.
ratingsDf = spark.read.format("csv").load("dbfs:/FileStore/tables/ratings.csv")
moviesDf = spark.read.format("csv").load("dbfs:/FileStore/tables/movies.csv")
tagsDf = spark.read.format("csv").load("dbfs:/FileStore/tables/tags.csv")

Revisamos primero el dataframe de ratings

In [0]:
# Pull the first two rows back to the driver as Row objects.
ratingsDf.head(n=2)

Out[4]: [Row(_c0='userId', _c1='movieId', _c2='rating', _c3='timestamp'),
 Row(_c0='1', _c1='1', _c2='4.0', _c3='964982703')]

In [0]:
# Same two rows, rendered as a text table.
ratingsDf.show(n=2)

+------+-------+------+---------+
|   _c0|    _c1|   _c2|      _c3|
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
|     1|      1|   4.0|964982703|
+------+-------+------+---------+
only showing top 2 rows



Vemos que tiene header, y los tipos de datos, incluyendo el ultimo que es un timestamp. Podemos definir el schema:

In [0]:
# Explicit schema for ratings.csv: integer ids, a rating with one decimal
# digit, and the raw numeric timestamp kept as a long. All columns nullable.
ratings_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DecimalType(precision=2, scale=1), True)
    .add("timestamp", LongType(), True)
)

In [0]:
# Re-read ratings.csv, this time consuming the header row and applying the
# explicit schema. No dateFormat option is set: ratings_schema contains no
# date column, so that option had nothing to act on.
ratingsDf = (
    spark.read
    .option("header", True)
    .schema(ratings_schema)
    .csv("dbfs:/FileStore/tables/ratings.csv")
)
ratingsDf.head(2)

Out[7]: [Row(userId=1, movieId=1, rating=Decimal('4.0'), timestamp=964982703),
 Row(userId=1, movieId=3, rating=Decimal('4.0'), timestamp=964981247)]

In [0]:
# Print the column names and types of the typed ratings DataFrame.
ratingsDf.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: decimal(2,1) (nullable = true)
 |-- timestamp: long (nullable = true)



In [0]:
# Explicit schema for movies.csv: integer id plus two free-text columns.
movies_schema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# Re-read movies.csv with the header consumed and the schema applied.
moviesDf = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("dbfs:/FileStore/tables/movies.csv")
)
moviesDf.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [0]:
# Preview the first five movies.
moviesDf.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



Notamos dos cosas:

1. En la columna `genres` vienen multiples valores separados por `|`
1. En la columna `title` viene tanto el nombre de la película como el año en que salió

Queremos separar esto.

In [0]:
from pyspark.sql.functions import split, col

# split() turns a string column into an array column. Its pattern is a regular
# expression, so the pipe has to be escaped; the raw string passes the
# backslash through without relying on Python tolerating the unknown "\|"
# escape sequence.
moviesDf.select(split(col("genres"), r"\|").alias("genresSplit")).show()


+--------------------+
|         genresSplit|
+--------------------+
|[Adventure, Anima...|
|[Adventure, Child...|
|   [Comedy, Romance]|
|[Comedy, Drama, R...|
|            [Comedy]|
|[Action, Crime, T...|
|   [Comedy, Romance]|
|[Adventure, Child...|
|            [Action]|
|[Action, Adventur...|
|[Comedy, Drama, R...|
|    [Comedy, Horror]|
|[Adventure, Anima...|
|             [Drama]|
|[Action, Adventur...|
|      [Crime, Drama]|
|    [Drama, Romance]|
|            [Comedy]|
|            [Comedy]|
|[Action, Comedy, ...|
+--------------------+
only showing top 20 rows



O de otra forma:

In [0]:
# Same split, but appended as an extra column next to the original ones.
# Raw string: the pattern is a regex and "\|" is not a valid Python escape.
moviesDf.withColumn("genresSplit", split(moviesDf["genres"], r"\|")).show()


+-------+--------------------+--------------------+--------------------+
|movieId|               title|              genres|         genresSplit|
+-------+--------------------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|[Adventure, Anima...|
|      2|      Jumanji (1995)|Adventure|Childre...|[Adventure, Child...|
|      3|Grumpier Old Men ...|      Comedy|Romance|   [Comedy, Romance]|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|[Comedy, Drama, R...|
|      5|Father of the Bri...|              Comedy|            [Comedy]|
|      6|         Heat (1995)|Action|Crime|Thri...|[Action, Crime, T...|
|      7|      Sabrina (1995)|      Comedy|Romance|   [Comedy, Romance]|
|      8| Tom and Huck (1995)|  Adventure|Children|[Adventure, Child...|
|      9| Sudden Death (1995)|              Action|            [Action]|
|     10|    GoldenEye (1995)|Action|Adventure|...|[Action, Adventur...|
|     11|American Presiden...|Comedy|Drama|Romance|

In [0]:
# Keep the result: moviesDfSplit is moviesDf plus the array column genresSplit.
# Raw string: the pattern is a regex and "\|" is not a valid Python escape.
moviesDfSplit = moviesDf.withColumn("genresSplit", split(moviesDf["genres"], r"\|"))
moviesDfSplit.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)



No es necesario mantener ambas columnas, entonces es posible eliminar la columna `genres`

In [0]:
# drop() returns a new DataFrame; the result is not assigned here, so
# moviesDfSplit itself still has the `genres` column after this cell.
moviesDfSplit.drop("genres")

Out[14]: DataFrame[movieId: int, title: string, genresSplit: array<string>]

Ahora revisamos el dataframe

In [0]:
# Inspect moviesDfSplit again: `genres` is still listed because the previous
# drop() result was never assigned.
moviesDfSplit.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)



Los dataframes son inmutables, por lo que lo que vimos anteriormente fue un nuevo dataframe. Para conservar el cambio tenemos que asignar el resultado a una variable.

In [0]:
# Rebind the name to the DataFrame returned by drop() so the removal of
# `genres` is actually kept.
moviesDfSplit = moviesDfSplit.drop("genres")
moviesDfSplit.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)



Ahora sucede lo mismo con la columna `title`. Podemos extraer el nombre de la película y el año y crear columnas específicas para cada dato. Usamos expresiones regulares con la función [regexp_extract](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html#pyspark.sql.functions.regexp_extract)

In [0]:
from pyspark.sql.functions import regexp_extract, col

# Preview a `year` column: group 1 captures the digits inside the parentheses
# that close the title. The pattern is a raw string because "\(" and "\)" are
# regex escapes, not Python ones. The column is referenced by name so it
# resolves against moviesDfSplit rather than the parent DataFrame moviesDf.
moviesDfSplit.withColumn("year", regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1)).show()



+-------+--------------------+--------------------+----+
|movieId|               title|         genresSplit|year|
+-------+--------------------+--------------------+----+
|      1|    Toy Story (1995)|[Adventure, Anima...|1995|
|      2|      Jumanji (1995)|[Adventure, Child...|1995|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|1995|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|1995|
|      5|Father of the Bri...|            [Comedy]|1995|
|      6|         Heat (1995)|[Action, Crime, T...|1995|
|      7|      Sabrina (1995)|   [Comedy, Romance]|1995|
|      8| Tom and Huck (1995)|[Adventure, Child...|1995|
|      9| Sudden Death (1995)|            [Action]|1995|
|     10|    GoldenEye (1995)|[Action, Adventur...|1995|
|     11|American Presiden...|[Comedy, Drama, R...|1995|
|     12|Dracula: Dead and...|    [Comedy, Horror]|1995|
|     13|        Balto (1995)|[Adventure, Anima...|1995|
|     14|        Nixon (1995)|             [Drama]|1995|
|     15|Cutthroat Island ...|[

In [0]:
# Check the type of the extracted column (regexp_extract yields a string).
# Raw-string pattern and by-name column reference, see regex escapes "\(" "\)".
moviesDfSplit.withColumn("year", regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1)).printSchema()


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- year: string (nullable = true)



In [0]:
# Same extraction, now cast to integer. The pattern is a raw string because
# "\(" and "\)" are regex escapes, not Python ones; the column is referenced
# by name so it resolves against moviesDfSplit.
moviesDfSplit.withColumn(
    "year",
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1).cast(IntegerType()),
).show()


+-------+--------------------+--------------------+----+
|movieId|               title|         genresSplit|year|
+-------+--------------------+--------------------+----+
|      1|    Toy Story (1995)|[Adventure, Anima...|1995|
|      2|      Jumanji (1995)|[Adventure, Child...|1995|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|1995|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|1995|
|      5|Father of the Bri...|            [Comedy]|1995|
|      6|         Heat (1995)|[Action, Crime, T...|1995|
|      7|      Sabrina (1995)|   [Comedy, Romance]|1995|
|      8| Tom and Huck (1995)|[Adventure, Child...|1995|
|      9| Sudden Death (1995)|            [Action]|1995|
|     10|    GoldenEye (1995)|[Action, Adventur...|1995|
|     11|American Presiden...|[Comedy, Drama, R...|1995|
|     12|Dracula: Dead and...|    [Comedy, Horror]|1995|
|     13|        Balto (1995)|[Adventure, Anima...|1995|
|     14|        Nixon (1995)|             [Drama]|1995|
|     15|Cutthroat Island ...|[

In [0]:
# Confirm that the cast turns `year` into an integer column. Raw-string regex
# (the "\(" / "\)" escapes belong to the regex) and by-name column reference.
moviesDfSplit.withColumn(
    "year",
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1).cast(IntegerType()),
).printSchema()


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- year: integer (nullable = true)



In [0]:
# Persist the `year` column: capture the digits inside the parentheses that
# close the title and cast them to integer. The pattern is a raw string
# because "\(" and "\)" are regex escapes, not Python ones, and the column is
# referenced by name so it resolves against moviesDfSplit itself.
moviesDfSplit = moviesDfSplit.withColumn(
    "year",
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1).cast(IntegerType()),
)


Existen muchas funciones similares que se pueden utilizar, la documentación está en [Spark SQL Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html)

In [0]:
# Preview the first five rows with the new `year` column.
moviesDfSplit.show(n=5)

+-------+--------------------+--------------------+----+
|movieId|               title|         genresSplit|year|
+-------+--------------------+--------------------+----+
|      1|    Toy Story (1995)|[Adventure, Anima...|1995|
|      2|      Jumanji (1995)|[Adventure, Child...|1995|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|1995|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|1995|
|      5|Father of the Bri...|            [Comedy]|1995|
+-------+--------------------+--------------------+----+
only showing top 5 rows



Hacemos lo mismo para obtener el titulo

In [0]:
# Strip the trailing " (year)" from the title, keeping only the name.
# The source column is referenced by name rather than through IPython's `_`
# (the last cell output), which only worked when an earlier cell happened to
# have returned a DataFrame with a `title` column.
# The cleaned value is built in a temporary column, then swapped in for the
# original, which leaves `title` as the last column.
moviesDfSplit = (
    moviesDfSplit
    .withColumn(
        "title_temp",
        regexp_extract(col("title"), r"^(.+?) \([0-9]+\)$", 1),
    )
    .drop("title")
    .withColumnRenamed("title_temp", "title")
)
moviesDfSplit.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- genresSplit: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- year: integer (nullable = true)
 |-- title: string (nullable = true)



In [0]:
# Preview the first five rows with the cleaned title.
moviesDfSplit.show(n=5)

+-------+--------------------+----+--------------------+
|movieId|         genresSplit|year|               title|
+-------+--------------------+----+--------------------+
|      1|[Adventure, Anima...|1995|           Toy Story|
|      2|[Adventure, Child...|1995|             Jumanji|
|      3|   [Comedy, Romance]|1995|    Grumpier Old Men|
|      4|[Comedy, Drama, R...|1995|   Waiting to Exhale|
|      5|            [Comedy]|1995|Father of the Bri...|
+-------+--------------------+----+--------------------+
only showing top 5 rows



Por ultimo hacemos lo mismo con la tabla de tags

In [0]:
# Look at the raw tags load (default of 20 rows spelled out).
tagsDf.show(n=20)

+------+-------+-----------------+----------+
|   _c0|    _c1|              _c2|       _c3|
+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|

In [0]:
# Explicit schema for tags.csv: integer ids, the free-text tag, and the raw
# numeric timestamp kept as a long. All columns nullable.
tags_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("tag", StringType(), True)
    .add("timestamp", LongType(), True)
)

# Re-read tags.csv with the header consumed and the schema applied.
tagsDf = (
    spark.read
    .option("header", True)
    .schema(tags_schema)
    .csv("dbfs:/FileStore/tables/tags.csv")
)

tagsDf.show()

+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
+------+-------+-----------------+----------+
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|1456948283|
|    18|  52604|  Anthony Hopkins|

In [0]:
# Print the column names and types of the typed tags DataFrame.
tagsDf.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: long (nullable = true)



In [0]:
# Summary statistics (count, mean, stddev, min, max) for the tags columns.
tagsDf.describe().show()

+-------+------------------+-----------------+-----------+--------------------+
|summary|            userId|          movieId|        tag|           timestamp|
+-------+------------------+-----------------+-----------+--------------------+
|  count|              3683|             3683|       3683|                3683|
|   mean| 431.1493347814282|27252.01357588922|       null| 1.320031966823785E9|
| stddev|158.47255348483532|43490.55880276775|       null|1.7210245043712625E8|
|    min|                 2|                1|"""artsy"""|          1137179352|
|    max|               610|           193565|    zombies|          1537098603|
+-------+------------------+-----------------+-----------+--------------------+



Finalmente, convertimos el timestamp en una fecha:

In [0]:
from pyspark.sql.functions import from_unixtime

# Schema for tags.csv, declared again so this cell can be re-run on its own.
tags_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("tag", StringType(), True)
    .add("timestamp", LongType(), True)
)

# Reload from the file, derive `date` from the numeric timestamp formatted as
# "yyyyMMdd", and discard the original timestamp column, all in one chain.
tagsDf = (
    spark.read
    .option("header", True)
    .schema(tags_schema)
    .csv("dbfs:/FileStore/tables/tags.csv")
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop('timestamp')
)

tagsDf.show()

+------+-------+-----------------+--------+
|userId|movieId|              tag|    date|
+------+-------+-----------------+--------+
|     2|  60756|            funny|20151024|
|     2|  60756|  Highly quotable|20151024|
|     2|  60756|     will ferrell|20151024|
|     2|  89774|     Boxing story|20151024|
|     2|  89774|              MMA|20151024|
|     2|  89774|        Tom Hardy|20151024|
|     2| 106782|            drugs|20151024|
|     2| 106782|Leonardo DiCaprio|20151024|
|     2| 106782|  Martin Scorsese|20151024|
|     7|  48516|     way too long|20070125|
|    18|    431|        Al Pacino|20160501|
|    18|    431|         gangster|20160501|
|    18|    431|            mafia|20160501|
|    18|   1221|        Al Pacino|20160426|
|    18|   1221|            Mafia|20160426|
|    18|   5995|        holocaust|20160217|
|    18|   5995|       true story|20160217|
|    18|  44665|     twist ending|20160302|
|    18|  52604|  Anthony Hopkins|20160310|
|    18|  52604|  courtroom dram