In [1]:
import findspark
findspark.init('/Users/joseaguilar/Documents/Development/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains



spark = SparkSession.builder.appName("movielens").getOrCreate()

24/06/27 21:28:10 WARN Utils: Your hostname, Joses-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.100.124 instead (on interface en0)
24/06/27 21:28:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/27 21:28:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Consultas sobre Movielens

## Schema de Movielens

![Schema](movielens_schema.png)

## ¿Cuál es la distribución de la clasificación de las películas? 

Proceso:

1. Cargar la tabla ratings
1. Cargar la tabla movies
1. Unir ambas tablas
1. Agrupar por calificación
1. Contar las películas


### Cargar las tablas

In [2]:
# Tabla Ratings
ratings_schema  = StructType(fields=[
    StructField("userId",IntegerType(),True), 
    StructField("movieId",IntegerType(),True),
    StructField("rating",DecimalType(precision=2,scale=1),True),
    StructField("timestamp",LongType(),True)
])
ratingsDf = spark.read\
    .option("header", True)\
    .option("dateFormat", "yyyyMMdd")\
    .schema(ratings_schema)\
    .csv("./ml-latest-small/ratings.csv")\
    .withColumn(\
            "date",\
            from_unixtime("timestamp", "yyyyMMdd"))\
                .drop('timestamp')

# Tabla Movies
movies_schema  = StructType(fields=[
    StructField("movieId",IntegerType(),True), 
    StructField("title",StringType(),True),
    StructField("genres",StringType(),True)
])

moviesDf = spark.read\
    .option("header", True)\
    .schema(movies_schema)\
    .csv("./ml-latest-small/movies.csv")

moviesDf = moviesDf.withColumn("genresSplit", split(moviesDf["genres"],"\|"))\
                        .drop('genres').withColumnRenamed("genresSplit","genres")\
                            .withColumn(\
                                "year",\
                                regexp_extract(\
                                           moviesDf["title"],\
                                           "^.+\(([0-9]+)\)$",\
                                           1)\
                                .cast(IntegerType()))\
                            .withColumn(\
                            "title_temp",\
                            regexp_extract(\
                                           moviesDf["title"],\
                                           "^(.+?) \([0-9]+\)$",\
                                           1))\
                            .drop('title')\
                        .withColumnRenamed("title_temp","title")

### Unir Ambas tablas

In [3]:
movie_ratingsDF = ratingsDf.join(moviesDf,on="movieId",how="inner")
movie_ratingsDF.head(3)

[Row(movieId=1, userId=1, rating=Decimal('4.0'), date='20000730', genres=['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'], year=1995, title='Toy Story'),
 Row(movieId=3, userId=1, rating=Decimal('4.0'), date='20000730', genres=['Comedy', 'Romance'], year=1995, title='Grumpier Old Men'),
 Row(movieId=6, userId=1, rating=Decimal('4.0'), date='20000730', genres=['Action', 'Crime', 'Thriller'], year=1995, title='Heat')]

### Mostrar la tabla

In [4]:
movie_ratingsDF.show(2)

+-------+------+------+--------+--------------------+----+----------------+
|movieId|userId|rating|    date|              genres|year|           title|
+-------+------+------+--------+--------------------+----+----------------+
|      1|     1|   4.0|20000730|[Adventure, Anima...|1995|       Toy Story|
|      3|     1|   4.0|20000730|   [Comedy, Romance]|1995|Grumpier Old Men|
+-------+------+------+--------+--------------------+----+----------------+
only showing top 2 rows



Obtener todas las películas con una calificación superior a 4.

In [6]:
movie_ratingsDF.createOrReplaceTempView("movies")
high_rated_movies = spark.sql("SELECT * FROM movies WHERE rating > 4")
high_rated_movies.show(5)

+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|     47|     1|   5.0|20000730| [Mystery, Thriller]|1995|Seven (a.k.a. Se7en)|
|     50|     1|   5.0|20000730|[Crime, Mystery, ...|1995| Usual Suspects, The|
|    101|     1|   5.0|20000730|[Adventure, Comed...|1996|       Bottle Rocket|
|    151|     1|   5.0|20000730|[Action, Drama, R...|1995|             Rob Roy|
|    157|     1|   5.0|20000730|       [Comedy, War]|1995|      Canadian Bacon|
+-------+------+------+--------+--------------------+----+--------------------+
only showing top 5 rows



Calcular la calificación promedio por año.

In [9]:
avg_ratings_by_year = spark.sql("SELECT year, AVG(rating) as avg_rating FROM movies GROUP BY year ORDER BY year DESC")
avg_ratings_by_year.show(10)

+----+----------+
|year|avg_rating|
+----+----------+
|2018|   3.48352|
|2017|   3.57809|
|2016|   3.38726|
|2015|   3.41039|
|2014|   3.51214|
|2013|   3.45712|
|2012|   3.51155|
|2011|   3.46175|
|2010|   3.56910|
|2009|   3.56897|
+----+----------+
only showing top 10 rows



Convertir la fecha de string a timestamp y extraer el año.

In [13]:
from pyspark.sql.functions import col, year, to_date

movie_ratingsDF = movie_ratingsDF.withColumn("date", to_date(col("date"), "yyyyMMdd"))
movie_ratingsDF = movie_ratingsDF.withColumn("year2", year(col("date2")))
movie_ratingsDF.show(20)


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `date2` cannot be resolved. Did you mean one of the following? [`date`, `title`, `rating`, `year`, `genres`].;
'Project [movieId#1, userId#0, rating#2, date#379, genres#34, year#335, title#53, year('date2) AS year2#387]
+- Project [movieId#1, userId#0, rating#2, to_date(date#327, Some(yyyyMMdd), Some(America/Costa_Rica), false) AS date#379, genres#34, year#335, title#53]
   +- Project [movieId#1, userId#0, rating#2, date#327, genres#34, year(date#327) AS year#335, title#53]
      +- Project [movieId#1, userId#0, rating#2, to_date(date#275, Some(yyyyMMdd), Some(America/Costa_Rica), false) AS date#327, genres#34, year#283, title#53]
         +- Project [movieId#1, userId#0, rating#2, date#275, genres#34, year(date#275) AS year#283, title#53]
            +- Project [movieId#1, userId#0, rating#2, to_date(date#8, Some(yyyyMMdd), Some(America/Costa_Rica), false) AS date#275, genres#34, year#38, title#53]
               +- Project [movieId#1, userId#0, rating#2, date#8, genres#34, year#38, title#53]
                  +- Join Inner, (movieId#1 = movieId#19)
                     :- Project [userId#0, movieId#1, rating#2, date#8]
                     :  +- Project [userId#0, movieId#1, rating#2, timestamp#3L, from_unixtime(timestamp#3L, yyyyMMdd, Some(America/Costa_Rica)) AS date#8]
                     :     +- Relation [userId#0,movieId#1,rating#2,timestamp#3L] csv
                     +- Project [movieId#19, genres#34, year#38, title_temp#43 AS title#53]
                        +- Project [movieId#19, genres#34, year#38, title_temp#43]
                           +- Project [movieId#19, title#20, genres#34, year#38, regexp_extract(title#20, ^(.+?) \([0-9]+\)$, 1) AS title_temp#43]
                              +- Project [movieId#19, title#20, genres#34, cast(regexp_extract(title#20, ^.+\(([0-9]+)\)$, 1) as int) AS year#38]
                                 +- Project [movieId#19, title#20, genresSplit#25 AS genres#34]
                                    +- Project [movieId#19, title#20, genresSplit#25]
                                       +- Project [movieId#19, title#20, genres#21, split(genres#21, \|, -1) AS genresSplit#25]
                                          +- Relation [movieId#19,title#20,genres#21] csv


Filtrar películas del género 'Comedy' y ordenarlas por calificación descendente.

In [14]:
comedy_movies = spark.sql("""
  SELECT * 
  FROM movies 
  WHERE array_contains(genres, 'Comedy') 
  ORDER BY rating DESC
""")
comedy_movies.show()


+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|   2141|     1|   5.0|20000730|[Adventure, Anima...|1986|   American Tail, An|
|  60756|     2|   5.0|20151024|            [Comedy]|2008|       Step Brothers|
|   2387|     1|   5.0|20000730|     [Comedy, Crime]|1998|     Very Bad Things|
|    157|     1|   5.0|20000730|       [Comedy, War]|1995|      Canadian Bacon|
|   2395|     1|   5.0|20000730|     [Comedy, Drama]|1998|            Rushmore|
|    231|     1|   5.0|20000730| [Adventure, Comedy]|1994|Dumb & Dumber (Du...|
|   2470|     1|   5.0|20000730| [Adventure, Comedy]|1986|    Crocodile Dundee|
|    608|     1|   5.0|20000730|[Comedy, Crime, D...|1996|               Fargo|
|   2502|     1|   5.0|20000730|     [Comedy, Crime]|1999|        Office Space|
|   1073|     1|   5.0|20000730|[Childre