In [0]:
#import findspark
#findspark.init('/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains



#spark = SparkSession.builder.appName("movielens").getOrCreate()

# Consultas sobre Movielens

## Schema de Movielens

![Schema](movielens_schema.png)

## ¿Cuál es la distribución de las calificaciones de las películas?

Proceso:

1. Cargar la tabla ratings
1. Cargar la tabla movies
1. Unir ambas tablas
1. Agrupar por calificación
1. Contar las películas


### Cargar las tablas

In [0]:
# Ratings table: explicit schema for ratings.csv.
ratings_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DecimalType(precision=2, scale=1), True)
    .add("timestamp", LongType(), True)
)

# Read the CSV with the schema above, derive a "date" string (yyyyMMdd)
# from the numeric "timestamp" column, then drop the original column.
ratingsDf = (
    spark.read
    .option("header", True)
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("dbfs:/FileStore/tables/ratings.csv")
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table: explicit schema for movies.csv.
movies_schema = StructType(fields=[
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True)
])

moviesDf = spark.read\
    .option("header", True)\
    .schema(movies_schema)\
    .csv("dbfs:/FileStore/tables/movies.csv")

# Single pattern used for both derived columns:
#   group 1 -> title text without the trailing "(digits)" suffix
#   group 2 -> the digits inside that suffix, when the suffix is present
# The suffix is optional and trailing whitespace is tolerated, so the pattern
# still matches titles that lack a "(year)" suffix. regexp_extract returns an
# empty string when a pattern does not match, which previously blanked out
# such titles; here they keep their full text and only "year" becomes null.
# Raw strings avoid Python's invalid-escape-sequence warnings for "\|", "\(" ...
MOVIE_TITLE_PATTERN = r"^(.+?)(?:\s*\(([0-9]+)\))?\s*$"

# Build the final columns in one projection. The column order
# (movieId, genres, year, title) is the same as before this change.
moviesDf = moviesDf.select(
    "movieId",
    # "Adventure|Comedy" -> ["Adventure", "Comedy"]
    split(col("genres"), r"\|").alias("genres"),
    # Empty string (no year found) casts to null.
    regexp_extract(col("title"), MOVIE_TITLE_PATTERN, 2)
        .cast(IntegerType())
        .alias("year"),
    regexp_extract(col("title"), MOVIE_TITLE_PATTERN, 1).alias("title"),
)

### Unir ambas tablas

In [0]:
# Inner join on movieId: one row per rating, enriched with the movie columns.
movie_ratingsDF = ratingsDf.join(
    moviesDf,
    on="movieId",
    how="inner",
)
movie_ratingsDF.show(n=3)

+-------+------+------+--------+--------------------+----+----------------+
|movieId|userId|rating|    date|              genres|year|           title|
+-------+------+------+--------+--------------------+----+----------------+
|      1|     1|   4.0|20000730|[Adventure, Anima...|1995|       Toy Story|
|      3|     1|   4.0|20000730|   [Comedy, Romance]|1995|Grumpier Old Men|
|      6|     1|   4.0|20000730|[Action, Crime, T...|1995|            Heat|
+-------+------+------+--------+--------------------+----+----------------+
only showing top 3 rows



### Mostrar la tabla

In [0]:
# Preview the first two rows of the joined table.
movie_ratingsDF.show(n=2)

Obtener todas las películas con una calificación superior a 4.

In [0]:
# Expose the joined ratings/movies DataFrame to Spark SQL as the view "movies".
movie_ratingsDF.createOrReplaceTempView("movies")

# Keep only the rows whose rating is strictly greater than 4.
high_rated_movies = spark.sql(
    "SELECT * FROM movies WHERE rating > 4"
)
high_rated_movies.show(n=5)

+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|     47|     1|   5.0|20000730| [Mystery, Thriller]|1995|Seven (a.k.a. Se7en)|
|     50|     1|   5.0|20000730|[Crime, Mystery, ...|1995| Usual Suspects, The|
|    101|     1|   5.0|20000730|[Adventure, Comed...|1996|       Bottle Rocket|
|    151|     1|   5.0|20000730|[Action, Drama, R...|1995|             Rob Roy|
|    157|     1|   5.0|20000730|       [Comedy, War]|1995|      Canadian Bacon|
+-------+------+------+--------+--------------------+----+--------------------+
only showing top 5 rows



Calcular la calificación promedio por año de la película (columna `year`, extraída del título).

In [0]:
# Average rating per value of the "year" column of the "movies" view,
# most recent year first.
avg_ratings_by_year = spark.sql(
    "SELECT year, AVG(rating) as avg_rating FROM movies GROUP BY year ORDER BY year DESC"
)
avg_ratings_by_year.show(n=30)

+----+----------+
|year|avg_rating|
+----+----------+
|2018|   3.48352|
|2017|   3.57809|
|2016|   3.38726|
|2015|   3.41039|
|2014|   3.51214|
|2013|   3.45712|
|2012|   3.51155|
|2011|   3.46175|
|2010|   3.56910|
|2009|   3.56897|
|2008|   3.52885|
|2007|   3.52180|
|2006|   3.49845|
|2005|   3.35989|
|2004|   3.50723|
|2003|   3.46232|
|2002|   3.41429|
|2001|   3.48224|
|2000|   3.39890|
|1999|   3.49923|
|1998|   3.42759|
|1997|   3.34724|
|1996|   3.33533|
|1995|   3.44359|
|1994|   3.50094|
|1993|   3.45215|
|1992|   3.35494|
|1991|   3.55226|
|1990|   3.42731|
|1989|   3.46306|
+----+----------+
only showing top 30 rows



Convertir la fecha de string a tipo fecha (`date`) y extraer el año.

In [0]:
from pyspark.sql.functions import col, year, to_date

# Parse the yyyyMMdd string in "date" into a date column ("date2"),
# then extract its year into "year2".
movie_ratingsDF = (
    movie_ratingsDF
    .withColumn("date2", to_date(col("date"), "yyyyMMdd"))
    .withColumn("year2", year(col("date2")))
)
movie_ratingsDF.show(n=20)


+-------+------+------+----------+--------------------+----+--------------------+----------+-----+
|movieId|userId|rating|      date|              genres|year|               title|     date2|year2|
+-------+------+------+----------+--------------------+----+--------------------+----------+-----+
|      1|     1|   4.0|2000-07-30|[Adventure, Anima...|1995|           Toy Story|2000-07-30| 2000|
|      3|     1|   4.0|2000-07-30|   [Comedy, Romance]|1995|    Grumpier Old Men|2000-07-30| 2000|
|      6|     1|   4.0|2000-07-30|[Action, Crime, T...|1995|                Heat|2000-07-30| 2000|
|     47|     1|   5.0|2000-07-30| [Mystery, Thriller]|1995|Seven (a.k.a. Se7en)|2000-07-30| 2000|
|     50|     1|   5.0|2000-07-30|[Crime, Mystery, ...|1995| Usual Suspects, The|2000-07-30| 2000|
|     70|     1|   3.0|2000-07-30|[Action, Comedy, ...|1996| From Dusk Till Dawn|2000-07-30| 2000|
|    101|     1|   5.0|2000-07-30|[Adventure, Comed...|1996|       Bottle Rocket|2000-07-30| 2000|
|    110| 

Filtrar películas del género 'Comedy' y ordenarlas por calificación descendente.

In [0]:
# Rows of the "movies" view whose genres array contains 'Comedy',
# ordered from the highest rating to the lowest.
comedy_movies = spark.sql(
    """
  SELECT * 
  FROM movies 
  WHERE array_contains(genres, 'Comedy') 
  ORDER BY rating DESC
"""
)
comedy_movies.show(n=20)


+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|   2141|     1|   5.0|20000730|[Adventure, Anima...|1986|   American Tail, An|
|  60756|     2|   5.0|20151024|            [Comedy]|2008|       Step Brothers|
|   2387|     1|   5.0|20000730|     [Comedy, Crime]|1998|     Very Bad Things|
|    157|     1|   5.0|20000730|       [Comedy, War]|1995|      Canadian Bacon|
|   2395|     1|   5.0|20000730|     [Comedy, Drama]|1998|            Rushmore|
|    231|     1|   5.0|20000730| [Adventure, Comedy]|1994|Dumb & Dumber (Du...|
|   2470|     1|   5.0|20000730| [Adventure, Comedy]|1986|    Crocodile Dundee|
|    608|     1|   5.0|20000730|[Comedy, Crime, D...|1996|               Fargo|
|   2502|     1|   5.0|20000730|     [Comedy, Crime]|1999|        Office Space|
|   1073|     1|   5.0|20000730|[Childre