In [0]:
#import findspark
#findspark.init('/spark/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping



#spark = SparkSession.builder.appName("movielens").getOrCreate()

# Consultas sobre Movielens

## Schema de Movielens

![Schema](movielens_schema.png)

## ¿Cuál es la distribución de las calificaciones de las películas?

Proceso:

1. Cargar la tabla ratings
1. Cargar la tabla movies
1. Unir ambas tablas
1. Agrupar por calificación
1. Contar las calificaciones de cada grupo


### Cargar las tablas

In [0]:
# Ratings table: explicit schema so the CSV columns are read with these types.
ratings_schema = StructType(
    fields=[
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", DecimalType(precision=2, scale=1), True),
        StructField("timestamp", LongType(), True),
    ]
)

# NOTE(review): the schema declares no DateType column, so the "dateFormat"
# option looks like it has no effect on this read — confirm before removing.
ratingsCsvReader = (
    spark.read
    .option("header", True)
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
)

# Derive a "date" string (yyyyMMdd) from the Unix "timestamp" column, then
# drop the raw timestamp so only the formatted value remains.
ratingsDf = (
    ratingsCsvReader
    .csv("dbfs:/FileStore/tables/ratings.csv")
    .withColumn("date", from_unixtime(col("timestamp"), "yyyyMMdd"))
    .drop("timestamp")
)

# Movies table: explicit schema for the three raw CSV columns.
movies_schema = StructType(fields=[
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True)
])

# Raw read, kept under its own name so the derived frame below is built from
# an untouched input and the cell stays idempotent.
moviesRawDf = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("dbfs:/FileStore/tables/movies.csv")
)

# Build the final movies frame in a single projection. Resulting columns, in
# order: movieId, genres (array of strings), year (int), title (year removed).
#
# The patterns are raw strings: "\|" and "\(" are not valid Python escape
# sequences, so writing them in ordinary string literals triggers an
# invalid-escape warning (SyntaxWarning on Python 3.12+).
#
# NOTE(review): titles that do not end exactly in "(<digits>)" do not match
# either pattern; regexp_extract then yields an empty string, so such rows
# presumably end up with a null year and an empty title — confirm on the data.
moviesDf = moviesRawDf.select(
    "movieId",
    # "genres" is a pipe-separated list; the pipe must be escaped in the regex.
    split(col("genres"), r"\|").alias("genres"),
    # Capture the digits inside the trailing parentheses as the release year.
    regexp_extract(col("title"), r"^.+\(([0-9]+)\)$", 1)
        .cast(IntegerType())
        .alias("year"),
    # Capture everything before the trailing " (<digits>)" as the clean title.
    regexp_extract(col("title"), r"^(.+?) \([0-9]+\)$", 1).alias("title"),
)

### Unir ambas tablas

In [0]:
# Inner join on the shared "movieId" key: every rating row is paired with the
# genres, year and title of the movie it refers to.
movie_ratingsDF = ratingsDf.join(other=moviesDf, on=["movieId"], how="inner")
movie_ratingsDF.show()

+-------+------+------+--------+--------------------+----+--------------------+
|movieId|userId|rating|    date|              genres|year|               title|
+-------+------+------+--------+--------------------+----+--------------------+
|      1|     1|   4.0|20000730|[Adventure, Anima...|1995|           Toy Story|
|      3|     1|   4.0|20000730|   [Comedy, Romance]|1995|    Grumpier Old Men|
|      6|     1|   4.0|20000730|[Action, Crime, T...|1995|                Heat|
|     47|     1|   5.0|20000730| [Mystery, Thriller]|1995|Seven (a.k.a. Se7en)|
|     50|     1|   5.0|20000730|[Crime, Mystery, ...|1995| Usual Suspects, The|
|     70|     1|   3.0|20000730|[Action, Comedy, ...|1996| From Dusk Till Dawn|
|    101|     1|   5.0|20000730|[Adventure, Comed...|1996|       Bottle Rocket|
|    110|     1|   4.0|20000730|[Action, Drama, War]|1995|          Braveheart|
|    151|     1|   5.0|20000730|[Action, Drama, R...|1995|             Rob Roy|
|    157|     1|   5.0|20000730|       [

### Agrupar por rating

In [0]:
# Group the joined rows by rating value. No aggregation is applied here; the
# grouped object is reused by the counting cells that follow.
# (The duplicate `grouping` import was dropped: it is already imported in the
# notebook's import cell and is not used by this cell.)
movie_ratingsDF_grouped_by_rating = movie_ratingsDF.groupBy("rating")

### Contar las calificaciones

In [0]:
# Count the rows in each rating group, then pull the result back to the
# driver as a list of Row objects (the cell's displayed value).
rating_row_counts = movie_ratingsDF_grouped_by_rating.agg({"*": "count"})
rating_row_counts.collect()

Out[5]: [Row(rating=Decimal('5.0'), count(1)=13211),
 Row(rating=Decimal('2.5'), count(1)=5550),
 Row(rating=Decimal('0.5'), count(1)=1370),
 Row(rating=Decimal('1.0'), count(1)=2811),
 Row(rating=Decimal('3.5'), count(1)=13136),
 Row(rating=Decimal('1.5'), count(1)=1791),
 Row(rating=Decimal('3.0'), count(1)=20047),
 Row(rating=Decimal('2.0'), count(1)=7551),
 Row(rating=Decimal('4.0'), count(1)=26818),
 Row(rating=Decimal('4.5'), count(1)=8551)]

In [0]:
# Alternative: the count() method of the grouped data yields the per-rating
# tally as a DataFrame with a "count" column.
ratingCountsDf = movie_ratingsDF_grouped_by_rating.count()
ratingCountsDf.show()

+------+-----+
|rating|count|
+------+-----+
|   5.0|13211|
|   2.5| 5550|
|   0.5| 1370|
|   1.0| 2811|
|   3.5|13136|
|   1.5| 1791|
|   3.0|20047|
|   2.0| 7551|
|   4.0|26818|
|   4.5| 8551|
+------+-----+



Ordenamos el resultado por calificación, de mayor a menor, para que sea más fácil de leer:

In [0]:
# Same per-rating tally, sorted from the highest rating down to the lowest.
ratingCountsSortedDf = movie_ratingsDF_grouped_by_rating.count().orderBy(col("rating").desc())
ratingCountsSortedDf.show()

+------+-----+
|rating|count|
+------+-----+
|   5.0|13211|
|   4.5| 8551|
|   4.0|26818|
|   3.5|13136|
|   3.0|20047|
|   2.5| 5550|
|   2.0| 7551|
|   1.5| 1791|
|   1.0| 2811|
|   0.5| 1370|
+------+-----+



## ¿Cuál es la película con la mayor cantidad de reseñas? 

In [0]:
# Number of ratings per movie, most-rated first.
#
# The grouping key includes "movieId", not just "title": the release year was
# stripped from the title when the movies table was loaded, so different
# movies can share a title (and unparsed titles may be empty). Grouping on the
# title alone would add their counts together.
# The projection keeps the original output columns ("title", "count").
movie_ratingsDF_grouped_by_movieID = (
    movie_ratingsDF
    .groupBy("movieId", "title")
    .count()
    .select("title", "count")
    .orderBy(desc("count"))
)
movie_ratingsDF_grouped_by_movieID.show(1)


+------------+-----+
|       title|count|
+------------+-----+
|Forrest Gump|  329|
+------------+-----+
only showing top 1 row

