In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession for this notebook under the application name "S5";
# getOrCreate() reuses an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("S5")
    .getOrCreate()
)

In [3]:
# Load the movies CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
movie_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/movies.csv")
)

# Load the ratings CSV with the same reader settings.
rating_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/ratings.csv")
)

# Preview both frames to check that they were parsed as expected.
movie_df.show()
rating_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [4]:
# Expose both DataFrames to Spark SQL under the table names "ratings" and
# "movies". createOrReplaceTempView overwrites any existing view of the same
# name, so this cell can be re-run safely.
rating_df.createOrReplaceTempView("ratings")
movie_df.createOrReplaceTempView("movies")

In [8]:
# Ranking parameters. Both happened to be the literal 10 before; naming them
# keeps the two thresholds from being confused or changed together by mistake.
MIN_RATINGS = 10  # a movie needs at least this many ratings to be ranked
TOP_N = 10        # number of movies to return

# Top-rated movies:
#   1. average the ratings per movieId,
#   2. keep only movies with at least MIN_RATINGS ratings,
#   3. join to the movies view to pick up the title,
#   4. sort by average rating (highest first) and keep the first TOP_N rows.
# title and movieId are added as tie-breakers in ORDER BY: without them, movies
# with equal averages could come back in a different order (or a different
# subset could survive the LIMIT) from one run to the next.
# The interpolated values are integer constants defined above, not user input.
query = f"""
  SELECT m.title, r.avg_rating
  FROM (
      SELECT movieId, AVG(rating) AS avg_rating
      FROM ratings
      GROUP BY movieId
      HAVING COUNT(rating) >= {MIN_RATINGS}
  ) r
  JOIN
  movies m
  ON m.movieId = r.movieId
  ORDER BY r.avg_rating DESC, m.title ASC, m.movieId ASC
  LIMIT {TOP_N}
"""

result = spark.sql(query)
result.show()

+--------------------+-----------------+
|               title|       avg_rating|
+--------------------+-----------------+
|Secrets & Lies (1...|4.590909090909091|
|Guess Who's Comin...|4.545454545454546|
|Paths of Glory (1...|4.541666666666667|
|Streetcar Named D...|            4.475|
|Celebration, The ...|4.458333333333333|
|          Ran (1985)|4.433333333333334|
|Shawshank Redempt...|4.429022082018927|
|His Girl Friday (...|4.392857142857143|
|All Quiet on the ...|             4.35|
| Hustler, The (1961)|4.333333333333333|
+--------------------+-----------------+



In [9]:
# Shut down the Spark session now that the results have been displayed.
# Any cell that uses `spark` after this point needs a new session first.
spark.stop()