In [5]:
from pyspark.sql import SparkSession

In [6]:
# Obtain a SparkSession for this notebook under the application name "movies";
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("movies")
    .getOrCreate()
)

In [9]:
# Paths of the two CSV inputs.
movies_file = "/content/movies.csv"
ratings_file = "/content/ratings.csv"

# Reader settings shared by both files: the first line of each file is read
# as the header, and Spark infers the column types from the data.
csv_options = {"header": True, "inferSchema": True}

movies_df = spark.read.csv(movies_file, **csv_options)
ratings_df = spark.read.csv(ratings_file, **csv_options)

# Print a preview of each DataFrame.
movies_df.show()
ratings_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [10]:
# Register each DataFrame as a temporary view so it can be referenced by name
# from SQL passed to spark.sql().
for view_name, frame in (("movies", movies_df), ("ratings", ratings_df)):
    frame.createOrReplaceTempView(view_name)

In [11]:
# Find the TOP_N highest-rated movies among those with at least MIN_RATINGS ratings.
#
# The inner query aggregates the `ratings` view per movie (mean rating and
# number of ratings) and drops movies with too few ratings; the outer query
# joins the `movies` view to attach titles.
MIN_RATINGS = 10  # minimum number of ratings a movie needs to qualify
TOP_N = 10        # number of movies to report

query = f"""
    SELECT m.title, r.avg_rating, r.count_ratings
    FROM (
      SELECT movieId, AVG(rating) AS avg_rating, COUNT(rating) AS count_ratings
      FROM ratings
      GROUP BY movieId
      HAVING count_ratings >= {MIN_RATINGS}
    ) r
    JOIN
    movies m ON m.movieId = r.movieId
    -- Tie-breakers make the ranking (and the LIMIT cut-off) reproducible when
    -- several movies share the same average: prefer more ratings, then lower id.
    ORDER BY r.avg_rating DESC, r.count_ratings DESC, m.movieId ASC
    LIMIT {TOP_N}
"""

result = spark.sql(query)
# truncate=False prints full titles instead of cutting them at 20 characters.
result.show(truncate=False)

+--------------------+-----------------+-------------+
|               title|       avg_rating|count_ratings|
+--------------------+-----------------+-------------+
|Secrets & Lies (1...|4.590909090909091|           11|
|Guess Who's Comin...|4.545454545454546|           11|
|Paths of Glory (1...|4.541666666666667|           12|
|Streetcar Named D...|            4.475|           20|
|Celebration, The ...|4.458333333333333|           12|
|          Ran (1985)|4.433333333333334|           15|
|Shawshank Redempt...|4.429022082018927|          317|
|His Girl Friday (...|4.392857142857143|           14|
|All Quiet on the ...|             4.35|           10|
| Hustler, The (1961)|4.333333333333333|           18|
+--------------------+-----------------+-------------+



In [12]:
# Stop the Spark session created earlier in this notebook now that the
# analysis is finished. NOTE(review): cells that use `spark`, `movies_df`,
# `ratings_df` or `result` presumably cannot be re-run after this without
# re-creating the session first — confirm before reordering cells.
spark.stop()