In [2]:
from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("MovieRatingPrediction").getOrCreate()


movie_df= spark.read.csv('movie.csv', inferSchema =True, header=True)



movie_df.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- overview: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- tagline: string (nullable = true)



In [4]:
from pyspark.sql.functions import col

top_rated_movies = movie_df.filter(col("vote_count") > 50) \
                           .orderBy(col("vote_average").desc()) \
                           .select("title", "vote_average", "vote_count") \
                           .limit(10)

top_rated_movies.show()



+--------------------+------------+----------+
|               title|vote_average|vote_count|
+--------------------+------------+----------+
|BTS: Permission t...|         9.1|       143|
|Spider-Man: Acros...|         8.8|      1160|
|Folklore: The Lon...|         8.7|       116|
|       The Godfather|         8.7|     18076|
|The Shawshank Red...|         8.7|     23935|
|The Godfather Par...|         8.6|     10913|
|    Schindler's List|         8.6|     14153|
|Dilwale Dulhania ...|         8.6|      4147|
|Franco Escamilla:...|         8.6|        77|
|            Rubius X|         8.6|       136|
+--------------------+------------+----------+



In [12]:
from pyspark.sql.functions import avg

avg_rating_per_genre = movie_df.groupBy("genres") \
                               .agg(avg("vote_average").alias("average_rating")) \
                               .orderBy("average_rating", ascending=False)  # Order by average rating

avg_rating_per_genre.show()




+--------------------+--------------+
|              genres|average_rating|
+--------------------+--------------+
|['TOHO', 'Kyokuic...|         100.0|
|['Romance', 'Anim...|           8.5|
|['Romance', 'Anim...|           8.5|
|['Fantasy', 'Dram...|           8.5|
|['Animation', 'Ac...|           8.3|
|['Animation', 'Th...|           8.3|
|['TV Movie', 'Ani...|           8.3|
|['Family', 'Anima...|           8.3|
|['Animation', 'Ac...|           8.3|
|['Adventure', 'Fa...|           8.3|
|['Animation', 'Fa...|           8.3|
|['Family', 'Roman...|           8.3|
|['Animation', 'Fa...|           8.3|
|['Drama', 'Fantas...|           8.2|
|['Animation', 'Fa...|           8.2|
|['Comedy', 'Anima...|           8.2|
|['Animation', 'Co...|           8.2|
|['Fantasy', 'Acti...|           8.2|
|['Animation', 'Fa...|           8.2|
|['Animation', 'TV...|           8.2|
+--------------------+--------------+
only showing top 20 rows



In [14]:
most_popular_movies = movie_df.orderBy(col("popularity").desc()) \
                              .select("title", "popularity") \
                              .limit(10)

most_popular_movies.show()




+--------------------+----------+
|               title|popularity|
+--------------------+----------+
|               Roger| 4000000.0|
|              Fast X|  8363.473|
|John Wick: Chapter 4|  4210.313|
|The Super Mario B...|  3394.458|
|Spider-Man: Acros...|  2859.047|
|            Hypnotic|  2654.854|
|Transformers: Ris...|  2453.807|
|     The Black Demon|    1777.2|
|  The Little Mermaid|   1448.64|
|Avatar: The Way o...|  1344.884|
+--------------------+----------+



In [19]:
highest_vote_count_movies = movie_df.orderBy(col("vote_count").desc()) \
                                    .select("title", "vote_count") \
                                    .limit(10)

highest_vote_count_movies.show()




+--------------------+----------+
|               title|vote_count|
+--------------------+----------+
|           Labor Day|       999|
|A Tale of Two Sis...|       999|
|Final Fantasy VII...|       999|
|  Jackass Number Two|       999|
|    Now You See Me 2|      9982|
|         Top Secret!|       997|
|           The Saint|       997|
|            Die Hard|      9961|
|            Bandidas|       996|
|          Four Lions|       996|
+--------------------+----------+



In [22]:
top_rated_movies_with_popularity = movie_df.filter(col("vote_count") > 50) \
                                           .orderBy(col("vote_average").desc()) \
                                           .select("title", "popularity") \
                                           .limit(10)

top_rated_movies_with_popularity.show()



+--------------------+----------+
|               title|popularity|
+--------------------+----------+
|BTS: Permission t...|    16.289|
|Spider-Man: Acros...|  2859.047|
|Folklore: The Lon...|    12.354|
|       The Godfather|   110.377|
|The Shawshank Red...|    69.056|
|The Godfather Par...|     54.99|
|    Schindler's List|    43.117|
|Dilwale Dulhania ...|    26.908|
|Franco Escamilla:...|    16.977|
|            Rubius X|    14.658|
+--------------------+----------+



In [25]:
from pyspark.sql.functions import col


valid_languages = movie_df.filter(col("original_language").rlike("^[a-zA-Z]+$"))


avg_rating_per_language = valid_languages.groupBy("original_language") \
                                         .agg(avg("vote_average").alias("average_rating")) \
                                         .orderBy("average_rating", ascending=False)

avg_rating_per_language.show()





+-----------------+------------------+
|original_language|    average_rating|
+-----------------+------------------+
|           Hebrew|               7.7|
|          Persian|               7.7|
|          Kannada|               7.7|
|            Irish|               7.6|
|          Bosnian|               7.5|
|         Dzongkha|               7.4|
|               sh|               7.3|
|        Malayalam|               7.2|
|         Romanian|               7.0|
|          Swedish| 6.959259259259259|
|           Danish| 6.953846153846153|
|          Turkish| 6.800000000000002|
|         Japanese|6.7049773755656155|
|            Khmer|               6.6|
|          Serbian|               6.6|
|       Macedonian|               6.6|
|        Hungarian|               6.6|
|        Ukrainian|             6.575|
|           German|6.5034090909090905|
|          Catalan|               6.5|
+-----------------+------------------+
only showing top 20 rows



In [None]:
import os


graphs_folder_path = os.path.expanduser('~/Documents/graphs')


if not os.path.exists(graphs_folder_path):
    os.makedirs(graphs_folder_path)


top_rated_movies.write.csv(os.path.join(graphs_folder_path, "top_rated_movies.csv"), header=True)


avg_rating_per_genre.write.csv(os.path.join(graphs_folder_path, "avg_rating_per_genre.csv"), header=True)


most_popular_movies.write.csv(os.path.join(graphs_folder_path, "most_popular_movies.csv"), header=True)


popularity_rating.write.csv(os.path.join(graphs_folder_path, "popularity_rating.csv"), header=True)


highest_vote_count_movies.write.csv(os.path.join(graphs_folder_path, "highest_vote_count_movies.csv"), header=True)


top_rated_movies_with_popularity.write.csv(os.path.join(graphs_folder_path, "top_rated_movies_with_popularity.csv"), header=True)


avg_rating_per_language.write.csv(os.path.join(graphs_folder_path, "avg_rating_per_language.csv"), header=True)

