In [1]:
!pip install pyspark==3.5.0 findspark

Collecting pyspark==3.5.0
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=a0cd1bad76dd0cbe4842371813816d843e11232338a698a95def385d83d5a64e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: findspark, pyspark
Successfully installed findspark-2.0.1 pyspark-3.5.0


In [None]:
import findspark
findspark.init()

In [3]:
movies_data = [
    (1, "The Shawshank Redemption (1994)", "Crime|Drama"),
    (2, "The Godfather (1972)", "Crime|Drama"),
    (3, "The Dark Knight (2008)", "Action|Crime|Drama"),
    (4, "The Lord of the Rings: The Return of the King (2003)", "Adventure|Drama|Fantasy"),
    (5, "Pulp Fiction (1994)", "Crime|Thriller"),
    (6, "Schindler's List (1993)", "Biography|Drama|History"),
    (7, "Inception (2010)", "Action|Adventure|Sci-Fi"),
    (8, "Spirited Away (2001)", "Animation|Adventure|Family"),
    (9, "Interstellar (2014)", "Adventure|Drama|Sci-Fi"),
    (10, "The Search for the Giant Squid (2023)", "Documentary|Nature")
]

ratings_data = [
    (101, 1, 5.0, 1475442332),  # User 101 loved 'The Shawshank Redemption'
    (22, 2, 3.0, 1662234457),   # User 22 wasn't a fan of 'The Godfather'
    (101, 3, 4.5, 1305696483),  # User 101 also enjoyed 'The Dark Knight'
    (47, 6, 4.0, 1440115102),   # User 47 found 'Schindler's List' impactful
    (158, 5, 2.5, 1525717880),  # User 158 disliked 'Pulp Fiction'
    (22, 10, 4.0, 1661022219),  # User 22 enjoyed the documentary
    (85, 3, 4.5, 1403772446),   # Another high rating for 'The Dark Knight'
    (10, 7, 3.0, 1602650521),   # User 10 found 'Inception' just okay
    (10, 1, 3.5, 1475442891),  # User 10 had mixed feelings about 'Shawshank...'
    (192, 8, 5.0, 1292342928)   # User 192 was captivated by 'Spirited Away'
 ]

In [8]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *

spark = SparkSession.builder.appName("MovieLens").getOrCreate()

# Movies Schema
schema_movies = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True)
])

# Create DataFrames
movies_df = spark.createDataFrame(movies_data, schema_movies)
ratings_df = spark.createDataFrame(ratings_data, ["userId", "movieId", "rating", "timestamp"])





In [14]:
from pyspark.sql import functions as F

# Most popular movie (by number of ratings)
popular_movie = ratings_df.groupBy("movieId").count().orderBy(F.col("count").desc()).limit(1)
popular_movie.show()


# Average rating per genre
avg_rating_by_genre = movies_df.join(ratings_df, movies_df['movieId'] == ratings_df['movieId'], 'inner').groupBy("genres").avg("rating")
avg_rating_by_genre.show(truncate=False)



+-------+-----+
|movieId|count|
+-------+-----+
|      1|    2|
+-------+-----+

+--------------------------+------------------+
|genres                    |avg(rating)       |
+--------------------------+------------------+
|Crime|Thriller            |2.5               |
|Crime|Drama               |3.8333333333333335|
|Documentary|Nature        |4.0               |
|Action|Adventure|Sci-Fi   |3.0               |
|Biography|Drama|History   |4.0               |
|Action|Crime|Drama        |4.5               |
|Animation|Adventure|Family|5.0               |
+--------------------------+------------------+

