In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"

In [2]:
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.Collecting pyspark
  Using cached pyspark-4.0.1.tar.gz (434.2 MB)
Collecting py4j==0.10.9.9
  Using cached py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Using legacy 'setup.py install' for pyspark, since package 'wheel' is not installed.
Installing collected packages: py4j, pyspark
    Running setup.py install for pyspark: started
    Running setup.py install for pyspark: still running...
    Running setup.py install for pyspark: finished with status 'done'
Successfully installed py4j-0.10.9.9 pyspark-4.0.1



You should consider upgrading via the 'c:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Cấu hình Spark
spark_conf = (
    SparkConf()
    .setAppName("MovieRatingSpark")
    .setMaster("local[*]")
)

# Tạo SparkContext
spark_context = SparkContext.getOrCreate(conf=spark_conf)

# Tạo SparkSession
spark_session = (
    SparkSession
    .builder
    .appName("MovieRatingSpark")
    .getOrCreate()
)

if spark_session:
    print("SparkSession created successfully.")
else:
    print("Error: Failed to create SparkSession.")

SparkSession created successfully.


In [4]:
import os

rs_path = "./resource/"

# Đọc file movies và rating{1,2} với RDD
movies_rdd = spark_context.textFile(os.path.join(rs_path, "movies.txt"))
ratings_1_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_1.txt"))
ratings_2_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_2.txt"))

# Show data
if movies_rdd and ratings_1_rdd and ratings_2_rdd:
    print("\nmovies.txt:")
    for line in movies_rdd.take(5):
        print(line)
    print("\nratings_1.txt:")
    for line in ratings_1_rdd.take(5):
        print(line)
    print("\nratings_2.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
else:
    print("Error: Failed to load RDD files.")


movies.txt:
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

ratings_1.txt:
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

ratings_2.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400


In [5]:
# Parse movies
def parse_movie(line):
    parts = line.split(',', 2)
    movie_id = int(parts[0])
    title = parts[1]
    return (movie_id, title)
movies_parse = movies_rdd.map(parse_movie)

print("After parsing:")
for col in movies_parse.take(5):
    print(f"Movie: {col[0]} {col[1]}")

After parsing:
Movie: 1001 The Godfather (1972)
Movie: 1002 The Shawshank Redemption (1994)
Movie: 1003 Schindler's List (1993)
Movie: 1004 Raging Bull (1980)
Movie: 1005 Casablanca (1942)


In [6]:
# Parse ratings
def parse_rating(line):
    parts = line.split(',')
    movie_id = int(parts[1])
    rating = float(parts[2])
    return (movie_id, rating)

ratings_1_parse = ratings_1_rdd.map(parse_rating)
ratings_2_parse = ratings_2_rdd.map(parse_rating)

#show rating 1
print("Rating 1:")
for col in ratings_1_parse.take(5):
    print(f"Movie: {col[0]}, rating: {col[1]}")

Rating 1:
Movie: 1020, rating: 4.5
Movie: 1015, rating: 3.5
Movie: 1030, rating: 4.0
Movie: 1047, rating: 3.0
Movie: 1012, rating: 4.5


In [7]:
# Merge ratings 1 && 2
all_ratings = ratings_1_parse.union(ratings_2_parse)
print(f"After merging: {all_ratings.count()}")

# all_ratings: (movie_id, rating)
# x[0] = movie_id
# x[1] = rating
rating_pairs = all_ratings.map(
    lambda x: (x[0], (x[1], 1))  # => (movie_id, (rating, 1))
)

# a = (sum, count)
# b = (sum, count)
rating_stats = rating_pairs.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

# x = (sumRatings, countRatings)
# x[0] = sumRatings
# x[1] = countRatings
rating_avg = rating_stats.mapValues(
    lambda x: (x[0] / x[1], x[1])  # => (avgRating, countRatings)
)

# movies_parse: (movie_id, title)
# rating_avg : (movie_id, (avg, count))
# after joining = > (movie_id, (title, (avgRating, countRatings)))
rating_avg_title = movies_parse.join(rating_avg)


# Kết quả
for _, (title, (avg, cnt)) in rating_avg_title.take(10):
    print(f"{title}: AverageRating:{avg:.2f} (TotalRatings:{cnt})")

# Lọc những phim có ít nhất 5 lượt đánh giá
movies_5_ratings = rating_avg_title.filter(
    lambda x: x[1][1][1] >= 5  # x = (movieId, (title, (avg, count)))
)

# Tìm phim có điểm trung bình cao nhất
top_avg_movie = movies_5_ratings.max(
    key=lambda x: x[1][1][0]  # avg rating
)
#print
_, (title, (avg, total)) = top_avg_movie

print(f"{title} is the highest rated movie with an average rating of {avg:.2f} among movies with at least {total} ratings.")


After merging: 184
E.T. the Extra-Terrestrial (1982): AverageRating:3.67 (TotalRatings:18)
Mad Max: Fury Road (2015): AverageRating:3.47 (TotalRatings:18)
Sunset Boulevard (1950): AverageRating:4.36 (TotalRatings:7)
The Lord of the Rings: The Return of the King (2003): AverageRating:3.82 (TotalRatings:11)
Lawrence of Arabia (1962): AverageRating:3.44 (TotalRatings:18)
Fight Club (1999): AverageRating:3.50 (TotalRatings:7)
Gladiator (2000): AverageRating:3.61 (TotalRatings:18)
The Social Network (2010): AverageRating:3.86 (TotalRatings:7)
Psycho (1960): AverageRating:4.00 (TotalRatings:2)
The Silence of the Lambs (1991): AverageRating:3.14 (TotalRatings:7)
Sunset Boulevard (1950) is the highest rated movie with an average rating of 4.36 among movies with at least 7 ratings.


In [8]:
# Clear
spark_context.stop()
spark_session.stop()