In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# Build the configuration for a local-mode application named
# "MovieRatingsAnalytics" ("local[*]" is the master URL passed to Spark).
conf = SparkConf()
conf.setAppName("MovieRatingsAnalytics")
conf.setMaster("local[*]")

# getOrCreate returns an already-running context when there is one,
# otherwise it starts a new context from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

# Session handle (DataFrame/SQL entry point) for the same application.
spark = (
    SparkSession.builder
    .appName("MovieRatingsAnalytics")
    .getOrCreate()
)

print("Init successfully!")

25/11/25 21:02:29 WARN Utils: Your hostname, tienloc-laptop resolves to a loopback address: 127.0.1.1; using 192.168.31.171 instead (on interface wlp0s20f3)
25/11/25 21:02:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 21:02:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Init successfully!


In [None]:
import os

# Directory that holds movies.txt, ratings_1.txt and ratings_2.txt.
# It can be overridden with the MOVIE_DATA_PATH environment variable so the
# notebook is not tied to a single machine; the original location stays the
# default, so behaviour is unchanged when the variable is not set.
data_path = os.environ.get(
    "MOVIE_DATA_PATH", "file:///home/tienloc/lab2bigdata/data/"
)
# File names are appended by plain string concatenation below, so the base
# location must end with a slash.
if not data_path.endswith("/"):
    data_path += "/"

# Read movies.txt (one text line per RDD element)
movies_rdd = sc.textFile(data_path + "movies.txt")
print(f"Number of film: {movies_rdd.count()}")

# Read ratings_1.txt and ratings_2.txt
ratings_1_rdd = sc.textFile(data_path + "ratings_1.txt")
ratings_2_rdd = sc.textFile(data_path + "ratings_2.txt")

print(f"Number of rating from file 1: {ratings_1_rdd.count()}")
print(f"Number of rating from file 2: {ratings_2_rdd.count()}")

# Show a small sample of the raw lines to confirm the file layout.
print("\nData from movies.txt (5 first lines):")
for line in movies_rdd.take(5):
    print(line)

print("\nData from ratings_1.txt (5 first lines):")
for line in ratings_1_rdd.take(5):
    print(line)

Number of film: 50
Number of rating from file 1: 84
Number of rating from file 2: 100

Data from movies.txt (5 first lines):
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

Data from ratings_1.txt (5 first lines):
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400


In [9]:
# Parse movies.txt: MovieID, Title, Genres
def parse_movie(line):
    """Parse one ``MovieID,Title,Genres`` line into ``(movie_id, title)``.

    The id is split off from the left and the genres field from the right,
    so a title that itself contains commas is returned whole instead of
    being cut at its first comma. This assumes the genres field never
    contains a comma (the sample data separates genres with ``|``).

    Args:
        line: A single text line from movies.txt.

    Returns:
        A ``(movie_id, title)`` tuple with ``movie_id`` as ``int`` and
        ``title`` as ``str``. A line without a genres field yields the full
        remainder as the title.

    Raises:
        ValueError: If the id field is not an integer.
        IndexError: If the line contains no comma at all.
    """
    head = line.split(',', 1)
    movie_id = int(head[0])
    # Indexing (rather than unpacking) keeps IndexError for comma-less lines.
    remainder = head[1]
    # Drop the trailing genres field, if any; everything before it is title.
    title = remainder.rsplit(',', 1)[0]
    return (movie_id, title)

# Turn every raw line into a (movie_id, title) pair.
movies_parsed = movies_rdd.map(parse_movie)

print("Movies parsed (5 records):")
for movie_id, title in movies_parsed.take(5):
    print(f"MovieID: {movie_id}, Title: {title}")

# Collect the pairs to the driver as a {movie_id: title} lookup table.
movies_dict = movies_parsed.collectAsMap()
print(f"\nNumber of film in dictionary: {len(movies_dict)}")

Movies parsed (5 records):
MovieID: 1001, Title: The Godfather (1972)
MovieID: 1002, Title: The Shawshank Redemption (1994)
MovieID: 1003, Title: Schindler's List (1993)
MovieID: 1004, Title: Raging Bull (1980)
MovieID: 1005, Title: Casablanca (1942)

Number of film in dictionary: 50


In [10]:
# Parse ratings: UserID, MovieID, Rating, Timestamp
def parse_rating(line):
    """Parse one ``UserID,MovieID,Rating,Timestamp`` line.

    All four fields are converted, so a malformed user id or timestamp
    still raises even though only the movie id and rating are returned.

    Args:
        line: A single comma-separated text line from a ratings file.

    Returns:
        A ``(movie_id, rating)`` tuple: ``int`` movie id, ``float`` rating.

    Raises:
        ValueError: If a field cannot be converted to its numeric type.
        IndexError: If the line has fewer than four fields.
    """
    fields = line.split(',')
    # Converted for validation only; the value is not part of the result.
    _user = int(fields[0])
    movie_key = int(fields[1])
    score = float(fields[2])
    # Converted for validation only; the value is not part of the result.
    _stamp = int(fields[3])
    return (movie_key, score)

# Convert the raw lines of both files into (movie_id, rating) pairs.
ratings_1_parsed = ratings_1_rdd.map(parse_rating)
ratings_2_parsed = ratings_2_rdd.map(parse_rating)

# Preview the first records of each parsed file with one shared loop.
previews = (
    ("Ratings 1 parsed (5 records):", ratings_1_parsed),
    ("\nRatings 2 parsed (5 records):", ratings_2_parsed),
)
for heading, parsed in previews:
    print(heading)
    for movie_id, score in parsed.take(5):
        print(f"MovieID: {movie_id}, Rating: {score}")

# Merge the two ratings RDDs into one
all_ratings = ratings_1_parsed.union(ratings_2_parsed)
print(f"\nTotal ratings from 2 files: {all_ratings.count()}")

Ratings 1 parsed (5 records):
MovieID: 1020, Rating: 4.5
MovieID: 1015, Rating: 3.5
MovieID: 1030, Rating: 4.0
MovieID: 1047, Rating: 3.0
MovieID: 1012, Rating: 4.5

Ratings 2 parsed (5 records):
MovieID: 1012, Rating: 3.5
MovieID: 1039, Rating: 4.0
MovieID: 1043, Rating: 4.5
MovieID: 1020, Rating: 3.0
MovieID: 1050, Rating: 4.0

Total ratings from 2 files: 184


In [11]:
# Per-movie rating sum and rating count.

# Pair every rating with a count of one:
# (movie_id, rating) -> (movie_id, (rating, 1))
ratings_with_count = all_ratings.mapValues(lambda score: (score, 1))

print("Ratings with count (5 records):")
for movie_id, (score, n) in ratings_with_count.take(5):
    print(f"MovieID: {movie_id}, (Rating: {score}, Count: {n})")

# Add the pairs component-wise per movie:
# (movie_id, (sum_ratings, total_count))
movie_stats = ratings_with_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

print(f"\nTotal number of movies with ratings: {movie_stats.count()}")
print("Movie stats (5 records):")
for movie_id, (rating_sum, n) in movie_stats.take(5):
    print(f"MovieID: {movie_id}, Sum: {rating_sum}, Count: {n}")


Ratings with count (5 records):
MovieID: 1020, (Rating: 4.5, Count: 1)
MovieID: 1015, (Rating: 3.5, Count: 1)
MovieID: 1030, (Rating: 4.0, Count: 1)
MovieID: 1047, (Rating: 3.0, Count: 1)
MovieID: 1012, (Rating: 4.5, Count: 1)

Total number of movies with ratings: 14
Movie stats (5 records):
MovieID: 1020, Sum: 66.0, Count: 18
MovieID: 1012, Sum: 8.0, Count: 2
MovieID: 1040, Sum: 65.0, Count: 18
MovieID: 1028, Sum: 24.5, Count: 7
MovieID: 1037, Sum: 70.0, Count: 18


In [12]:
# Compute average rating and add movie title
# (movie_id, (sum_ratings, total_ratings)) -> (movie_id, (average_rating, total_ratings, movie_title))
def calculate_average_and_add_title(record):
    """Convert per-movie totals into an average and attach the title.

    Args:
        record: ``(movie_id, (sum_ratings, count))``.

    Returns:
        ``(movie_id, (average_rating, count, movie_title))``. The title is
        looked up in the module-level ``movies_dict``; an id that is not in
        the dictionary gets the placeholder ``"Unknown Movie <id>"``.
    """
    movie_id, totals = record
    rating_sum, n_ratings = totals
    mean_rating = rating_sum / n_ratings
    if movie_id in movies_dict:
        label = movies_dict[movie_id]
    else:
        label = f"Unknown Movie {movie_id}"
    return (movie_id, (mean_rating, n_ratings, label))


# Minimum number of ratings a movie needs to be considered for the
# "highest rated" title. Kept in one place so the filter and the printed
# messages cannot drift apart.
MIN_RATINGS = 5

# (movie_id, (sum, count)) -> (movie_id, (average, count, title))
movie_results = movie_stats.map(calculate_average_and_add_title)

# Print all movie results
for movie_id, (avg_rating, count, title) in movie_results.collect():
    print(f"{title} AverageRating: {avg_rating:.2f} (TotalRatings: {count})")


# Keep only movies with at least MIN_RATINGS ratings
movies_with_min_ratings = movie_results.filter(lambda x: x[1][1] >= MIN_RATINGS)

# Find the highest rated movie.
# The emptiness check comes first, so max is never called on an empty RDD.
if movies_with_min_ratings.isEmpty():
    print(f"No movies have at least {MIN_RATINGS} ratings.")
else:
    # Rank by the average rating, the first element of the value tuple.
    highest_rated_movie = movies_with_min_ratings.max(key=lambda x: x[1][0])

    movie_id, (avg_rating, count, title) = highest_rated_movie
    print(f"\n{title} is the highest rated movie with an average rating of {avg_rating:.2f} among movies with at least {MIN_RATINGS} ratings.")


E.T. the Extra-Terrestrial (1982) AverageRating: 3.67 (TotalRatings: 18)
Psycho (1960) AverageRating: 4.00 (TotalRatings: 2)
Gladiator (2000) AverageRating: 3.61 (TotalRatings: 18)
Fight Club (1999) AverageRating: 3.50 (TotalRatings: 7)
The Lord of the Rings: The Fellowship of the Ring (2001) AverageRating: 3.89 (TotalRatings: 18)
The Terminator (1984) AverageRating: 4.06 (TotalRatings: 18)
The Godfather: Part II (1974) AverageRating: 4.00 (TotalRatings: 17)
The Silence of the Lambs (1991) AverageRating: 3.14 (TotalRatings: 7)
Mad Max: Fury Road (2015) AverageRating: 3.47 (TotalRatings: 18)
Lawrence of Arabia (1962) AverageRating: 3.44 (TotalRatings: 18)
Sunset Boulevard (1950) AverageRating: 4.36 (TotalRatings: 7)
The Social Network (2010) AverageRating: 3.86 (TotalRatings: 7)
No Country for Old Men (2007) AverageRating: 3.89 (TotalRatings: 18)
The Lord of the Rings: The Return of the King (2003) AverageRating: 3.82 (TotalRatings: 11)

Sunset Boulevard (1950) is the highest rated movi

In [None]:
# Release Spark resources: stop the context first, then the session.
for handle in (sc, spark):
    handle.stop()
print("Stopping Spark Context và Spark Session...")

Đã dừng Spark Context và Spark Session.
