In [1]:
# Import libs
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# Init Spark Context
# A single shared name keeps the SparkContext and the SparkSession labelled
# identically (the session builder previously used a misspelled variant,
# "MovieRatingsAbalytics").
APP_NAME = "MovieRatingsAnalytics"

# Run locally, using every available core ("local[*]").
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName(APP_NAME).getOrCreate()

print("Init successfully!")

25/11/25 21:07:17 WARN Utils: Your hostname, tienloc-laptop resolves to a loopback address: 127.0.1.1; using 192.168.31.171 instead (on interface wlp0s20f3)
25/11/25 21:07:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 21:07:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Init successfully!


In [3]:

# Base URI of the directory that holds movies.txt and the ratings files.
# The MOVIE_DATA_PATH environment variable overrides the default so the
# notebook can run on machines where the data lives somewhere else.
data_path = os.environ.get("MOVIE_DATA_PATH", "file:///home/tienloc/lab2bigdata/data/")
if not data_path.endswith("/"):
    # File names are appended by plain string concatenation below,
    # so the base must end with a path separator.
    data_path += "/"

# Read the movies.txt file
movies_rdd = sc.textFile(data_path + "movies.txt")
print(f"Number of movies: {movies_rdd.count()}")

# Read the ratings_1.txt and ratings_2.txt files
ratings_1_rdd = sc.textFile(data_path + "ratings_1.txt")
ratings_2_rdd = sc.textFile(data_path + "ratings_2.txt")

print(f"Number of ratings from file 1: {ratings_1_rdd.count()}")
print(f"Number of ratings from file 2: {ratings_2_rdd.count()}")

# Display a few sample lines
print("\nSample data from movies.txt (first 5 lines):")
for line in movies_rdd.take(5):
    print(line)

print("\nSample data from ratings_1.txt (first 5 lines):")
for line in ratings_1_rdd.take(5):
    print(line)

                                                                                

Number of movies: 50
Number of ratings from file 1: 84
Number of ratings from file 2: 100

Sample data from movies.txt (first 5 lines):
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

Sample data from ratings_1.txt (first 5 lines):
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400


In [4]:
# Process movies data
# Parse movies.txt: MovieID, Title, Genres
def parse_movie_with_genres(line):
    """Parse one line of movies.txt into ``(movie_id, title, genre_list)``.

    The expected layout is ``MovieID,Title,Genre1|Genre2|...``. The ID is
    taken from the left and the genres from the right, so a title that itself
    contains commas (e.g. ``Monsters, Inc. (2001)``) stays intact instead of
    being cut at its first comma.

    Args:
        line: One raw text line.

    Returns:
        A tuple ``(movie_id, title, genre_list)`` where ``movie_id`` is an
        int, ``title`` is a str and ``genre_list`` is a list of stripped,
        non-empty genre names (empty when the line has no genres field).

    Raises:
        ValueError: If the ID field cannot be converted to an int.
        IndexError: If the line contains no comma at all.
    """
    parts = line.split(',', 1)  # ID | everything after the first comma
    movie_id = int(parts[0])
    rest = parts[1]

    # The last comma is treated as the title/genres boundary.
    # Assumes genre names never contain ',' (they are '|'-separated).
    title, sep, genres = rest.rpartition(',')
    if not sep:
        # Only two fields present: the remainder is the title, no genres.
        title, genres = rest, ""

    # Split genres by "|" and drop blanks / surrounding spaces
    genre_list = [genre.strip() for genre in genres.split('|') if genre.strip()]
    return (movie_id, title, genre_list)


def _as_lookup_entry(movie):
    """Re-key a parsed movie as (movie_id, (title, genre_list))."""
    movie_id, title, genre_list = movie
    return (movie_id, (title, genre_list))


def _explode_genres(movie):
    """Emit one (movie_id, genre) pair for every genre of a parsed movie."""
    movie_id, _title, genre_list = movie
    return [(movie_id, genre) for genre in genre_list]


# Turn every raw line into a (movie_id, title, genre_list) tuple
movies_parsed = movies_rdd.map(parse_movie_with_genres)

print("Movies parsed with genres (5 records):")
sample_movies = movies_parsed.take(5)
for movie in sample_movies:
    print(f"MovieID: {movie[0]}, Title: {movie[1]}, Genres: {movie[2]}")


# Driver-side lookup table: movie_id -> (title, genre_list)
movies_dict = movies_parsed.map(_as_lookup_entry).collectAsMap()
print(f"\nTotal number of movies in dictionary: {len(movies_dict)}")


# One (movie_id, genre) row per genre of each movie
movie_genres = movies_parsed.flatMap(_explode_genres)
print(f"\nTotal number of (movie_id, genre) pairs: {movie_genres.count()}")


print("\nSample movie_genres (10 records):")
sample_movie_genres = movie_genres.take(10)
for mg in sample_movie_genres:
    print(f"MovieID: {mg[0]}, Genre: {mg[1]}")


Movies parsed with genres (5 records):
MovieID: 1001, Title: The Godfather (1972), Genres: ['Crime', 'Drama']
MovieID: 1002, Title: The Shawshank Redemption (1994), Genres: ['Drama']
MovieID: 1003, Title: Schindler's List (1993), Genres: ['Biography', 'Drama', 'History']
MovieID: 1004, Title: Raging Bull (1980), Genres: ['Biography', 'Drama', 'Sport']
MovieID: 1005, Title: Casablanca (1942), Genres: ['Drama', 'Romance', 'War']

Total number of movies in dictionary: 50

Total number of (movie_id, genre) pairs: 122

Sample movie_genres (10 records):
MovieID: 1001, Genre: Crime
MovieID: 1001, Genre: Drama
MovieID: 1002, Genre: Drama
MovieID: 1003, Genre: Biography
MovieID: 1003, Genre: Drama
MovieID: 1003, Genre: History
MovieID: 1004, Genre: Biography
MovieID: 1004, Genre: Drama
MovieID: 1004, Genre: Sport
MovieID: 1005, Genre: Drama


In [5]:
# Process ratings data
# Parse ratings: UserID, MovieID, Rating, Timestamp
def parse_rating(line):
    """Extract ``(movie_id, rating)`` from one comma-separated ratings line.

    The second field is converted to an int movie id and the third field to
    a float rating; any other fields on the line are ignored.

    Args:
        line: One raw text line, e.g. ``"7,1020,4.5,1577836800"``.

    Returns:
        A tuple ``(movie_id, rating)``.
    """
    fields = line.split(',')
    return (int(fields[1]), float(fields[2]))


# Parse both rating files
# Apply the same parser to each ratings file
ratings_1_parsed, ratings_2_parsed = (
    source.map(parse_rating) for source in (ratings_1_rdd, ratings_2_rdd)
)

# Preview five parsed records from each file
for heading, parsed in (
    ("Ratings 1 parsed (5 records):", ratings_1_parsed),
    ("\nRatings 2 parsed (5 records):", ratings_2_parsed),
):
    print(heading)
    for rating in parsed.take(5):
        print(f"MovieID: {rating[0]}, Rating: {rating[1]}")


# Concatenate the ratings of both files into a single RDD
all_ratings = ratings_1_parsed.union(ratings_2_parsed)
print(f"\nTotal number of ratings from both files: {all_ratings.count()}")


Ratings 1 parsed (5 records):
MovieID: 1020, Rating: 4.5
MovieID: 1015, Rating: 3.5
MovieID: 1030, Rating: 4.0
MovieID: 1047, Rating: 3.0
MovieID: 1012, Rating: 4.5

Ratings 2 parsed (5 records):
MovieID: 1012, Rating: 3.5
MovieID: 1039, Rating: 4.0
MovieID: 1043, Rating: 4.5
MovieID: 1020, Rating: 3.0
MovieID: 1050, Rating: 4.0

Total number of ratings from both files: 184


In [6]:
# Compute average rating and total number of ratings for each genre

# Join ratings with movie_genres to get (genre, rating)
# all_ratings has format (movie_id, rating)
# movie_genres has format (movie_id, genre)

# Join to get (movie_id, (rating, genre))
def _genre_and_rating(joined):
    """Drop the movie id from (movie_id, (rating, genre)) and return (genre, rating)."""
    _movie_id, (rating, genre) = joined
    return (genre, rating)


def _with_unit_count(pair):
    """Turn (genre, rating) into (genre, (rating, 1)) so counts can be summed."""
    genre, rating = pair
    return (genre, (rating, 1))


def _add_stats(left, right):
    """Add two (sum_ratings, count) pairs element-wise."""
    return (left[0] + right[0], left[1] + right[1])


# Inner join on movie_id -> (movie_id, (rating, genre))
ratings_with_genres = all_ratings.join(movie_genres)
print("Ratings with genres (5 records):")
sample_joined = ratings_with_genres.take(5)
for record in sample_joined:
    print(f"MovieID: {record[0]}, Rating: {record[1][0]}, Genre: {record[1][1]}")


# Re-key by genre -> (genre, rating)
genre_ratings = ratings_with_genres.map(_genre_and_rating)

print(f"\nTotal number of genre-rating pairs: {genre_ratings.count()}")
print("\nGenre ratings (10 records):")
sample_genre_ratings = genre_ratings.take(10)
for gr in sample_genre_ratings:
    print(f"Genre: {gr[0]}, Rating: {gr[1]}")


# Attach a count of 1 to every rating -> (genre, (rating, 1))
genre_ratings_with_count = genre_ratings.map(_with_unit_count)

# Sum ratings and counts per genre -> (genre, (sum_ratings, total_count))
genre_stats = genre_ratings_with_count.reduceByKey(_add_stats)

print(f"\nTotal number of genres with ratings: {genre_stats.count()}")
print("Genre stats (5 records):")
sample_stats = genre_stats.take(5)
for stat in sample_stats:
    print(f"Genre: {stat[0]}, Sum: {stat[1][0]}, Count: {stat[1][1]}")


Ratings with genres (5 records):
MovieID: 1020, Rating: 4.5, Genre: Family
MovieID: 1020, Rating: 4.5, Genre: Sci-Fi
MovieID: 1020, Rating: 3.5, Genre: Family
MovieID: 1020, Rating: 3.5, Genre: Sci-Fi
MovieID: 1020, Rating: 3.5, Genre: Family

Total number of genre-rating pairs: 471

Genre ratings (10 records):
Genre: Family, Rating: 4.5
Genre: Sci-Fi, Rating: 4.5
Genre: Family, Rating: 3.5
Genre: Sci-Fi, Rating: 3.5
Genre: Family, Rating: 3.5
Genre: Sci-Fi, Rating: 3.5
Genre: Family, Rating: 3.5
Genre: Sci-Fi, Rating: 3.5
Genre: Family, Rating: 3.5
Genre: Sci-Fi, Rating: 3.5

Total number of genres with ratings: 12
Genre stats (5 records):
Genre: Sci-Fi, Sum: 201.5, Count: 54
Genre: Action, Sum: 200.5, Count: 54
Genre: Family, Sum: 66.0, Count: 18
Genre: Drama, Sum: 481.0, Count: 128
Genre: Biography, Sum: 89.0, Count: 25


In [7]:
# Compute average rating for each genre and display the results
def calculate_genre_average(record):
    """Convert ``(genre, (sum_ratings, count))`` into ``(genre, (average, count))``.

    Args:
        record: A pair whose second element is a ``(sum_ratings, count)`` pair.

    Returns:
        ``(genre, (sum_ratings / count, count))``.
    """
    genre, totals = record
    sum_ratings, count = totals
    return (genre, (sum_ratings / count, count))


# Turn each genre's (sum, count) into (average, count)
genre_results = genre_stats.map(calculate_genre_average)

# Bring every genre to the driver and print one summary line per genre
all_genres = genre_results.collect()
for genre, stats in all_genres:
    avg_rating, count = stats
    print(f"{genre} - AverageRating: {avg_rating:.2f} (TotalRatings: {count})")


Sci-Fi - AverageRating: 3.73 (TotalRatings: 54)
Action - AverageRating: 3.71 (TotalRatings: 54)
Family - AverageRating: 3.67 (TotalRatings: 18)
Drama - AverageRating: 3.76 (TotalRatings: 128)
Biography - AverageRating: 3.56 (TotalRatings: 25)
Thriller - AverageRating: 3.70 (TotalRatings: 27)
Horror - AverageRating: 4.00 (TotalRatings: 2)
Adventure - AverageRating: 3.63 (TotalRatings: 83)
Film-Noir - AverageRating: 4.36 (TotalRatings: 7)
Mystery - AverageRating: 4.00 (TotalRatings: 2)
Fantasy - AverageRating: 3.86 (TotalRatings: 29)
Crime - AverageRating: 3.81 (TotalRatings: 42)


In [8]:
# Clean up resources: stop the SparkContext first, then the SparkSession.
# NOTE(review): SparkSession.stop() presumably also stops its underlying
# context, which would make the explicit sc.stop() redundant — confirm before removing.
sc.stop()
spark.stop()
# This message is printed only after both stop() calls have returned.
print("Stopping Spark Context và Spark Session.")

Stopping Spark Context và Spark Session.
