In [1]:
# Import libs
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# Configure Spark: fixed application name, local master ("local[*]").
conf = SparkConf()
conf.setAppName("MovieRatingsAnalytics")
conf.setMaster("local[*]")

# Low-level entry point used for the RDD API in the cells below.
sc = SparkContext.getOrCreate(conf=conf)

# SparkSession handle created under the same application name.
session_builder = SparkSession.builder.appName("MovieRatingsAnalytics")
spark = session_builder.getOrCreate()

print("Init successfully!")

25/11/25 21:16:30 WARN Utils: Your hostname, tienloc-laptop resolves to a loopback address: 127.0.1.1; using 192.168.31.171 instead (on interface wlp0s20f3)
25/11/25 21:16:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 21:16:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Init successfully!


In [3]:

# Base URI of the directory holding movies.txt, ratings_1.txt, ratings_2.txt
# and users.txt. Set the MOVIE_DATA_PATH environment variable to point the
# notebook at another location without editing this cell; when it is unset
# (or empty) the original location is used.
data_path = os.environ.get("MOVIE_DATA_PATH") or "file:///home/tienloc/lab2bigdata/data/"
# File names are appended by plain string concatenation below, so make sure
# an overridden path ends with a slash.
if not data_path.endswith("/"):
    data_path += "/"

# Read the movies.txt file
movies_rdd = sc.textFile(data_path + "movies.txt")
print(f"Number of movies: {movies_rdd.count()}")

# Read the ratings_1.txt and ratings_2.txt files
ratings_1_rdd = sc.textFile(data_path + "ratings_1.txt")
ratings_2_rdd = sc.textFile(data_path + "ratings_2.txt")
# Read users.txt
users_rdd = sc.textFile(data_path + "users.txt")

print(f"Number of ratings from file 1: {ratings_1_rdd.count()}")
print(f"Number of ratings from file 2: {ratings_2_rdd.count()}")

# Display a few sample lines from each input (label shown in the heading, RDD to sample)
sample_sources = (
    ("movies.txt", movies_rdd),
    ("ratings_1.txt", ratings_1_rdd),
    ("Users.txt", users_rdd),
)
for file_label, sample_rdd in sample_sources:
    print(f"\nSample data from {file_label} (first 5 lines):")
    for line in sample_rdd.take(5):
        print(line)

Number of movies: 50
Number of ratings from file 1: 84
Number of ratings from file 2: 100

Sample data from movies.txt (first 5 lines):
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

Sample data from ratings_1.txt (first 5 lines):
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

Sample data from Users.txt (first 5 lines):
21,M,28,3,12345
2,F,35,7,23456
3,M,42,2,34567
4,F,19,10,45678
5,M,31,1,56789


In [4]:
# Parse users.txt: UserID, Gender, Age, Occupation, Zip-code
def parse_user(line):
    """Extract ``(user_id, gender)`` from one comma-separated users.txt line.

    Only the first two columns are needed by the analysis, so the remaining
    columns (age, occupation, zip code) are deliberately left unparsed: a bad
    value in a column that is never used should not abort the whole job.

    Args:
        line: One text line in the form ``UserID,Gender,Age,Occupation,Zip-code``.

    Returns:
        A ``(user_id, gender)`` tuple where ``user_id`` is an ``int`` and
        ``gender`` is the second column with surrounding whitespace removed
        (``"M"`` or ``"F"`` in the sample data).

    Raises:
        ValueError: If the first column is not an integer.
        IndexError: If the line has no gender column.
    """
    # Split only as far as needed; everything after the gender stays in one piece.
    parts = line.split(',', 2)
    user_id = int(parts[0])
    # Strip so that " F" or "F " still matches the later 'M'/'F' comparison.
    gender = parts[1].strip()  # M or F
    return (user_id, gender)

# Turn every raw users.txt line into a (user_id, gender) pair.
users_parsed = users_rdd.map(parse_user)

print("Users parsed (5 records):")
for uid, sex in users_parsed.take(5):
    print(f"UserID: {uid}, Gender: {sex}")

# Bring the pairs to the driver as a dict for gender lookup by user id.
# A dict holds one value per key, so repeated user ids collapse to one entry.
users_dict = users_parsed.collectAsMap()
user_total = len(users_dict)
print(f"\nTotal number of user in dictionary: {user_total}")

Users parsed (5 records):
UserID: 21, Gender: M
UserID: 2, Gender: F
UserID: 3, Gender: M
UserID: 4, Gender: F
UserID: 5, Gender: M

Total number of user in dictionary: 49


In [5]:
# Process data movies
# Parse movies.txt: MovieID, Title, Genres
def parse_movie(line):
    """Extract ``(movie_id, title)`` from one comma-separated movies.txt line.

    The id is split off from the left and, when a genres column is present,
    it is split off from the right. Commas that are part of the title
    therefore stay in the title instead of truncating it.

    Args:
        line: One text line in the form ``MovieID,Title,Genres``. The genres
            column may be missing. When at least two commas are present, the
            text after the last comma is treated as the genres column.

    Returns:
        A ``(movie_id, title)`` tuple where ``movie_id`` is an ``int``.

    Raises:
        ValueError: If the first column is not an integer.
        IndexError: If the line has no title column.
    """
    parts = line.split(',', 1)
    movie_id = int(parts[0])
    # Drop the trailing genres column, if any; with no further comma the
    # remainder is the title itself.
    title = parts[1].rsplit(',', 1)[0]
    return (movie_id, title)

# Turn every raw movies.txt line into a (movie_id, title) pair.
movies_parsed = movies_rdd.map(parse_movie)

print("Movies parsed (5 records):")
for film_id, film_title in movies_parsed.take(5):
    print(f"MovieID: {film_id}, Title: {film_title}")

# Bring the pairs to the driver as a dict for title lookup by movie id.
movies_dict = movies_parsed.collectAsMap()
film_total = len(movies_dict)
print(f"\nTotal number of films in dictionary: {film_total}")

Movies parsed (5 records):
MovieID: 1001, Title: The Godfather (1972)
MovieID: 1002, Title: The Shawshank Redemption (1994)
MovieID: 1003, Title: Schindler's List (1993)
MovieID: 1004, Title: Raging Bull (1980)
MovieID: 1005, Title: Casablanca (1942)

Total number of films in dictionary: 50


In [6]:
# Processing data ratings
# Parse ratings: UserID, MovieID, Rating, Timestamp
def parse_rating_with_user(line):
    """Convert one comma-separated ratings line into ``(user_id, movie_id, rating)``.

    Args:
        line: One text line in the form ``UserID,MovieID,Rating,Timestamp``.

    Returns:
        A tuple ``(user_id, movie_id, rating)`` with two ``int`` values
        followed by a ``float``.

    Raises:
        ValueError: If any of the four columns cannot be converted.
        IndexError: If the line has fewer than four columns.
    """
    fields = line.split(',')
    record = (int(fields[0]), int(fields[1]), float(fields[2]))
    # The timestamp is not returned; it is still converted so that a line
    # with a missing or malformed timestamp is rejected.
    int(fields[3])
    return record

# Parse both ratings files into (user_id, movie_id, rating) tuples.
ratings_1_parsed = ratings_1_rdd.map(parse_rating_with_user)
ratings_2_parsed = ratings_2_rdd.map(parse_rating_with_user)

# Preview the first records of each parsed file under its own heading.
previews = (
    ("Ratings 1 parsed (5 records):", ratings_1_parsed),
    ("\nRatings 2 parsed (5 records):", ratings_2_parsed),
)
for heading, parsed_rdd in previews:
    print(heading)
    for uid, film_id, score in parsed_rdd.take(5):
        print(f"UserID: {uid}, MovieID: {film_id}, Rating: {score}")

# Combine the two parsed RDDs into a single ratings RDD.
all_ratings = ratings_1_parsed.union(ratings_2_parsed)
rating_total = all_ratings.count()
print(f"\nTotal number of ratings from 2 files: {rating_total}")

Ratings 1 parsed (5 records):
UserID: 7, MovieID: 1020, Rating: 4.5
UserID: 23, MovieID: 1015, Rating: 3.5
UserID: 45, MovieID: 1030, Rating: 4.0
UserID: 12, MovieID: 1047, Rating: 3.0
UserID: 38, MovieID: 1012, Rating: 4.5

Ratings 2 parsed (5 records):
UserID: 12, MovieID: 1012, Rating: 3.5
UserID: 34, MovieID: 1039, Rating: 4.0
UserID: 27, MovieID: 1043, Rating: 4.5
UserID: 8, MovieID: 1020, Rating: 3.0
UserID: 19, MovieID: 1050, Rating: 4.0

Total number of ratings from 2 files: 184


In [7]:
# Attach the rater's gender to every rating.
# Input  (all_ratings): (user_id, movie_id, rating)
# Output:               (movie_id, (rating, gender))
def add_gender_to_rating(record):
    """Re-key a rating by movie and pair it with the rater's gender.

    The gender is looked up in the module-level ``users_dict``; a user id
    that is not present there yields the placeholder ``"Unknown"``.

    Args:
        record: A ``(user_id, movie_id, rating)`` tuple.

    Returns:
        ``(movie_id, (rating, gender))``.
    """
    rater_id, film_id, score = record
    return (film_id, (score, users_dict.get(rater_id, "Unknown")))

# Each element becomes (movie_id, (rating, gender)).
ratings_with_gender = all_ratings.map(add_gender_to_rating)

print("Ratings with gender (10 records):")
for film_id, (score, sex) in ratings_with_gender.take(10):
    print(f"MovieID: {film_id}, Rating: {score}, Gender: {sex}")

# Keep only ratings whose gender is M or F; the "Unknown" placeholder is dropped here.
valid_ratings = ratings_with_gender.filter(lambda pair: pair[1][1] in ('M', 'F'))

valid_total = valid_ratings.count()
print(f"\nTotal number of rating with valid gender: {valid_total}")


def _strip_gender(pair):
    """Reduce (movie_id, (rating, gender)) to (movie_id, rating)."""
    return (pair[0], pair[1][0])


# One (movie_id, rating) RDD per gender.
male_ratings = valid_ratings.filter(lambda pair: pair[1][1] == 'M').map(_strip_gender)
female_ratings = valid_ratings.filter(lambda pair: pair[1][1] == 'F').map(_strip_gender)

male_total = male_ratings.count()
female_total = female_ratings.count()
print(f"Number of ratings from Male : {male_total}")
print(f"Number of ratings from Female: {female_total}")

Ratings with gender (10 records):
MovieID: 1020, Rating: 4.5, Gender: M
MovieID: 1015, Rating: 3.5, Gender: M
MovieID: 1030, Rating: 4.0, Gender: M
MovieID: 1047, Rating: 3.0, Gender: F
MovieID: 1012, Rating: 4.5, Gender: F
MovieID: 1050, Rating: 3.5, Gender: F
MovieID: 1037, Rating: 4.0, Gender: M
MovieID: 1040, Rating: 3.0, Gender: M
MovieID: 1025, Rating: 4.5, Gender: F
MovieID: 1010, Rating: 3.5, Gender: M

Total number of rating with valid gender: 183
Number of ratings from Male : 91
Number of ratings from Female: 92


In [8]:
# Average rating per movie, computed separately for each gender.
def _with_count(pair):
    """Turn (movie_id, rating) into (movie_id, (rating, 1)) so sums and counts reduce together."""
    return (pair[0], (pair[1], 1))


def _add_pairs(left, right):
    """Element-wise sum of two (rating_sum, count) pairs."""
    return (left[0] + right[0], left[1] + right[1])


def _to_average(entry):
    """Turn (movie_id, (rating_sum, count)) into (movie_id, average_rating)."""
    return (entry[0], entry[1][0] / entry[1][1])


# Male: (movie_id, (rating_sum, count)) -> (movie_id, average_rating)
male_stats = male_ratings.map(_with_count).reduceByKey(_add_pairs)
male_averages = male_stats.map(_to_average)

# Female: (movie_id, (rating_sum, count)) -> (movie_id, average_rating)
female_stats = female_ratings.map(_with_count).reduceByKey(_add_pairs)
female_averages = female_stats.map(_to_average)

# Full outer join keeps every movie rated by at least one gender; the side
# with no ratings is None: (movie_id, (male_avg or None, female_avg or None))
movie_gender_ratings = male_averages.fullOuterJoin(female_averages)

# Attach the movie title and render missing averages as "NA".
def add_movie_title_with_na(record):
    """Format one joined average record for display.

    The title is looked up in the module-level ``movies_dict``; an id that
    is not present there is shown as ``"Unknown Movie <id>"``.

    Args:
        record: ``(movie_id, (male_avg, female_avg))`` where either average
            may be ``None``.

    Returns:
        ``(movie_id, (title, male_text, female_text))``. Each text is the
        average formatted with two decimals, or ``"NA"`` when the average
        is ``None``.
    """
    movie_id, averages = record
    male_avg, female_avg = averages
    title = movies_dict.get(movie_id, f"Unknown Movie {movie_id}")

    shown = tuple(
        "NA" if avg is None else f"{avg:.2f}"
        for avg in (male_avg, female_avg)
    )
    return (movie_id, (title, *shown))

# Format every joined record as (movie_id, (title, male_text, female_text)).
final_results = movie_gender_ratings.map(add_movie_title_with_na)

# Bring the formatted rows to the driver and print one line per movie.
all_movies = final_results.collect()

for movie_id, display_fields in all_movies:
    title, male_text, female_text = display_fields
    print(f"{title} - Male_Avg: {male_text}, Female_Avg: {female_text}")

Gladiator (2000) - Male_Avg: 3.59, Female_Avg: 3.64
The Terminator (1984) - Male_Avg: 3.93, Female_Avg: 4.14
Lawrence of Arabia (1962) - Male_Avg: 3.55, Female_Avg: 3.31
Mad Max: Fury Road (2015) - Male_Avg: 4.00, Female_Avg: 3.32
No Country for Old Men (2007) - Male_Avg: 3.91, Female_Avg: 3.83
E.T. the Extra-Terrestrial (1982) - Male_Avg: 3.81, Female_Avg: 3.55
Fight Club (1999) - Male_Avg: 3.50, Female_Avg: 3.50
Psycho (1960) - Male_Avg: NA, Female_Avg: 4.00
The Lord of the Rings: The Fellowship of the Ring (2001) - Male_Avg: 4.00, Female_Avg: 3.80
The Godfather: Part II (1974) - Male_Avg: 4.06, Female_Avg: 3.94
The Silence of the Lambs (1991) - Male_Avg: 3.33, Female_Avg: 3.00
Sunset Boulevard (1950) - Male_Avg: 4.33, Female_Avg: 4.50
The Social Network (2010) - Male_Avg: 4.00, Female_Avg: 3.67
The Lord of the Rings: The Return of the King (2003) - Male_Avg: 3.75, Female_Avg: 3.90


In [None]:
# Release Spark resources: stop the context first, then the session.
for spark_handle in (sc, spark):
    spark_handle.stop()
print("Stopping Spark Context và Spark Session.")

Đã dừng Spark Context và Spark Session.
