In [1]:
# Import neccessary libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# Khởi tạo Spark Context
conf = SparkConf().setAppName("MovieRatingsAnalytics").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("MovieRatingsAnalytics").getOrCreate()

print("Init successfully")

25/11/25 21:40:51 WARN Utils: Your hostname, tienloc-laptop resolves to a loopback address: 127.0.1.1; using 192.168.31.171 instead (on interface wlp0s20f3)
25/11/25 21:40:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 21:40:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Init successfully


In [2]:
data_path = "file:///home/tienloc/lab2bigdata/data/"

# Read file ratings_1.txt và ratings_2.txt
ratings_1_rdd = sc.textFile(data_path + "ratings_1.txt")
ratings_2_rdd = sc.textFile(data_path + "ratings_2.txt")

print(f"Number of rating from file 1: {ratings_1_rdd.count()}")
print(f"Number of rating from file 2: {ratings_2_rdd.count()}")

# Read file users.txt và occupation.txt
users_rdd = sc.textFile(data_path + "users.txt")
occupation_rdd = sc.textFile(data_path + "occupation.txt")


print(f"Number of user: {users_rdd.count()}")
print(f"Number of occupation: {occupation_rdd.count()}")

print("\nData from ratings_1.txt (5 first lines):")
for line in ratings_1_rdd.take(5):
    print(line)

print("\nData from users.txt (5 first lines):")
for line in users_rdd.take(5):
    print(line)

print("\nData from occupation.txt (5 first lines):")
for line in occupation_rdd.take(5):
    print(line)

Number of rating from file 1: 84
Number of rating from file 2: 100
Number of user: 50
Number of occupation: 15

Data from ratings_1.txt (5 first lines):
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

Data from users.txt (5 first lines):
21,M,28,3,12345
2,F,35,7,23456
3,M,42,2,34567
4,F,19,10,45678
5,M,31,1,56789

Data from occupation.txt (5 first lines):
1,Programmer
2,Doctor
3,Engineer
4,Teacher
5,Lawyer


In [4]:
# Process data occupation
# Parse occupation.txt: ID, Occupation
def parse_occupation(line):
    parts = line.split(',', 1)  
    occupation_id = int(parts[0])
    occupation_name = parts[1] if len(parts) > 1 else f"Unknown {occupation_id}"
    return (occupation_id, occupation_name)

occupations_parsed = occupation_rdd.map(parse_occupation)
print("Occupations parsed (5 records):")
for occ in occupations_parsed.take(5):
    print(f"OccupationID: {occ[0]}, Occupation: {occ[1]}")

# Create dictionary to look up by ID
occupations_dict = occupations_parsed.collectAsMap()
print(f"\nTotal number of occupation in dictionary: {len(occupations_dict)}")

# Process data by users
# Parse users.txt: UserID, Gender, Age, Occupation, Zip-code
def parse_user_occupation(line):
    parts = line.split(',')
    user_id = int(parts[0])
    occupation_id = int(parts[3])  # ID occupation
    return (user_id, occupation_id)

users_parsed = users_rdd.map(parse_user_occupation)
print("\nUsers parsed with occupation (5 records):")
for user in users_parsed.take(5):
    user_id, occ_id = user
    occ_name = occupations_dict.get(occ_id, f"Unknown {occ_id}")
    print(f"UserID: {user_id}, OccupationID: {occ_id}, Occupation: {occ_name}")

# Create dictionary to look up occupation by UserID
users_occupation_dict = users_parsed.collectAsMap()
print(f"\nTotal number of users in dictionary: {len(users_occupation_dict)}")

Occupations parsed (5 records):
OccupationID: 1, Occupation: Programmer
OccupationID: 2, Occupation: Doctor
OccupationID: 3, Occupation: Engineer
OccupationID: 4, Occupation: Teacher
OccupationID: 5, Occupation: Lawyer

Total number of occupation in dictionary: 15

Users parsed with occupation (5 records):
UserID: 21, OccupationID: 3, Occupation: Engineer
UserID: 2, OccupationID: 7, Occupation: Manager
UserID: 3, OccupationID: 2, Occupation: Doctor
UserID: 4, OccupationID: 10, Occupation: Accountant
UserID: 5, OccupationID: 1, Occupation: Programmer

Total number of users in dictionary: 49


In [5]:
# Process data ratings
# Parse ratings: UserID, MovieID, Rating, Timestamp
def parse_rating_with_user(line):
    parts = line.split(',')
    user_id = int(parts[0])
    rating = float(parts[2])
    return (user_id, rating)  # (user_id, rating) to join with users

# Parse both 2 files ratings
ratings_1_parsed = ratings_1_rdd.map(parse_rating_with_user)
ratings_2_parsed = ratings_2_rdd.map(parse_rating_with_user)

print("Ratings 1 parsed (5 records):")
for rating in ratings_1_parsed.take(5):
    print(f"UserID: {rating[0]}, Rating: {rating[1]}")

print("\nRatings 2 parsed (5 records):")
for rating in ratings_2_parsed.take(5):
    print(f"UserID: {rating[0]}, Rating: {rating[1]}")

# Merge 2 RDD from ratings together
all_ratings = ratings_1_parsed.union(ratings_2_parsed)
print(f"\nTotal number of ratings from 2 files: {all_ratings.count()}")

Ratings 1 parsed (5 records):
UserID: 7, Rating: 4.5
UserID: 23, Rating: 3.5
UserID: 45, Rating: 4.0
UserID: 12, Rating: 3.0
UserID: 38, Rating: 4.5

Ratings 2 parsed (5 records):
UserID: 12, Rating: 3.5
UserID: 34, Rating: 4.0
UserID: 27, Rating: 4.5
UserID: 8, Rating: 3.0
UserID: 19, Rating: 4.0

Total number of ratings from 2 files: 184


In [6]:
# Calculate average point and total number of votes for each occupation

# Adding occupation into ratings
# all_ratings has format (user_id, rating)
# users_occupation_dict has format {user_id: occupation_id}

def add_occupation_to_rating(record):
    user_id, rating = record
    occupation_id = users_occupation_dict.get(user_id, -1)  
    occupation_name = occupations_dict.get(occupation_id, "Unknown")
    return (occupation_name, rating)

ratings_with_occupation = all_ratings.map(add_occupation_to_rating)

print("Ratings with occupation (5 records):")
for record in ratings_with_occupation.take(5):
    print(f"Occupation: {record[0]}, Rating: {record[1]}")

# Remove rating with occupation "Unknown"
valid_ratings = ratings_with_occupation.filter(lambda x: x[0] != "Unknown")

print(f"\nTotal number of valid occupation - rating pairs: {valid_ratings.count()}")

# Calculating rating and number of rating for each occupation
# (occupation, rating) -> (occupation, (rating, 1))
occupation_ratings_with_count = valid_ratings.map(lambda x: (x[0], (x[1], 1)))

# Reduce by key in order to calculate rating and total number of rating for each occupation
# (occupation, (sum_ratings, total_count))
occupation_stats = occupation_ratings_with_count.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

print(f"\nTotal number of occupation that has rating: {occupation_stats.count()}")
print("Occupation stats (5 records):")
for stat in occupation_stats.take(5):
    print(f"Occupation: {stat[0]}, Sum: {stat[1][0]}, Count: {stat[1][1]}")

Ratings with occupation (5 records):
Occupation: Designer, Rating: 4.5
Occupation: Consultant, Rating: 3.5
Occupation: Designer, Rating: 4.0
Occupation: Nurse, Rating: 3.0
Occupation: Journalist, Rating: 4.5

Total number of valid occupation - rating pairs: 183

Total number of occupation that has rating: 14
Occupation stats (5 records):
Occupation: Nurse, Sum: 42.5, Count: 11
Occupation: Manager, Sum: 55.5, Count: 16
Occupation: Artist, Sum: 41.0, Count: 11
Occupation: Designer, Sum: 52.0, Count: 13
Occupation: Programmer, Sum: 42.5, Count: 10


In [7]:
# Calculatiing average point for each occupation and show results
def calculate_occupation_average(record):
    occupation, (sum_ratings, count) = record
    average_rating = sum_ratings / count
    return (occupation, (average_rating, count))

occupation_results = occupation_stats.map(calculate_occupation_average)

# Show results
all_occupations = occupation_results.collect()

for occupation, (avg_rating, count) in all_occupations:
    print(f"{occupation} - AverageRating: {avg_rating:.2f} (TotalRatings: {count})")

Nurse - AverageRating: 3.86 (TotalRatings: 11)
Manager - AverageRating: 3.47 (TotalRatings: 16)
Artist - AverageRating: 3.73 (TotalRatings: 11)
Designer - AverageRating: 4.00 (TotalRatings: 13)
Programmer - AverageRating: 4.25 (TotalRatings: 10)
Teacher - AverageRating: 3.70 (TotalRatings: 5)
Consultant - AverageRating: 3.86 (TotalRatings: 14)
Doctor - AverageRating: 3.69 (TotalRatings: 21)
Student - AverageRating: 4.00 (TotalRatings: 8)
Salesperson - AverageRating: 3.65 (TotalRatings: 17)
Engineer - AverageRating: 3.53 (TotalRatings: 17)
Journalist - AverageRating: 3.85 (TotalRatings: 17)
Lawyer - AverageRating: 3.65 (TotalRatings: 17)
Accountant - AverageRating: 3.58 (TotalRatings: 6)


In [8]:
# Clean resources
sc.stop()
spark.stop()
print("Stopping Spark Context và Spark Session.")

Stopping Spark Context và Spark Session.
