In [1]:
# Import necessary libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os

# Init Spark Context
conf = SparkConf().setAppName("MovieRatingsAnalytics").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("MovieRatingsAnalytics").getOrCreate()

print("Init successfully")

25/11/25 21:50:44 WARN Utils: Your hostname, tienloc-laptop resolves to a loopback address: 127.0.1.1; using 192.168.31.171 instead (on interface wlp0s20f3)
25/11/25 21:50:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 21:50:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Init successfully


In [2]:
data_path = "file:///home/tienloc/lab2bigdata/data/"
# Read data from ratings_1.txt và ratings_2.txt
ratings_1_rdd = sc.textFile(data_path + "ratings_1.txt")
ratings_2_rdd = sc.textFile(data_path + "ratings_2.txt")

print(f"Total number of rating from file 1: {ratings_1_rdd.count()}")
print(f"Total number of rating from file 2: {ratings_2_rdd.count()}")

# Show sample data
print("\nData ratings_1.txt (5 first lines):")
for line in ratings_1_rdd.take(5):
    print(line)

print("\nData ratings_2.txt (5 first lines:")
for line in ratings_2_rdd.take(5):
    print(line)

Total number of rating from file 1: 84
Total number of rating from file 2: 100

Data ratings_1.txt (5 first lines):
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

Data ratings_2.txt (5 first lines:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400


In [3]:
# Process data ratings with timestamp to get year
# Parse ratings: UserID, MovieID, Rating, Timestamp
def parse_rating_with_year(line):
    parts = line.split(',')
    rating = float(parts[2])
    timestamp = int(parts[3])

    # Convert timestamp into year
    try:
        year = datetime.fromtimestamp(timestamp).year
    except:
        year = 2000  # Default year if fail to parse

    return (year, rating)

# Parse 2 ratings files
ratings_1_parsed = ratings_1_rdd.map(parse_rating_with_year)
ratings_2_parsed = ratings_2_rdd.map(parse_rating_with_year)

print("Ratings 1 parsed with year (5 records):")
for rating in ratings_1_parsed.take(5):
    print(f"Year: {rating[0]}, Rating: {rating[1]}")

print("\nRatings 2 parsed with year (5 records):")
for rating in ratings_2_parsed.take(5):
    print(f"Year: {rating[0]}, Rating: {rating[1]}")

# Merge 2 RDD rating files together
all_ratings = ratings_1_parsed.union(ratings_2_parsed)
print(f"\nTotal number of ratings files: {all_ratings.count()}")

# Determine scope of year
years_sample = all_ratings.map(lambda x: x[0]).distinct().collect()
years_sample.sort()
print(f"\nYears existing in dataset: {years_sample}")
print(f"The lastest year: {min(years_sample)}")
print(f"The earliest year: {max(years_sample)}")

Ratings 1 parsed with year (5 records):
Year: 2000, Rating: 4.5
Year: 2000, Rating: 3.5
Year: 2000, Rating: 4.0
Year: 2000, Rating: 3.0
Year: 2000, Rating: 4.5

Ratings 2 parsed with year (5 records):
Year: 2000, Rating: 3.5
Year: 2000, Rating: 4.0
Year: 2000, Rating: 4.5
Year: 2000, Rating: 3.0
Year: 2000, Rating: 4.0

Total number of ratings files: 184

Years existing in dataset: [2000]
The lastest year: 2000
The earliest year: 2000


In [4]:
# Calculating rating and number of ratings per year
# (year, rating) -> (year, (rating, 1))
year_ratings_with_count = all_ratings.map(lambda x: (x[0], (x[1], 1)))

# Reduce by key to calculating sum of rating and total rating per year
# (year, (sum_ratings, total_count))
year_stats = year_ratings_with_count.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

# Calculating average point for each year and show result
def calculate_year_average(record):
    year, (sum_ratings, count) = record
    average_rating = sum_ratings / count
    return (year, (count, average_rating))  # (year, (total_ratings, average_rating))

year_results = year_stats.map(calculate_year_average)

# Order ascending year
sorted_year_results = year_results.sortByKey()

# show result
all_years = sorted_year_results.collect()

for year, (total_ratings, avg_rating) in all_years:
    print(f"{year} - TotalRatings: {total_ratings}, AverageRating: {avg_rating:.2f}")

2000 - TotalRatings: 184, AverageRating: 3.75


In [5]:
# Clean resources
sc.stop()
spark.stop()
print("Stopping Spark Context và Spark Session.")

Stopping Spark Context và Spark Session.
