In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"

In [2]:
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Cấu hình Spark
spark_conf = (
    SparkConf()
    .setAppName("MovieRatingSpark")
    .setMaster("local[*]")
)

# Tạo SparkContext
spark_context = SparkContext.getOrCreate(conf=spark_conf)

# Tạo SparkSession
spark_session = (
    SparkSession
    .builder
    .appName("MovieRatingSpark")
    .getOrCreate()
)

if spark_session:
    print("SparkSession created successfully.")
else:
    print("Error: Failed to create SparkSession.")

SparkSession created successfully.


In [4]:
import os

rs_path = "./resource/"

# Đọc file movies và rating{1,2} với RDD
movies_rdd = spark_context.textFile(os.path.join(rs_path, "movies.txt"))
ratings_1_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_1.txt"))
ratings_2_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_2.txt"))
users_rdd = spark_context.textFile(os.path.join(rs_path, "users.txt"))


# Show data
if movies_rdd and ratings_1_rdd and ratings_2_rdd:
    print("\nmovies.txt:")
    for line in movies_rdd.take(5):
        print(line)
    print("\nratings_1.txt:")
    for line in ratings_1_rdd.take(5):
        print(line)
    print("\nratings_2.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
    print("\nusers.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
else:
    print("Error: Failed to load RDD files.")


movies.txt:
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

ratings_1.txt:
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

ratings_2.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400

users.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400


In [5]:
# Parse movies
def parse_movie(line):
    parts = line.split(',', 2)
    movie_id = int(parts[0])
    title = parts[1]
    genres = parts[2] if len(parts) > 2 else ""
    genres_list = genres.split('|') if genres else []
    return (movie_id, title)
movies_parse = movies_rdd.map(parse_movie)

print("After parsing:")
for col in movies_parse.take(5):
    print(f"Movie: {col[0]} {col[1]}")


After parsing:
Movie: 1001 The Godfather (1972)
Movie: 1002 The Shawshank Redemption (1994)
Movie: 1003 Schindler's List (1993)
Movie: 1004 Raging Bull (1980)
Movie: 1005 Casablanca (1942)


In [6]:
# Parse ratings
def parse_rating(line):
    parts = line.split(',')
    user_id = int(parts[0])
    movie_id = int(parts[1])
    rating = float(parts[2])
    return (user_id, movie_id, rating)

ratings_1_parse = ratings_1_rdd.map(parse_rating)
ratings_2_parse = ratings_2_rdd.map(parse_rating)

#show rating 1
print("Rating 1:")
for col in ratings_1_parse.take(5):
    print(f"User-movie: {col[0]} {col[1]}, rating: {col[2]}")

Rating 1:
User-movie: 7 1020, rating: 4.5
User-movie: 23 1015, rating: 3.5
User-movie: 45 1030, rating: 4.0
User-movie: 12 1047, rating: 3.0
User-movie: 38 1012, rating: 4.5


In [7]:
def parse_user(line):
    parts = line.split(',', 3)
    user_id = int(parts[0])
    gender = parts[1]
    age = int(parts[2])
    return (user_id, age)
user_parse = users_rdd.map(parse_user)

#show user
print("User: ")
for col in user_parse.take(5):
    print(f"User: {col[0]} {col[1]}")

User: 
User: 1 28
User: 2 35
User: 3 42
User: 4 19
User: 5 31


In [8]:
def age_bucket(age):
    if age <= 18:
        return "0-18"
    elif age <= 35:
        return "18-35"
    elif age <= 50:
        return "35-50"
    else:
        return "50+"

In [9]:
# Merge ratings 1 && 2
all_ratings = ratings_1_parse.union(ratings_2_parse)
print(f"After merging: {all_ratings.count()}")

#Tạo user_id key
ratings_by_user = all_ratings.map(
    lambda x: (x[0], (x[1], x[2]))   # (user_id, (movie_id, rating))
)

#Join với age (user_id, age)
ratings_user_age = ratings_by_user.join(user_parse) # (user_id, ((movie_id, rating), age))

# (user_id, ((movie_id, rating), age)) => age_bucket => ((movie_id, age_group), rating)
ratings_age_group = ratings_user_age.map(
    lambda x: ((x[1][0][0], age_bucket(x[1][1])), x[1][0][1])
)

# => ((movie_id, age_group), (rating, count))
ratings_pairs = ratings_age_group.map(
    lambda x: (x[0], (x[1], 1))
)

# Tính sum + count => ((movie_id, age_group), (sumRating, count))
ratings_stats = ratings_pairs.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

# Tính avg => ((movie_id, age_group), avg)
rating_age_group_avg = ratings_stats.mapValues(
    lambda x: x[0] / x[1]
)

#movie_id => key => (movie_id, (age_group, avg_rating))
age_group_avg_movie_key = rating_age_group_avg.map(
    lambda x: (x[0][0], (x[0][1], x[1]))  
)

grouped_age = age_group_avg_movie_key.groupByKey()

#(movie_id, {age_group: avg, ...})
age_group_avg_per_movie = grouped_age.mapValues(lambda items: dict(list(items)))

#Join với movie lấy title
#(movie_id, (title, {age_group: avg}))
movie_age_group = movies_parse.join(age_group_avg_per_movie)

def formatter(x):
    return f"{x:.2f}" if x is not None else "NA"

for _, (title, age_dict) in movie_age_group.take(10):
    print(
        f"{title} - "
        f"[0-18: {formatter(age_dict.get('0-18'))}, "
        f"18-35: {formatter(age_dict.get('18-35'))}, "
        f"35-50: {formatter(age_dict.get('35-50'))}, "
        f"50+: {formatter(age_dict.get('50+'))}]"
    )

After merging: 184
Gladiator (2000) - [0-18: NA, 18-35: 3.44, 35-50: 3.81, 50+: 3.50]
The Terminator (1984) - [0-18: NA, 18-35: 4.17, 35-50: 4.05, 50+: 3.75]
Lawrence of Arabia (1962) - [0-18: NA, 18-35: 3.60, 35-50: 3.29, 50+: 4.50]
Mad Max: Fury Road (2015) - [0-18: NA, 18-35: 3.36, 35-50: 3.64, 50+: NA]
No Country for Old Men (2007) - [0-18: NA, 18-35: 3.81, 35-50: 3.94, 50+: 4.00]
Psycho (1960) - [0-18: NA, 18-35: 4.50, 35-50: 3.50, 50+: NA]
E.T. the Extra-Terrestrial (1982) - [0-18: NA, 18-35: 3.56, 35-50: 3.83, 50+: 3.00]
Fight Club (1999) - [0-18: NA, 18-35: 3.50, 35-50: 3.50, 50+: 3.50]
The Godfather: Part II (1974) - [0-18: NA, 18-35: 3.78, 35-50: 4.25, 50+: NA]
The Lord of the Rings: The Fellowship of the Ring (2001) - [0-18: NA, 18-35: 4.00, 35-50: 3.83, 50+: NA]


In [11]:
# 22520126 - Trương Hoài Bảo
# Clear
spark_context.stop()
spark_session.stop()