In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"

In [2]:
%pip install pyspark




In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Cấu hình Spark
spark_conf = (
    SparkConf()
    .setAppName("MovieRatingSpark")
    .setMaster("local[*]")
)

# Tạo SparkContext
spark_context = SparkContext.getOrCreate(conf=spark_conf)

# Tạo SparkSession
spark_session = (
    SparkSession
    .builder
    .appName("MovieRatingSpark")
    .getOrCreate()
)

if spark_session:
    print("SparkSession created successfully.")
else:
    print("Error: Failed to create SparkSession.")

SparkSession created successfully.


In [4]:
import os

rs_path = "./resource/"

# Đọc file movies và rating{1,2} với RDD
movies_rdd = spark_context.textFile(os.path.join(rs_path, "movies.txt"))
ratings_1_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_1.txt"))
ratings_2_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_2.txt"))
users_rdd = spark_context.textFile(os.path.join(rs_path, "users.txt"))


# Show data
if movies_rdd and ratings_1_rdd and ratings_2_rdd:
    print("\nmovies.txt:")
    for line in movies_rdd.take(5):
        print(line)
    print("\nratings_1.txt:")
    for line in ratings_1_rdd.take(5):
        print(line)
    print("\nratings_2.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
    print("\nusers.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
else:
    print("Error: Failed to load RDD files.")


movies.txt:
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

ratings_1.txt:
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

ratings_2.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400

users.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400


In [5]:
# Parse movies
def parse_movie(line):
    parts = line.split(',', 2)
    movie_id = int(parts[0])
    title = parts[1]
    genres = parts[2] if len(parts) > 2 else ""
    genres_list = genres.split('|') if genres else []
    return (movie_id, title)
movies_parse = movies_rdd.map(parse_movie)

print("After parsing:")
for col in movies_parse.take(5):
    print(f"Movie: {col[0]} {col[1]}")


After parsing:
Movie: 1001 The Godfather (1972)
Movie: 1002 The Shawshank Redemption (1994)
Movie: 1003 Schindler's List (1993)
Movie: 1004 Raging Bull (1980)
Movie: 1005 Casablanca (1942)


In [6]:
# Parse ratings
def parse_rating(line):
    parts = line.split(',')
    user_id = int(parts[0])
    movie_id = int(parts[1])
    rating = float(parts[2])
    return (user_id, movie_id, rating)

ratings_1_parse = ratings_1_rdd.map(parse_rating)
ratings_2_parse = ratings_2_rdd.map(parse_rating)

#show rating 1
print("Rating 1:")
for col in ratings_1_parse.take(5):
    print(f"User-movie: {col[0]} {col[1]}, rating: {col[2]}")

Rating 1:
User-movie: 7 1020, rating: 4.5
User-movie: 23 1015, rating: 3.5
User-movie: 45 1030, rating: 4.0
User-movie: 12 1047, rating: 3.0
User-movie: 38 1012, rating: 4.5


In [7]:
def parse_user(line):
    parts = line.split(',')
    user_id = int(parts[0])
    gender = parts[1]
    return (user_id, gender)
user_parse = users_rdd.map(parse_user)

#show user
print("User: ")
for col in user_parse.take(5):
    print(f"User: {col[0]} {col[1]}")

User: 
User: 1 M
User: 2 F
User: 3 M
User: 4 F
User: 5 M


In [8]:
# Merge ratings 1 && 2
all_ratings = ratings_1_parse.union(ratings_2_parse)
print(f"After merging: {all_ratings.count()}")

#Tạo user_id key
ratings_by_user = all_ratings.map(
    lambda x: (x[0], (x[1], x[2]))   # (user_id, (movie_id, rating))
)

#Join với gender (user_id, gender)
ratings_user_gender = ratings_by_user.join(user_parse) # (user_id, ((movie_id, rating), gender))

# movie_id => key
movie_gender_rating = ratings_user_gender.map(
    lambda x: (
        x[1][0][0],              # movie_id
        (x[1][1], x[1][0][1], 1) # (gender, rating, 1)
    )
)

# (movie_id, gender) => key
movie_gender_key = movie_gender_rating.map(
    lambda x: ((x[0], x[1][0]), (x[1][1], x[1][2]))  
)

# tính (rating, cnt)
#((movie_id, gender), (sum, count))
gender_stats = movie_gender_key.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])  #(sum, count)
)

#((movie_id, gender), avg_rating)
gender_avg = gender_stats.mapValues(
    lambda x: x[0] / x[1]                 
)

#movie_id => key => (movie_id, (gender, avg_rating))
gender_avg_movie_key = gender_avg.map(
    lambda x: (x[0][0], (x[0][1], x[1]))  
)

grouped_gender = gender_avg_movie_key.groupByKey()

gender_avg_per_movie = grouped_gender.mapValues(lambda items: dict(list(items)))

#Join với movie lấy title
movie_gender = movies_parse.join(gender_avg_per_movie)

def formatter(x):
    return f"{x:.2f}" if x is not None else "NA"

for _, (title, gender_dict) in movie_gender.take(20):
    male_avg = gender_dict.get("M")
    female_avg = gender_dict.get("F")
    
    print(f"{title} - Male_Avg: {formatter(male_avg)}, Female_Avg: {formatter(female_avg)}")

After merging: 184
Gladiator (2000) - Male_Avg: 3.59, Female_Avg: 3.64
The Terminator (1984) - Male_Avg: 3.93, Female_Avg: 4.14
Lawrence of Arabia (1962) - Male_Avg: 3.55, Female_Avg: 3.31
Mad Max: Fury Road (2015) - Male_Avg: 4.00, Female_Avg: 3.32
No Country for Old Men (2007) - Male_Avg: 3.92, Female_Avg: 3.83
Psycho (1960) - Male_Avg: NA, Female_Avg: 4.00
E.T. the Extra-Terrestrial (1982) - Male_Avg: 3.81, Female_Avg: 3.55
Fight Club (1999) - Male_Avg: 3.50, Female_Avg: 3.50
The Godfather: Part II (1974) - Male_Avg: 4.06, Female_Avg: 3.94
The Lord of the Rings: The Fellowship of the Ring (2001) - Male_Avg: 4.00, Female_Avg: 3.80
The Silence of the Lambs (1991) - Male_Avg: 3.33, Female_Avg: 3.00
Sunset Boulevard (1950) - Male_Avg: 4.33, Female_Avg: 4.50
The Lord of the Rings: The Return of the King (2003) - Male_Avg: 3.75, Female_Avg: 3.90
The Social Network (2010) - Male_Avg: 4.00, Female_Avg: 3.67


In [9]:
# Clear
spark_context.stop()
spark_session.stop()