In [20]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\DELL\AppData\Local\Programs\Python\Python310\python.exe"

In [21]:
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [22]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Cấu hình Spark
spark_conf = (
    SparkConf()
    .setAppName("MovieRatingSpark")
    .setMaster("local[*]")
)

# Tạo SparkContext
spark_context = SparkContext.getOrCreate(conf=spark_conf)

# Tạo SparkSession
spark_session = (
    SparkSession
    .builder
    .appName("MovieRatingSpark")
    .getOrCreate()
)

if spark_session:
    print("SparkSession created successfully.")
else:
    print("Error: Failed to create SparkSession.")

SparkSession created successfully.


In [23]:
import os

rs_path = "./resource/"

# Đọc file movies và rating{1,2} với RDD
movies_rdd = spark_context.textFile(os.path.join(rs_path, "movies.txt"))
ratings_1_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_1.txt"))
ratings_2_rdd = spark_context.textFile(os.path.join(rs_path, "ratings_2.txt"))
users_rdd = spark_context.textFile(os.path.join(rs_path, "users.txt"))
occupation_rdd = spark_context.textFile(os.path.join(rs_path, "occupation.txt"))


# Show data
if movies_rdd and ratings_1_rdd and ratings_2_rdd:
    print("\nmovies.txt:")
    for line in movies_rdd.take(5):
        print(line)
    print("\nratings_1.txt:")
    for line in ratings_1_rdd.take(5):
        print(line)
    print("\nratings_2.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
    print("\nusers.txt:")
    for line in ratings_2_rdd.take(5):
        print(line)
    print("\noccupation.txt:")
    for line in occupation_rdd.take(5):
        print(line)
else:
    print("Error: Failed to load RDD files.")


movies.txt:
1001,The Godfather (1972),Crime|Drama
1002,The Shawshank Redemption (1994),Drama
1003,Schindler's List (1993),Biography|Drama|History
1004,Raging Bull (1980),Biography|Drama|Sport
1005,Casablanca (1942),Drama|Romance|War

ratings_1.txt:
7,1020,4.5,1577836800
23,1015,3.5,1577923200
45,1030,4.0,1578009600
12,1047,3.0,1578096000
38,1012,4.5,1578182400

ratings_2.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400

users.txt:
12,1012,3.5,1577837800
34,1039,4.0,1577924200
27,1043,4.5,1578010600
8,1020,3.0,1578097000
19,1050,4.0,1578183400

occupation.txt:
1,Programmer
2,Doctor
3,Engineer
4,Teacher
5,Lawyer


In [24]:
# Parse movies
def parse_movie(line):
    parts = line.split(',', 2)
    movie_id = int(parts[0])
    title = parts[1]
    genres = parts[2] if len(parts) > 2 else ""
    genres_list = genres.split('|') if genres else []
    return (movie_id, title)
movies_parse = movies_rdd.map(parse_movie)

print("After parsing:")
for col in movies_parse.take(5):
    print(f"Movie: {col[0]} {col[1]}")


After parsing:
Movie: 1001 The Godfather (1972)
Movie: 1002 The Shawshank Redemption (1994)
Movie: 1003 Schindler's List (1993)
Movie: 1004 Raging Bull (1980)
Movie: 1005 Casablanca (1942)


In [32]:
import datetime
# Parse ratings
def parse_rating(line):
    parts = line.split(',')
    user_id = int(parts[0])
    movie_id = int(parts[1])
    rating = float(parts[2])
    timestamp = int(parts[3])
    try:
        year = datetime.datetime.fromtimestamp(timestamp).year
    except:
        year = 2000  # Default year
    return (year, rating)

ratings_1_parse = ratings_1_rdd.map(parse_rating)
ratings_2_parse = ratings_2_rdd.map(parse_rating)

#show rating 1
print("Rating 1:")
for col in ratings_1_parse.take(5):
    print(f"Rating: {col[1]}, year: {col[0]}")

Rating 1:
Rating: 4.5, year: 2020
Rating: 3.5, year: 2020
Rating: 4.0, year: 2020
Rating: 3.0, year: 2020
Rating: 4.5, year: 2020


In [26]:
def parse_user(line):
    parts = line.split(',', 4)
    user_id = int(parts[0])
    gender = parts[1]
    age = int(parts[2])
    occupation_id = int(parts[3])
    return (user_id, occupation_id)
user_parse = users_rdd.map(parse_user)

#show user
print("User: ")
for col in user_parse.take(5):
    print(f"User: {col[0]} {col[1]}")

User: 
User: 1 3
User: 2 7
User: 3 2
User: 4 10
User: 5 1


In [33]:
# Merge ratings 1 && 2
#  (year, rating)
all_ratings = ratings_1_parse.union(ratings_2_parse)
print(f"After merging: {all_ratings.count()}")

# (year, (rating, 1))
ratings_pair = all_ratings.map(
    lambda x: (x[0], (x[1], 1))
)
# tính sum + count => # (year, (sumRating, count))
ratings_stats = ratings_pair.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)
#(year, (avg, count))
rating_year_avg = ratings_stats.mapValues(
    lambda x: (x[0] / x[1], x[1])
)

def formatter(x):
    return f"{x:.2f}" if x is not None else "NA"

for year, (avg, count) in rating_year_avg.take(10):
    print(
        f"{year} - (TotalRatings: {count}), AverageRating: {formatter(avg)}"
    )

After merging: 184
2020 - (TotalRatings: 184), AverageRating: 3.75


In [None]:
# Clear
spark_context.stop()
spark_session.stop()