In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col,
    count,
    countDistinct,
    monotonically_increasing_id,
    row_number,
)

In [3]:
# Create (or reuse) the Spark session for this notebook: local master "local[4]",
# application name 'recommendation_system', driver memory set to 15g.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('recommendation_system')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/25 12:54:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the ratings CSV; the first line is a header and column types are inferred by Spark.
df = spark.read.csv(
    'data/ratings_edited.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [5]:
# Show the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- movieIdx: integer (nullable = true)



In [6]:
# Shape of the raw ratings table as (rows, columns).
n_rows = df.count()
n_cols = len(df.columns)
(n_rows, n_cols)

                                                                                

(20000263, 3)

### Removing less common movies and users from the dataset

I am going to keep only the ratings that belong to the most common users and the most common movies, i.e. the ones with the largest number of ratings.
The thresholds are set as follows: the number of most common users to keep is N = 10000, and the number of most common movies to keep is M = 2000.

In [7]:
# Thresholds for the subset: keep the N most common users and the M most common movies.
N = 10_000
M = 2_000

In [8]:
def _top_ids(frame, key, limit):
    """Return the `limit` most frequent values of column `key` in `frame`.

    Rows are counted per distinct value of `key` and ranked by that count in
    descending order. Ties in the count are broken by `key` ascending, so the
    cut at `limit` selects the same values on every run.

    Returns a list of single-field Row objects (the result of `head(limit)`),
    collected to the driver.
    """
    ranked = (
        frame.groupBy(key)
        .agg(count("*").alias("frequency"))
        .orderBy(col("frequency").desc(), col(key).asc())
        .select(key)
    )
    return ranked.head(limit)


# NOTE: despite the `_df` suffix these are Python lists of Rows, not DataFrames;
# the names are kept unchanged for compatibility with any cell that uses them.
most_common_users_df = _top_ids(df, 'userId', N)
most_common_movies_df = _top_ids(df, 'movieIdx', M)

# Unwrap each Row into the bare id so the lists can be passed to `isin`.
most_common_users = [row[0] for row in most_common_users_df]
most_common_movies = [row[0] for row in most_common_movies_df]

                                                                                

In [9]:
# Keep a rating only when both its user and its movie are among the retained ids.
is_common_user = col("userId").isin(most_common_users)
is_common_movie = col("movieIdx").isin(most_common_movies)
df = df.filter(is_common_user & is_common_movie)

In [10]:
# Count how many distinct movieIdx values remain in the filtered ratings.
df.select(countDistinct('movieIdx')).collect()

                                                                                

[Row(count(DISTINCT movieIdx)=2000)]

In [11]:
# Compute the distinct-user count and the userId range in a single aggregation,
# so one Spark job is run instead of three separate ones.
user_stats = df.selectExpr(
    "count(DISTINCT userId)",
    "min(userId)",
    "max(userId)",
).first()

print(f"The unique number of users is {user_stats[0]}\n"
      f"User ID starts from {user_stats[1]}\n"
      f"User ID ends at {user_stats[2]}"
)



The unique number of users is 10000
User ID starts from 10
User ID ends at 138473


                                                                                

In [12]:
# Build a lookup from each original userId to a dense 0-based index (new_userId).
#
# monotonically_increasing_id() is not used here: it only guarantees unique,
# increasing values, not consecutive ones (the partition id is encoded in the
# upper bits), so the ids are 0..n-1 only if all rows happen to sit in a single
# partition. row_number() over an ordered window always yields 1..n, hence the "- 1".
# The window has no partitioning, so Spark moves all rows to one partition; this
# is acceptable because the frame holds one row per distinct user.
user_index = row_number().over(Window.orderBy('userId')) - 1
user_id_map = (
    df.select('userId')
    .distinct()
    .withColumn("new_userId", user_index.cast('long'))  # keep the bigint type of the old ids
    .sort('userId')
)
user_id_map.show()



+------+----------+
|userId|new_userId|
+------+----------+
|    10|         0|
|    23|         1|
|    53|         2|
|    57|         3|
|    90|         4|
|    95|         5|
|   103|         6|
|   115|         7|
|   130|         8|
|   133|         9|
|   155|        10|
|   207|        11|
|   246|        12|
|   250|        13|
|   257|        14|
|   270|        15|
|   277|        16|
|   293|        17|
|   297|        18|
|   317|        19|
+------+----------+
only showing top 20 rows



                                                                                

In [13]:
# Build a lookup from each original movieIdx to a dense 0-based index (movieId).
#
# monotonically_increasing_id() is not used here: it only guarantees unique,
# increasing values, not consecutive ones (the partition id is encoded in the
# upper bits). row_number() over an ordered window always yields 1..n, hence the "- 1".
# The window has no partitioning, so Spark moves all rows to one partition; this
# is acceptable because the frame holds one row per distinct movie.
movie_index = row_number().over(Window.orderBy('movieIdx')) - 1
movie_id_map = (
    df.select('movieIdx')
    .distinct()
    .withColumn("movieId", movie_index.cast('long'))  # keep the bigint type of the old ids
    .sort('movieIdx')
)

In [14]:
# Replace the original userId with its dense index from user_id_map.
# The join is expressed by column *name* rather than `df.userId == user_id_map.userId`:
# user_id_map is derived from df, so both column references can resolve to the same
# underlying attribute, which makes the equality an ambiguous self-join condition.
# Joining on the name also leaves a single userId column to drop.
df = (
    df.join(user_id_map, on='userId', how='inner')
    .drop('userId')
    .withColumnRenamed('new_userId', 'userId')
)

# Same for movies: swap movieIdx for the dense movieId from movie_id_map.
df = (
    df.join(movie_id_map, on='movieIdx', how='inner')
    .drop('movieIdx')
)

In [15]:
# Number of rating rows remaining after the filtering and id re-mapping joins.
df.count()

                                                                                

5392234

In [16]:
# df.toPandas().to_csv('./data/ratings_preprocessed.csv')

                                                                                

In [15]:
# Persist the processed ratings as comma-separated CSV with a header row.
# Spark writes a directory of part files at this path rather than a single file.
# mode('overwrite') replaces any output from a previous run; with the default
# save mode the write raises an error when the path already exists, which makes
# the notebook fail on re-run.
(
    df.write
    .mode('overwrite')
    .options(header=True, delimiter=',')
    .csv('./data/ratings_processed.csv')
)

                                                                                

In [16]:
# Stop the Spark session created at the top of the notebook.
spark.stop()