In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col,
    count,
    countDistinct,
    monotonically_increasing_id,
    row_number,
)

In [2]:
# Create (or reuse) the Spark session for this notebook: local master with
# 4 threads, driver memory set to 15g.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('recommendation_system')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/23 12:25:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the ratings CSV with an explicit schema rather than `inferSchema=True`:
# schema inference needs an additional full pass over the file before the
# actual read. The types below are the ones the file is known to contain
# (userId: integer, rating: double, movieIdx: integer).
# `enforceSchema=False` makes Spark validate the header names against the
# schema by position instead of silently ignoring the header row.
df = spark.read.csv(
    'data/ratings_edited.csv',
    schema='userId INT, rating DOUBLE, movieIdx INT',
    header=True,
    enforceSchema=False,
)

                                                                                

In [4]:
# Print the column names, types and nullability of the loaded ratings.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- movieIdx: integer (nullable = true)



In [5]:
# Shape of `df` as a (row count, column count) tuple.
row_total = df.count()
column_total = len(df.columns)
(row_total, column_total)

                                                                                

(20000263, 3)

### Removing less common movies and users from the dataset

I am going to extract the ratings corresponding to the most common users and movies.
The thresholds I am setting here are N = 2000 for the number of most common users and M = 500 for the number of most common movies.

In [6]:
# N: number of most frequently rating users to keep.
# M: number of most frequently rated movies to keep.
N, M = 2000, 500

In [7]:
def _most_frequent_ids(ratings, id_column, limit):
    """Return the `limit` values of `id_column` that appear in the most rows.

    Rows are counted per distinct value of `id_column` and ranked by that
    count, highest first. Ties on the count are broken by ascending id so
    that the selection at the cut-off is the same on every run (ordering by
    the count alone leaves tied ids in an arbitrary order).

    Args:
        ratings: Spark DataFrame containing `id_column`.
        id_column: Name of the column whose values are ranked.
        limit: Maximum number of ids to return.

    Returns:
        A Python list with at most `limit` id values, collected to the driver.
    """
    top_rows = (
        ratings.groupBy(id_column)
        .agg(count("*").alias("frequency"))
        .orderBy(col("frequency").desc(), col(id_column).asc())
        .select(id_column)
        .head(limit)  # list of Row objects, one field each
    )
    return [row[0] for row in top_rows]


most_common_users = _most_frequent_ids(df, 'userId', N)
most_common_movies = _most_frequent_ids(df, 'movieIdx', M)

                                                                                

In [8]:
# Keep a rating only when both its user and its movie are in the
# most-common lists collected above.
is_common_user = col("userId").isin(most_common_users)
is_common_movie = col("movieIdx").isin(most_common_movies)
df = df.where(is_common_user & is_common_movie)

In [9]:
# Number of distinct movies left after filtering.
distinct_movie_count = df.select(countDistinct('movieIdx'))
distinct_movie_count.collect()

                                                                                

[Row(count(DISTINCT movieIdx)=500)]

In [10]:
# Summarise the remaining user ids: distinct count, smallest and largest id.
user_total = df.select(countDistinct('userId')).collect()[0][0]
lowest_user_id = df.agg({'userId': 'min'}).collect()[0][0]
highest_user_id = df.agg({'userId': 'max'}).collect()[0][0]

print(
    f"The unique number of users is {user_total}\n"
    f"User ID starts from {lowest_user_id}\n"
    f"User ID ends at {highest_user_id}"
)



The unique number of users is 2000
User ID starts from 103
User ID ends at 138324


                                                                                

In [11]:
# Map every remaining original userId to a consecutive index starting at 0,
# assigned in ascending userId order.
#
# `monotonically_increasing_id()` only guarantees unique, increasing values;
# they are not consecutive once the data spans more than one partition, so
# it cannot be relied on for a contiguous 0..K-1 index. `row_number()` over
# an ordered window is consecutive by construction (1-based, hence the -1).
# The un-partitioned window pulls all rows into one partition, which is fine
# here because there is only one row per distinct user. The cast keeps the
# column type a long, as it was before.
user_id_map = (
    df.select('userId')
    .distinct()
    .withColumn(
        "new_userId",
        (row_number().over(Window.orderBy('userId')) - 1).cast('long'),
    )
)
user_id_map.show()



+------+----------+
|userId|new_userId|
+------+----------+
|   103|         0|
|   115|         1|
|   155|         2|
|   207|         3|
|   297|         4|
|   358|         5|
|   393|         6|
|   571|         7|
|   585|         8|
|   613|         9|
|   631|        10|
|   740|        11|
|   767|        12|
|   774|        13|
|   811|        14|
|   902|        15|
|   909|        16|
|   970|        17|
|   981|        18|
|  1199|        19|
+------+----------+
only showing top 20 rows



                                                                                

In [12]:
# Map every remaining movieIdx to a consecutive index starting at 0 (column
# `movieId`), assigned in ascending movieIdx order.
#
# `monotonically_increasing_id()` only guarantees unique, increasing values;
# they are not consecutive once the data spans more than one partition.
# `row_number()` over an ordered window is consecutive by construction
# (1-based, hence the -1). The un-partitioned window is acceptable because
# there is only one row per distinct movie. The cast keeps the column a long.
movie_id_map = (
    df.select('movieIdx')
    .distinct()
    .withColumn(
        "movieId",
        (row_number().over(Window.orderBy('movieIdx')) - 1).cast('long'),
    )
)

In [13]:
# Replace the original user ids with the remapped ones from `user_id_map`.
# Joining on the column *name* gives an unambiguous equi-join with a single
# `userId` key column; the previous `df.userId == user_id_map.userId`
# condition compared two references with the same lineage (the map is derived
# from `df`) and produced duplicate `userId` columns.
df = (
    df.join(user_id_map, on='userId', how='inner')
    .drop('userId')
    .withColumnRenamed('new_userId', 'userId')
)

# Same for movies: attach the remapped `movieId` and drop the original
# `movieIdx` key. Resulting columns: rating, userId, movieId.
df = (
    df.join(movie_id_map, on='movieIdx', how='inner')
    .drop('movieIdx')
)

In [14]:
# Number of ratings left after filtering and id remapping.
df.count()

                                                                                

677529

In [15]:
# Persist the processed ratings as comma-separated CSV with a header row.
# `mode('overwrite')` replaces output left by a previous run; with the
# default save mode the write fails when the path already exists, which
# breaks re-running the notebook from top to bottom.
# NOTE: Spark writes a directory of part files at this path, not one file.
df.write.mode('overwrite').options(header=True, delimiter=',').csv('./data/ratings_processed.csv')

                                                                                

In [16]:
# Shut down the Spark session now that the output has been written.
spark.stop()