### Importing the Goodreads interactions dataset and its user/book ID maps

In [1]:
// Loads one CSV file with the reader settings shared by every input of this
// notebook (header row present, multi-line records allowed, column types
// inferred from the data) and tags the result with `alias` so that its
// columns can be referenced as "<alias>.<column>" in later cells.
def readCsvAs(path: String, alias: String) =
  spark.read
    .options(Map("header" -> "true", "multiline" -> "true", "inferSchema" -> "true"))
    .csv(path)
    .as(alias)

// User id mapping table.
val df_user = readCsvAs("BDAD_Project/user_id_map.csv", "dfUser")

df_user.printSchema()

// Book id mapping table.
val df_book = readCsvAs("BDAD_Project/book_id_map.csv", "dfBook")

df_book.printSchema()

// User/book interaction records.
val df_interactions = readCsvAs("BDAD_Project/goodreads_interactions.csv", "dfInteraction")

df_interactions.printSchema()

In [2]:
// Number of rows in the raw interactions dataset; `count()` is a Spark action,
// so it is written with parentheses.
df_interactions.count()

### Data Cleaning

In [4]:
// Join predicates: an interaction row is paired with the book-map row whose
// book_id_csv equals its book_id, and with the user-map row whose user_id_csv
// equals its user_id.
val bookKeyMatches = col("dfInteraction.book_id") === col("dfBook.book_id_csv")
val userKeyMatches = col("dfInteraction.user_id") === col("dfUser.user_id_csv")

// Both joins are inner joins, so interactions without a match in either
// mapping table are dropped.
val joined_df = df_interactions
  .join(df_book, bookKeyMatches, "inner")
  .join(df_user, userKeyMatches, "inner")

// Projection: ids taken from the mapping tables plus the three interaction fields.
val keptColumns = Seq(
  "dfUser.user_id",
  "dfBook.book_id",
  "dfInteraction.is_read",
  "dfInteraction.rating",
  "dfInteraction.is_reviewed"
).map(name => col(name))

val filtered_df = joined_df.select(keptColumns: _*)

filtered_df.printSchema()

In [5]:
// Column groups for the two null-filling passes below.
val idColumns = Array("user_id", "book_id")
val interactionColumns = Array("is_read", "rating", "is_reviewed")

// Cleaning steps, in order:
//   1. cast book_id to a string column,
//   2. replace nulls in the id columns with the empty string,
//   3. replace nulls in is_read / rating / is_reviewed with 0.
val finalInteraction_df = filtered_df
  .withColumn("book_id", col("book_id").cast("string"))
  .na.fill("", idColumns)
  .na.fill(0, interactionColumns)

finalInteraction_df.printSchema()

One record after data cleaning

In [7]:
// Print a single cleaned row without truncating long cell values.
finalInteraction_df.show(numRows = 1, truncate = false)

### Data Profiling

Number of Ratings given by each user

In [10]:
// Rows whose rating is 0 are excluded; the remaining rows are counted per user_id.
val userRatingCount = finalInteraction_df
  .filter(col("rating").notEqual(0))
  .groupBy("user_id")
  .count()
z.show(userRatingCount)

Number of Ratings received by each book

In [12]:
// Rows whose rating is 0 are excluded; the remaining rows are counted per book_id.
val bookRatingCount = finalInteraction_df
  .filter(col("rating").notEqual(0))
  .groupBy("book_id")
  .count()
z.show(bookRatingCount)


Average Rating of each book

In [14]:
// Mean of the non-zero ratings for each book_id. The aggregate column keeps
// Spark's default name, "avg(rating)", which the next cell reads by that name.
val bookAvgRating = finalInteraction_df
  .filter(col("rating").notEqual(0))
  .groupBy("book_id")
  .avg("rating")
z.show(bookAvgRating)

Number of Books for different Ratings

In [16]:
// Buckets every book by the floor of its average rating (read from the
// "avg(rating)" column of bookAvgRating) and counts the books in each bucket.
// Declared as `val` rather than `var`: the binding is never reassigned.
val ratingBookCount = bookAvgRating
  .withColumn("rating", floor(col("avg(rating)")))
  .groupBy("rating")
  .count()
z.show(ratingBookCount)

Number of Books read by a user

In [18]:
// Only rows with is_read equal to 1 are kept; they are then counted per user_id.
val userBookCount = finalInteraction_df
  .filter(col("is_read").equalTo(1))
  .groupBy("user_id")
  .count()
z.show(userBookCount)

### Saving the cleaned dataset

In [20]:
// Persist the cleaned interactions as Parquet; "overwrite" mode replaces any
// output that already exists at this path.
finalInteraction_df
  .write
  .mode("overwrite")
  .parquet("BDAD_Project/interactions.parquet")