### Importing the Goodreads books dataset

In [1]:
// Path of the raw Goodreads books file (JSON input).
val filePath = "bdad/project/dataset/goodreads_books.json"

// Read the JSON file into a DataFrame via the generic reader API and render it in the notebook.
val books_df = spark.read.format("json").load(filePath)
z.show(books_df)

In [2]:
// Print the inferred schema of the raw dataset as an indented tree.
println(books_df.schema.treeString)

In [3]:
// Number of records in the raw dataset (this is a Spark action, so it triggers a job).
books_df.count()

### Data Cleaning

Extracting the required nested fields from the array-of-struct columns (`authors`, `popular_shelves`)

In [6]:
// Pull the `author_id` field out of every entry of the `authors` column into `author_ids`.
val author_books_df = books_df.withColumn(
  "author_ids",
  expr("authors.author_id")
)

// Keep the names of the first five entries of `popular_shelves` as `popular_shelves_genre`.
val author_genre_books_df = author_books_df.withColumn(
  "popular_shelves_genre",
  expr("slice(popular_shelves.name,1,5)")
)

z.show(author_genre_books_df)

Selecting the important columns from the dataset

In [8]:
// Project the enriched frame down to the columns kept for the rest of the notebook.
// The column order below is the order of the resulting schema.
val books_base_df = {
  val keptColumns = Seq(
    "book_id", "title", "title_without_series",
    "average_rating", "description", "num_pages", "ratings_count",
    "author_ids", "publisher", "popular_shelves_genre",
    "is_ebook", "similar_books",
    "country_code", "language_code"
  )
  author_genre_books_df.select(keptColumns.map(name => col(name)): _*)
}
z.show(books_base_df)

Casting several columns to their proper types and filling in default values for columns that have missing values

In [10]:
// Cast the listed columns to their target types.
// Each column is cast exactly once (the original chain cast `num_pages` to int twice).
// Replacing a column with `withColumn` keeps its position, so the schema order is unchanged.
val casted_books_df = Seq(
  "average_rating" -> "double",
  "ratings_count"  -> "int",
  "num_pages"      -> "int",
  "is_ebook"       -> "boolean"
).foldLeft(books_base_df) { case (df, (name, dataType)) =>
  df.withColumn(name, col(name).cast(dataType))
}

// Replace missing values (including values that became null because the cast failed):
// numeric columns default to 0 and `is_ebook` defaults to false.
// NOTE(review): a 0 default for `average_rating` / `num_pages` is indistinguishable from a real 0
// and will pull down later averages — confirm this is intended.
val non_null_books_df = (
  casted_books_df
    .na.fill(0, Array("num_pages", "ratings_count", "average_rating"))
    .na.fill(false, Array("is_ebook"))
)
non_null_books_df.printSchema()

One record after data cleaning

In [12]:
// Print a single cleaned record without truncating long cell values.
non_null_books_df.show(numRows = 1, truncate = false)

### Data Profiling

In [14]:
// Number of books per country code, most frequent first.
// show() prints only the first 20 rows; without an explicit ordering those rows are arbitrary.
non_null_books_df.groupBy("country_code").count().orderBy(col("count").desc).show()

In [15]:
// Number of books per language code, most frequent first.
// show() prints only the first 20 rows; without an explicit ordering those rows are arbitrary.
non_null_books_df.groupBy("language_code").count().orderBy(col("count").desc).show()

In [16]:
// Split of the catalogue between e-books and non-e-books.
non_null_books_df.groupBy(col("is_ebook")).count().show()

In [17]:
// Summary statistics (count, mean, stddev, min, max) for the page count.
non_null_books_df.describe("num_pages").show()

In [18]:
// Average number of entries in `similar_books` per book.
// The alias keeps the same output header as averaging a named helper column.
non_null_books_df.select(avg(size(col("similar_books"))).as("avg(similar_books_length)")).show()

In [19]:
// Average character length of the book descriptions.
// The alias keeps the same output header as averaging a named helper column.
non_null_books_df.select(avg(length(col("description"))).as("avg(description_length)")).show()


In [20]:
// Number of books per shelf name, counted over the shelf names kept in `popular_shelves_genre`.
// explode() yields one row per (book, shelf name) pair.
// Ordering by count puts the most common shelves first; show() prints only the first 20 rows,
// which would otherwise be an arbitrary selection.
non_null_books_df.withColumn("genres", explode(col("popular_shelves_genre"))).groupBy("genres").count().orderBy(col("count").desc).show()

### Saving the cleaned dataset

In [22]:
// Persist the cleaned dataset as Parquet, replacing any output already at that path.
non_null_books_df.write.format("parquet").mode("overwrite").save("bdad/project/dataset/books.parquet")