In [0]:
// Load each project dataset from Parquet and mark it for caching
// (persist() with no arguments is what cache() delegates to).
val interactionsDF = spark.read.parquet("BDAD_Project/interactions.parquet").persist()
val authorsDF = spark.read.parquet("BDAD_Project/goodreads_authors_3.parquet").persist()
val booksDF = spark.read.parquet("BDAD_Project/books_genres.parquet").persist()
val reviewsDF = spark.read.parquet("BDAD_Project/reviews.parquet").persist()
val reviewSentimentDF = spark.read.parquet("BDAD_Project/reviewsSentiment.parquet").persist()

// Print the schemas in the same order as before: authors, books, reviews,
// interactions, review sentiment.
Seq(authorsDF, booksDF, reviewsDF, interactionsDF, reviewSentimentDF).foreach(_.printSchema())

### Most popular genres

In [2]:
// One row per (book, genre) pair, then the number of books per genre,
// most frequent genre first.
var bookGenres = booksDF
  .select(explode(col("genres")).as("all_genres"))
  .groupBy("all_genres")
  .count()
  .sort(col("count").desc)
z.show(bookGenres, 10)

### Top K books

In [4]:
// Minimum number of ratings a book needs before it is ranked; reused by later cells.
var ratingsCountThreshold = 500

// Five best-rated books among those with more than ratingsCountThreshold ratings.
booksDF
  .where(col("ratings_count") > ratingsCountThreshold)
  .select("book_id", "title", "average_rating", "ratings_count")
  .sort(col("average_rating").desc)
  .show(5, false)

### Top K books by genre

In [6]:
// Distinct genre values across all books, collected to the driver as single-column rows.
val allGenres = booksDF
  .select(explode(col("genres")).as("genres_1"))
  .distinct()
  .collect()

In [7]:
// For every distinct genre: print the genre row, then show its five best-rated
// books among those with more than ratingsCountThreshold ratings.
allGenres.foreach { genre =>
    println(genre)
    booksDF
        .select("book_id", "title", "average_rating", "ratings_count", "genres")
        .where(array_contains(col("genres"), genre(0)) && (col("ratings_count") > ratingsCountThreshold))
        .sort(col("average_rating").desc)
        .show(5, false)
}

### Top rated authors 

In [9]:
// Despite its name, this holds the mean of ratings_count over all authors;
// it serves as the popularity cut-off below.
val averageAuthorRating = authorsDF.agg(avg(col("ratings_count")))
// Single aggregate value pulled to the driver (kept untyped, as before).
val average_ratings = averageAuthorRating.first().get(0)

// Authors with an above-average number of ratings, best average rating first.
val topAuthors = authorsDF
  .where(col("ratings_count") > average_ratings)
  .select("author_id", "name", "average_rating", "ratings_count")
  .sort(col("average_rating").desc)
z.show(topAuthors, 5)

### Recommendations for a user based on books they have read

In [11]:
// Parameters shared by the per-user recommendation cells.
var sampleUserId = "8842281e1d1347389f2ab93d60773d4d"
var userRatingThres = 4.5;

// All reviews written by the sample user; cached because several cells reuse it.
var userDF = reviewsDF.where(col("user_id") === sampleUserId).cache()
// Ids of the books the user rated above the threshold.
var userTopBooks = userDF
  .where(col("rating") > userRatingThres)
  .select("book_id")
  .collect.toList.map(x => x(0))
// Catalogue rows (including the similar_books list) for those liked books.
var userSimilarBooksDF = booksDF
  .filter(col("book_id").isin(userTopBooks:_*))
  .select("book_id", "title", "average_rating", "ratings_count", "similar_books")
  .cache()

userSimilarBooksDF.orderBy(desc("average_rating")).show(5, false)

// Distinct ids listed as similar to any of the liked books.
var userSimilarBooks = userSimilarBooksDF
  .select(explode(col("similar_books")).as("similar_book_ids"))
  .distinct()
  .collect.toList.map(x => x(0))

// Build the recommendation DataFrame first and display it afterwards: show()
// returns Unit, so chaining it into the definition left userBookRecDF unusable.
// The left_anti join removes books the user has already reviewed, so a
// "recommendation" is never something they have already read.
val userBookRecDF = booksDF
  .filter(col("book_id").isin(userSimilarBooks:_*))
  .filter(col("ratings_count") > ratingsCountThreshold)
  .join(userDF.select("book_id").distinct(), Seq("book_id"), "left_anti")
  .orderBy(desc("average_rating"))
  .select("book_id", "title", "average_rating", "ratings_count")
userBookRecDF.show(5, false)

### Recommendation to user based on similar users

In [13]:
// var sampleUserId = "8842281e1d1347389f2ab93d60773d4d"
// var userRatingThres = 3.5;

// var userDF = reviewsDF.where(col("user_id") === sampleUserId).cache()
// Ids of the books the sample user rated above the threshold.
var userTopBooks = userDF
  .where(col("rating") > userRatingThres)
  .select("book_id")
  .collect.toList.map(x => x(0))
// "Similar" users: anyone who also rated one of those books above the threshold.
var similarUsers = reviewsDF
  .filter(col("book_id").isin(userTopBooks:_*))
  .filter(col("rating") > userRatingThres)
  .select("user_id")
  .distinct()
  .collect.toList.map(x => x(0))

// Every book the similar users rated above the threshold ...
var similarUsersBooks = reviewsDF
  .filter(col("user_id").isin(similarUsers:_*))
  .filter(col("rating") > userRatingThres)
  .select("book_id")
  .distinct()
  .collect.toList.map(x => x(0))
// ... minus the ones the sample user already rated that highly.
var similarUserBooksFiltered = similarUsersBooks diff userTopBooks

// Keep the DataFrame and the display separate: show() returns Unit, so chaining
// it into the definition bound similarUsersBookRecDF to () instead of the result.
val similarUsersBookRecDF = booksDF
  .filter(col("book_id").isin(similarUserBooksFiltered:_*))
  .filter(booksDF("ratings_count") > ratingsCountThreshold)
  .orderBy(desc("average_rating"))
  .select("book_id", "title", "average_rating", "ratings_count")
similarUsersBookRecDF.show(10, false)

### Recommendation to user based on their liked authors


In [15]:
// var sampleUserId = "8842281e1d1347389f2ab93d60773d4d"
// var userRatingThres = 3.5;

//val userDF = reviewsDF.where(col("user_id") === sampleUserId).cache()

// Ids of the books the sample user rated above the threshold.
val userTopBookIds = userDF
  .where(col("rating") > userRatingThres)
  .select("book_id")
  .collect.toList.map(x => x(0))
// Catalogue rows (with author_ids) for those liked books.
val userTopBooks = booksDF
  .filter(col("book_id").isin(userTopBookIds:_*))
  .select("book_id", "title", "average_rating", "ratings_count", "author_ids")
  .cache()

// Distinct authors of the liked books.
val userTopAuthors = userTopBooks
  .select(explode(col("author_ids")).as("authors"))
  .distinct()
  .collect.toList.map(x => x(0))


// One row per (book, author) for every book written by a liked author.
// The left_anti join drops books the user has already reviewed; without it the
// candidate set always contains the very books the liked authors were derived from.
var recByAuthors = booksDF
  .select("book_id", "title", "average_rating", "ratings_count", "author_ids")
  .withColumn("authors", explode(col("author_ids")))
  .filter(col("authors").isin(userTopAuthors:_*))
  .join(userDF.select("book_id").distinct(), Seq("book_id"), "left_anti")
  .select("authors", "book_id", "title", "average_rating", "ratings_count")
// Attach the author name, keep sufficiently-rated books, best average rating first.
var recByAuthorsDF = recByAuthors
  .join(authorsDF.select("name", "author_id"), recByAuthors("authors") === authorsDF("author_id"), "left_outer")
  .drop(recByAuthors("authors"))
  .filter(recByAuthors("ratings_count") > ratingsCountThreshold)
  .orderBy(desc("average_rating"))
  .select("name", "title", "average_rating", "ratings_count")

z.show(recByAuthorsDF, 5)


### Recommendation to user based on review sentiments


In [17]:
// var sampleUserId = "8842281e1d1347389f2ab93d60773d4d"
// var ratingsCountThreshold = 4

// The sample user's reviews that carry a sentiment score.
var userSentimentDF = reviewSentimentDF.where(col("user_id") === sampleUserId).cache()
// Up to ten highly-rated books, taken from the reviews with the highest sentiment score.
var userSentimentTopBooks = userSentimentDF
  .orderBy(desc("score"))
  .where(col("rating") > userRatingThres)
  .select("book_id")
  .limit(10)
  .collect.toList.map(x => x(0))
// Catalogue rows (including the similar_books list) for those books.
var userSentimentSimilarBooksDF = booksDF
  .filter(col("book_id").isin(userSentimentTopBooks:_*))
  .select("book_id", "title", "average_rating", "ratings_count", "similar_books")
  .cache()

userSentimentSimilarBooksDF.orderBy(desc("average_rating")).show(5, false)

// Distinct ids listed as similar to any of those books.
var userSentimentSimilarBooks = userSentimentSimilarBooksDF
  .select(explode(col("similar_books")).as("similar_book_ids"))
  .distinct()
  .collect.toList.map(x => x(0))

// Build the DataFrame first and display it afterwards: show() returns Unit, so
// chaining it into the definition left userSentimentBookRecDF bound to ().
// The left_anti join removes every book the user has already reviewed.
val userSentimentBookRecDF = booksDF
  .filter(col("book_id").isin(userSentimentSimilarBooks:_*))
  .filter(col("ratings_count") > ratingsCountThreshold)
  .join(reviewsDF.where(col("user_id") === sampleUserId).select("book_id").distinct(), Seq("book_id"), "left_anti")
  .orderBy(desc("average_rating"))
  .select("book_id", "title", "average_rating", "ratings_count")
userSentimentBookRecDF.show(5, false)

### Most popular genre for a user

In [19]:
// var sampleUserId = "8842281e1d1347389f2ab93d60773d4d"
// var userRatingThres = 3.5;

//val userDF = reviewsDF.where(col("user_id") === sampleUserId).cache()
// Ids of the books the sample user rated above the threshold.
val userTopBookIds = userDF
  .where(col("rating") > userRatingThres)
  .select("book_id")
  .collect.toList.map(_.get(0))
// Genre frequency over those books, most frequent genre first.
val userTopBooks = booksDF
  .where(col("book_id").isin(userTopBookIds:_*))
  .select(explode(col("genres")).as("all_genres"))
  .groupBy("all_genres")
  .count()
  .sort(col("count").desc)
z.show(userTopBooks, 10)

### Sentiment Analysis Score

In [21]:
// val AFINN = sc.textFile("BDAD_Project/AFINN-111.txt").map(x=> x.split("\t")).map(x=>(x(0).toString,x(1).toInt))
// val tempRDD = reviewsDF.select(col("review_id"), col("review_text")).rdd.map(a => (a.getString(0), a.getString(1).split(" "))).flatMapValues(identity[Array[String]]).map(a => (a._2.filter(_ >= ' '), a._1))
// val temp1RDD = tempRDD.join(AFINN).map(a => a._2).map(a => (a._1, (a._2, 1))).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).map(a => (a._1, a._2._1.toFloat / a._2._2)).toDF().withColumnRenamed("_1", "reviewID").withColumnRenamed("_2", "score")
// val reviewSentimentDF = reviewsDF.join(temp1RDD,reviewsDF("review_id") ===  temp1RDD("reviewID"),"inner").drop(col("reviewID"))
// reviewSentimentDF.write.mode("overwrite").parquet("BDAD_Project/reviewsSentiment.parquet")

import org.apache.spark.sql.functions.{lower, regexp_replace}

// Sentiment lexicon: each line is "<term>\t<integer score>".
val AFINN = sc.textFile("bdad/project/AFINN-111.txt")
  .map(x => x.split("\t"))
  .map(x => (x(0).toString, x(1).toInt))
  .toDF("word", "score")

// One row per review token. Tokens are lower-cased, split on any whitespace run
// and stripped of leading/trailing punctuation before the lookup; the AFINN-111
// terms are lower-case, so "Great" or "good." previously never matched and were
// silently left out of the average.
var temp = reviewsDF
  .withColumn("reviewText", explode(split(lower(col("review_text")), "\\s+")))
  .withColumn("reviewText", regexp_replace(col("reviewText"), "^[^a-z0-9]+|[^a-z0-9]+$", ""))

// Mean lexicon score of the matched tokens per review. The key is renamed so the
// join back to reviewsDF (which temp1 is derived from) compares two distinct columns.
var temp1 = temp
  .join(AFINN, temp("reviewText") === AFINN("word"), "inner")
  .drop("word")
  .groupBy(col("review_id"))
  .agg(avg("score").as("score"))
  .withColumnRenamed("review_id", "reviewID")

// Reviews with no lexicon match get no score and drop out of this inner join.
var temp2 = reviewsDF
  .join(temp1, reviewsDF("review_id") === temp1("reviewID"))
  .drop("reviewID")

// NOTE(review): this overwrites the path that reviewSentimentDF was read from in
// the first cell; confirm that DataFrame is reloaded after this cell runs.
temp2.write.mode("overwrite").parquet("BDAD_Project/reviewsSentiment.parquet")

In [22]:
// val reviewSentimentDF = spark.read.parquet("BDAD_Project/reviewsSentiment.parquet")
// z.show(reviewSentimentDF.groupBy("book_id").count.orderBy(desc("count")))

// Book inspected by the review-analysis cells below.
val bookId = 11870085
// reviewSentimentDF.filter(col("book_id") === bookId).count
// Show that book's catalogue row.
z.show(booksDF.where(col("book_id") === bookId))


### Top 3 most-voted positive and negative sentiment reviews

In [24]:
// z.show(reviewSentimentDF.filter(col("book_id") === bookId).select(col("review_text"), col("n_votes")).orderBy(desc("n_votes")).select(col("review_text")).withColumnRenamed("review_text", "Top 3").limit(3))
// z.show(reviewSentimentDF.filter(col("book_id") === bookId).select(col("review_text"), col("n_votes")).orderBy(asc("n_votes")).select(col("review_text")).withColumnRenamed("review_text", "Bottom 3").limit(3))

// Three most-voted reviews of the book with a positive sentiment score.
z.show(
  reviewSentimentDF
    .where((col("book_id") === bookId) && (col("score") > 0))
    .select(col("review_text").as("Top voted positive sentiment review"), col("n_votes"), col("score"))
    .sort(col("n_votes").desc)
    .limit(3))
// Three most-voted reviews of the book with a negative sentiment score.
z.show(
  reviewSentimentDF
    .where((col("book_id") === bookId) && (col("score") < 0))
    .select(col("review_text").as("Top voted negative sentiment review"), col("n_votes"), col("score"))
    .sort(col("n_votes").desc)
    .limit(3))

### Top and Bottom 3 reviews by sentiment scores

In [26]:
// Text of the three reviews of the book with the highest sentiment score.
z.show(
  reviewSentimentDF
    .where(col("book_id") === bookId)
    .sort(col("score").desc)
    .select(col("review_text").as("Top 3"))
    .limit(3))
// Text of the three reviews of the book with the lowest sentiment score.
z.show(
  reviewSentimentDF
    .where(col("book_id") === bookId)
    .sort(col("score").asc)
    .select(col("review_text").as("Bottom 3"))
    .limit(3))

### Fraction of positive and negative reviews by sentiment scores

In [28]:
// Number of the book's reviews with a negative (t1) and a positive (t2) sentiment score.
// Reviews whose score is exactly 0 are counted in neither.
var t1 = reviewSentimentDF.where((col("book_id") === bookId) && (col("score") < 0)).count()
var t2 = reviewSentimentDF.where((col("book_id") === bookId) && (col("score") > 0)).count()

// Share of positive / negative reviews among the non-neutral ones.
// Both are NaN when t1 + t2 is 0, because the division is done in floating point.
var positive_review = t2.toFloat / (t1 + t2)
var negative_review = t1.toFloat / (t1 + t2)