In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.config("spark-driver.host", "localhost").config("spark.driver.memory","4g") \
.config("spark.executor.memory","4g").appName("mr").getOrCreate()

In [4]:
spark

In [5]:
#import required libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, avg, count
from pyspark.ml import Pipeline


In [6]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Connect to HDFS") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://10.0.2.15:9000") \
    .getOrCreate()

# Test HDFS access by listing files
hdfs_path = "hdfs:10.0.2.15:9000/user/Group04/Books.csv"
df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
df.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|"If people become...|
|0826414346|Dr. Seuss: Ameri

In [7]:
df = df.na.drop("any")
df = df.dropDuplicates()
df.show(5)

+----------+--------------------+-----+--------------+-------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|        profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+-------------------+------------------+------------+-----------+--------------------+--------------------+
|0595129463|Marshall Hollenze...|12.95|A39OBC2D154CCU| Elizabeth Bookspan|               1/1|         5.0|  987897600|You Won't Be Able...|Marshall Hollenze...|
|0679751254|Lenin's Tomb: The...|12.11|A3PHHV3UJAUP8B| "mobuto ""-----"""|               3/5|         5.0|  984873600|  Soviet Tocqueville|Remnick writes el...|
|0919345476|The Witches' God:...|16.75| ASH2T0XFJFLPQ|       John Culloty|               6/6|         5.0| 1011139200|A Primer of Pagan...|This book provide...|
|0833025147|In Athena's Camp:...|3

In [8]:

# Calculate count of reviews and average review score for  books
popular_books_df = df.groupBy('Id', 'Title') \
    .agg(
        count('review/score').alias('review_count'),
        avg('review/score').alias('average_review_score')
    )

In [9]:
# Define the ID to be removed
target_id = "159335648X"
popular_books_df.filter(df.Id != target_id)

DataFrame[Id: string, Title: string, review_count: bigint, average_review_score: double]

In [10]:
#sort the books in desc order 

sorted_books_df = popular_books_df.orderBy(col('review_count').desc(), col('average_review_score').desc())

In [11]:
sorted_books_df.show(10)

+----------+--------------------+------------+--------------------+
|        Id|               Title|review_count|average_review_score|
+----------+--------------------+------------+--------------------+
|1932100385|The China Study: ...|        1462|   4.581379310344827|
|0307280721|Eldest (Inheritan...|        1274|   3.740916271721959|
|0312857055|Wizard's First Ru...|        1268|  3.8632411067193675|
|0440224675|            Hannibal|        1248|  3.1850362027353176|
|0435126024|Jane Eyre (New Wi...|        1189|   4.550883095037847|
|1847022251|Jane Eyre (Large ...|        1189|   4.550883095037847|
|159335648X|Killing Floor (Ja...|        1130|  4.1066666666666665|
|069452607X|       Good to Great|         993|   4.384146341463414|
|1589269063|The Five Love Lan...|         985|  4.7080366225839265|
|9626346825|A Christmas Carol...|         956|    4.68586387434555|
+----------+--------------------+------------+--------------------+
only showing top 10 rows



In [12]:
sorted_books_pd = sorted_books_df.toPandas()

In [13]:
sorted = sorted_books_pd.head(6)
sorted.head(6)

Unnamed: 0,Id,Title,review_count,average_review_score
0,1932100385,The China Study: The Most Comprehensive Study ...,1462,4.581379
1,307280721,"Eldest (Inheritance, Book 2)",1274,3.740916
2,312857055,"Wizard's First Rule (Sword of Truth, Book 1)",1268,3.863241
3,440224675,Hannibal,1248,3.185036
4,1847022251,Jane Eyre (Large Print),1189,4.550883
5,435126024,Jane Eyre (New Windmill),1189,4.550883


In [16]:
import pandas as pd
books=pd.read_csv('Books.csv')

In [17]:
sorted_books = pd.read_csv('v1.csv')

In [19]:
sorted_books_pd = sorted_books_df.toPandas()
import pickle
# Save as .pkl file
with open("PopularBookRecommendation.pkl", "wb") as f:
    pickle.dump(sorted_books, f)


In [None]:
spark.stop()