In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.config("spark-driver.host", "localhost").config("spark.driver.memory","4g") \
.config("spark.executor.memory","4g").appName("mr").getOrCreate()

In [4]:
spark

In [5]:
#import required libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, avg, count
from pyspark.ml import Pipeline


In [6]:
spark = SparkSession.builder \
    .appName("Connect to HDFS") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://10.0.2.15:9000") \
    .getOrCreate()

# Test HDFS access by listing files
hdfs_path = "hdfs:10.0.2.15:9000/user/Group04/Books.csv"
df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
df.show(5)

df.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|"If people become...|
|0826414346|Dr. Seuss: Ameri

In [7]:
df = df.na.drop("any")
df = df.dropDuplicates()
df.show(5)

+----------+--------------------+-----+--------------+-------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|        profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+-------------------+------------------+------------+-----------+--------------------+--------------------+
|0595129463|Marshall Hollenze...|12.95|A39OBC2D154CCU| Elizabeth Bookspan|               1/1|         5.0|  987897600|You Won't Be Able...|Marshall Hollenze...|
|0679751254|Lenin's Tomb: The...|12.11|A3PHHV3UJAUP8B| "mobuto ""-----"""|               3/5|         5.0|  984873600|  Soviet Tocqueville|Remnick writes el...|
|0919345476|The Witches' God:...|16.75| ASH2T0XFJFLPQ|       John Culloty|               6/6|         5.0| 1011139200|A Primer of Pagan...|This book provide...|
|0833025147|In Athena's Camp:...|3

In [8]:
df.columns

['Id',
 'Title',
 'Price',
 'User_id',
 'profileName',
 'review/helpfulness',
 'review/score',
 'review/time',
 'review/summary',
 'review/text']

In [9]:
from pyspark.sql.functions import col, lower, row_number
from pyspark.sql.window import Window

# Filter books where review/summary or review/text contains the word "Christmas"
filtered_books = df.filter(
    (lower(col("review/summary")).contains("christmas")) | 
    (lower(col("review/text")).contains("christmas"))
)

# Group by Title and calculate review count and average review score
popular_books_with_christmas = filtered_books.groupBy("Title", "Id") \
    .agg(
        count("review/score").alias("review_count"),
        avg("review/score").alias("average_review_score")
    )

# Define a window partitioned by Title to ensure unique titles
window_spec = Window.partitionBy("Title").orderBy(
    col("review_count").desc(), col("average_review_score").desc()
)

# Add a row number within each partition (Title) to ensure uniqueness
unique_books = popular_books_with_christmas.withColumn("row_number", row_number().over(window_spec)) \
    .filter(col("row_number") == 1)  # Keep only the first record for each title

# Order the final results by review count and average review score to get the top 6 books
top_6_unique_books = unique_books.orderBy(
    col("review_count").desc(), col("average_review_score").desc()
).limit(6)

# Show the top 6 unique books
top_6_unique_books.show(truncate=False)


+-----------------------------------------------------------------------------------------------------+----------+------------+--------------------+----------+
|Title                                                                                                |Id        |review_count|average_review_score|row_number|
+-----------------------------------------------------------------------------------------------------+----------+------------+--------------------+----------+
|A Christmas Carol, in Prose: Being a Ghost Story of Christmas (Collected Works of Charles Dickens)   |0742623157|593         |4.758853288364249   |1         |
|A Christmas Carol (Classic Fiction)                                                                  |9626346825|593         |4.758853288364249   |1         |
|The Night Before Christmas, The (Wee Books for Wee Folk)                                             |1557094101|181         |4.541436464088398   |1         |
|The Night Before Christmas Pop-up      

In [10]:
# Convert Spark DataFrame to Pandas DataFrame
pandas_df = top_6_unique_books.toPandas()

# Display the Pandas DataFrame
(pandas_df.head(6))


Unnamed: 0,Title,Id,review_count,average_review_score,row_number
0,"A Christmas Carol, in Prose: Being a Ghost Sto...",742623157,593,4.758853,1
1,A Christmas Carol (Classic Fiction),9626346825,593,4.758853,1
2,"The Night Before Christmas, The (Wee Books for...",1557094101,181,4.541436,1
3,The Night Before Christmas Pop-up,689838999,52,4.884615,1
4,A First Book of Christmas Songs: 20 Favorite S...,486297187,36,4.0,1
5,Hogfather (Discworld),753105209,31,4.516129,1


In [11]:
# Drop the specified columns
pandas_df = pandas_df.drop(['review_count', 'row_number'], axis=1)

# Save the updated DataFrame to a CSV file
output_file_path = "filtered_books.csv"
pandas_df.to_csv(output_file_path, index=False)

print(f"CSV file has been saved as {output_file_path}")

CSV file has been saved as filtered_books.csv


In [12]:
# Display the filtered books DataFrame
filtered_books.show(truncate=False)

+----------+-------------------------------------------------------------+-----+--------------+------------------------------+------------------+------------+-----------+---------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
import pandas as pd
sorted =pd.read_csv('v2.csv')

In [14]:
sorted

Unnamed: 0,Title,Id,average_review_score,image,authors
0,"A Christmas Carol, in Prose: Being a Ghost Sto...",742623157,4.758853,https://m.media-amazon.com/images/I/61WkuG9yCO...,['Charles Dickens']
1,A Christmas Carol (Classic Fiction),9626346825,4.758853,https://covers.shakespeareandcompany.com/97801...,['Charles Dickens']
2,"The Night Before Christmas, The (Wee Books for...",1557094101,4.541436,http://books.google.com/books/content?id=9MsbD...,['Clement Clarke Moore']
3,The Night Before Christmas Pop-up,689838999,4.884615,http://books.google.com/books/content?id=wKLEs...,['Matthew Reinhart']
4,A First Book of Christmas Songs: 20 Favorite S...,486297187,4.88654,http://books.google.com/books/content?id=g4-zt...,['Bergerac']
5,Hogfather (Discworld),753105209,4.516129,http://books.google.com/books/content?id=CeslE...,['Terry Pratchett']


In [15]:
sorted.columns

Index(['Title', 'Id', 'average_review_score', 'image', 'authors'], dtype='object')

In [16]:
import pickle
# Save as .pkl file
with open("PopularChristmasBooks.pkl", "wb") as f:
    pickle.dump(sorted, f)


In [None]:
spark.stop()