In [None]:
# Husayn El Sharif
# Local version of spark job to run on S3
# use environment spark4_env001

In [2]:
# Import libraries
# (argparse is not imported here: this local version hard-codes its input/output paths)
# SparkSession → entry point to Spark functionality
# F → alias for pyspark.sql.functions for transformations
from pyspark.sql import SparkSession
import pyspark.sql.functions as F




In [3]:

# ------------------------------
# Start the Spark session
# ------------------------------
# Every read, join and write below goes through this session object.
# getOrCreate() hands back an already-running session if one exists,
# otherwise it builds a new one with the application name given here.
spark = (
    SparkSession.builder
    .appName("Most popular listings")
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/28 07:44:57 WARN Utils: Your hostname, Husayn-SLS2, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/28 07:44:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/28 07:44:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# ------------------------------
# Read the listings CSV
# ------------------------------
# Loads the gzipped listings file into a Spark DataFrame.
# Reader options:
#   header      -> take column names from the first row
#   inferSchema -> let Spark work out each column's data type
#   sep / quote / escape / multiLine -> comma-separated, double-quoted fields,
#       a doubled quote as the escaped quote, quoted values may span lines
#   mode        -> PERMISSIVE, so malformed rows do not abort the read
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("airbnb_data/listings.csv.gz")
)



                                                                                

In [5]:
# ------------------------------
# Read the reviews CSV
# ------------------------------
# Loads the gzipped reviews file into a Spark DataFrame, with the same
# parsing options used for the listings file: header row, inferred types,
# comma separator, double-quote quoting/escaping, multi-line values,
# and PERMISSIVE handling of malformed rows.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("airbnb_data/reviews.csv.gz")
)


                                                                                

In [6]:
# ------------------------------
# Combine listings with their reviews
# ------------------------------
# Rows are paired wherever a listing's id equals a review's listing_id.
# The join is inner, so a listing without any matching review (and a review
# without a matching listing) does not appear in the result.
listings_reviews = listings.join(
    reviews,
    on=(listings.id == reviews.listing_id),
    how="inner",
)


In [7]:
# ------------------------------
# Reviews per listing
# ------------------------------
# One output row per (listing id, listing name) pair.
# num_reviews counts the non-null review ids in each group.
# Rows are sorted so the most-reviewed listings come first.
reviews_per_listing = (
    listings_reviews
    .groupBy(listings.id, listings.name)
    .agg(F.count(reviews.id).alias("num_reviews"))
    .orderBy(F.col("num_reviews").desc())
)



In [8]:
# ------------------------------
# Save Output
# ------------------------------
# Writes the result as CSV files into the given output directory.
# header=True      → include column headers in the output
# mode="overwrite" → replace the directory if it is already there; without a
#                    save mode the write fails when the path exists, which
#                    breaks re-running the notebook from a fresh kernel

reviews_per_listing.write.csv(
    "reviews_per_listing_output",
    header=True,
    mode="overwrite",
)

                                                                                

In [9]:
# Stop the Spark session now that the output has been written.
# No cell after this one can use `spark` without creating a new session.
spark.stop()