In [1]:
# Husayn El Sharif



In [9]:
# imports
from pyspark.sql import SparkSession

from pyspark.sql.functions import regexp_replace, col, max, min, avg, year

import pyspark.sql.functions as F

In [3]:
# Start Session
spark = SparkSession.builder \
    .appName("Read Inside Airbnb data") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/25 17:06:14 WARN Utils: Your hostname, Husayn-SLS2, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/25 17:06:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/25 17:06:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# read data
listings = spark.read.csv("airbnb_data/listings.csv.gz", 
    header=True,
    inferSchema=True,
    sep=",", 
    quote='"',
    escape='"', 
    multiLine=True,
    mode="PERMISSIVE" 
)

# numeric pricing
listings = listings.withColumn("price_numeric", regexp_replace(col("price"), "[$,]", "").cast("double") )

                                                                                

In [13]:
# schema of listings
for field in listings.schema:
    print(field)

StructField('id', LongType(), True)
StructField('listing_url', StringType(), True)
StructField('scrape_id', LongType(), True)
StructField('last_scraped', DateType(), True)
StructField('source', StringType(), True)
StructField('name', StringType(), True)
StructField('description', StringType(), True)
StructField('neighborhood_overview', StringType(), True)
StructField('picture_url', StringType(), True)
StructField('host_id', IntegerType(), True)
StructField('host_url', StringType(), True)
StructField('host_name', StringType(), True)
StructField('host_since', DateType(), True)
StructField('host_location', StringType(), True)
StructField('host_about', StringType(), True)
StructField('host_response_time', StringType(), True)
StructField('host_response_rate', StringType(), True)
StructField('host_acceptance_rate', StringType(), True)
StructField('host_is_superhost', StringType(), True)
StructField('host_thumbnail_url', StringType(), True)
StructField('host_picture_url', StringType(), True)


In [7]:
# Grouping by property type

group_property_type = (
    listings
    .groupBy(listings['property_type'])
)

group_property_type.count().show(truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

+-------------------------------+-----+
|property_type                  |count|
+-------------------------------+-----+
|Entire chalet                  |1    |
|Farm stay                      |2    |
|Entire rental unit             |5801 |
|Shared room in hostel          |3    |
|Private room in condo          |65   |
|Room in boutique hotel         |150  |
|Private room in casa particular|10   |
|Entire cabin                   |5    |
|Entire guesthouse              |218  |
|Private room in bungalow       |5    |
|Entire guest suite             |204  |
|Private room in home           |1071 |
|Entire place                   |23   |
|Camper/RV                      |62   |
|Private room in barn           |1    |
|Castle                         |3    |
|Tiny home                      |30   |
|Entire vacation home           |56   |
|Private room in camper/rv      |3    |
|Lighthouse                     |1    |
+-------------------------------+-----+
only showing top 20 rows


                                                                                

In [10]:
group_property_type = (
    listings
    .groupBy(listings['property_type'])
    .agg( F.count('property_type').alias('count'))
    .orderBy('count', ascending=[False])
).show()

+--------------------+-----+
|       property_type|count|
+--------------------+-----+
|  Entire rental unit| 5801|
|         Entire home| 4359|
|        Entire condo| 2360|
|Private room in home| 1071|
|       Room in hotel|  604|
|        Entire villa|  383|
|Private room in r...|  289|
|Entire serviced a...|  274|
|   Entire guesthouse|  218|
|  Entire guest suite|  204|
|    Entire townhouse|  161|
|Room in boutique ...|  150|
|Private room in r...|  129|
|Private room in c...|   65|
|Private room in g...|   63|
|           Camper/RV|   62|
|  Room in aparthotel|   59|
|Private room in g...|   59|
|Entire vacation home|   56|
|Private room in t...|   54|
+--------------------+-----+
only showing top 20 rows


In [11]:
# Joins

# read reviews data

reviews = spark.read.csv("airbnb_data/reviews.csv.gz", 
    header=True,
    inferSchema=True,
    sep=",", 
    quote='"',
    escape='"', 
    multiLine=True,
    mode="PERMISSIVE" 
)


                                                                                

In [12]:
# check schema of reviews
for field in reviews.schema:
    print(field)

StructField('listing_id', LongType(), True)
StructField('id', LongType(), True)
StructField('date', DateType(), True)
StructField('reviewer_id', IntegerType(), True)
StructField('reviewer_name', StringType(), True)
StructField('comments', StringType(), True)


In [18]:
# join listings and reviews on listing.id = reviews.listing_id

listings_and_reviews = listings.join(
    reviews,
    listings['id'] == reviews['listing_id'],
    how="inner", 
)

In [22]:
listings_and_reviews.show(50, truncate=False)

                                                                                

+------+-----------------------------------+--------------+------------+---------------+------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# reviews per listing
reviews_per_listing = (
    listings_and_reviews
    .groupBy(listings['id'], listings['name'])
    .agg( F.count(reviews.id).alias('num_reviews'))
    .orderBy('num_reviews', ascending=False)
    .show(truncate=False)
)

[Stage 19:>                                                         (0 + 1) / 1]

+------------------+--------------------------------------------------+-----------+
|id                |name                                              |num_reviews|
+------------------+--------------------------------------------------+-----------+
|772480133052161093|At Mine | Beach Suite in Fort Lauderdale          |1939       |
|775420168099754401|At Mine | Double Suite in Fort Lauderdale         |1918       |
|17008487          |King Room - Oceanfront Property                   |1406       |
|47354493          |2 BEDROOM SUITE ✦ Wyndham Palm Aire Resort ✦      |1033       |
|29566676          |#1 FLL Airport/Port Everglades 10-15 mins         |989        |
|40804483          |Cutest Micro-Studio Kitchenette | Pool | parking  |968        |
|20455583          |Private suite in new eco friendly house           |911        |
|27230672          |▪️Relaxing & Spacious Studio with Private Yard▪️  |845        |
|12601797          |Direct Ocean View from the Balcony                |815  

                                                                                

In [25]:
spark.stop()