In [None]:
# Husayn El Sharif
# Notebook description, stored in a string so it can live in a code cell.
# The `comment` variable is not read by any of the cells visible below.
comment = """
This script demonstrates basic operations using PySpark.
Using AirBnB dataset as an example.
use environment: spark4_env001"""

In [1]:
# imports

from pathlib import Path

import requests
from pyspark.sql import SparkSession # Import SparkSession from PySpark. SQL module

In [4]:
# Download data from insideairbnb.com for Broward County - Florida - USA into the folder "airbnb_data"
urls = [
    "https://data.insideairbnb.com/united-states/fl/broward-county/2025-09-26/data/listings.csv.gz",
    "https://data.insideairbnb.com/united-states/fl/broward-county/2025-09-26/data/calendar.csv.gz",
    "https://data.insideairbnb.com/united-states/fl/broward-county/2025-09-26/data/reviews.csv.gz",
    ]

# Create the target folder up front: open() below would otherwise raise
# FileNotFoundError on a fresh checkout where "airbnb_data" does not exist yet.
data_dir = Path("airbnb_data")
data_dir.mkdir(parents=True, exist_ok=True)

# loop through the URLs and download each file
for url in urls:
    filename = url.split("/")[-1]  # Extract the filename from the URL
    target_path = data_dir / filename
    partial_path = data_dir / f"{filename}.part"  # scratch file used while the download is in flight

    # stream=True avoids holding a whole archive in memory; the (connect, read)
    # timeout keeps a stalled connection from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=(10, 120)) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as a ".gz" file
        response.raise_for_status()
        with open(partial_path, "wb") as file:  # Open a file in write-binary mode
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)  # Write the response body to disk one chunk at a time

    # Publish under the final name only after the full body was written, so an
    # interrupted download never leaves a truncated archive for Spark to read.
    partial_path.replace(target_path)
        

In [5]:
# Start the Spark session for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("AirBnB_Data_Example")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/25 09:21:53 WARN Utils: Your hostname, Husayn-SLS2, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/25 09:21:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/25 09:21:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the gzipped listings CSV into a Spark DataFrame
listings_df = (
    spark.read
    .options(
        header=True,        # first row supplies the column names
        inferSchema=True,   # let Spark infer the column types from the data
        sep=",",
        quote='"',
        escape='"',         # a double quote escapes quotes inside quoted values
        multiLine=True,     # a single record may span several lines
        mode="PERMISSIVE",
    )
    .csv("airbnb_data/listings.csv.gz")
)

In [7]:
# Inspect the Listings schema: print one StructField (name, type, nullable) per line
for column_field in listings_df.schema.fields:
    print(column_field)

StructField('id', LongType(), True)
StructField('listing_url', StringType(), True)
StructField('scrape_id', LongType(), True)
StructField('last_scraped', DateType(), True)
StructField('source', StringType(), True)
StructField('name', StringType(), True)
StructField('description', StringType(), True)
StructField('neighborhood_overview', StringType(), True)
StructField('picture_url', StringType(), True)
StructField('host_id', IntegerType(), True)
StructField('host_url', StringType(), True)
StructField('host_name', StringType(), True)
StructField('host_since', DateType(), True)
StructField('host_location', StringType(), True)
StructField('host_about', StringType(), True)
StructField('host_response_time', StringType(), True)
StructField('host_response_rate', StringType(), True)
StructField('host_acceptance_rate', StringType(), True)
StructField('host_is_superhost', StringType(), True)
StructField('host_thumbnail_url', StringType(), True)
StructField('host_picture_url', StringType(), True)


In [8]:
# Example operation: keep only the neighbourhood_cleansed column
neighbourhoods = listings_df.select(listings_df["neighbourhood_cleansed"])

# Preview the first 25 rows of that single-column DataFrame
neighbourhoods.show(n=25)

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|             Hollywood|
|      Hallandale Beach|
|           Dania Beach|
|             Hollywood|
|       Deerfield Beach|
|             Hollywood|
|             Hollywood|
|           Dania Beach|
|             Hollywood|
|               Miramar|
|             Hollywood|
|       Fort Lauderdale|
|       Fort Lauderdale|
|       Fort Lauderdale|
|             Hollywood|
|       Fort Lauderdale|
|         Pompano Beach|
|             Hollywood|
|               Miramar|
|       Fort Lauderdale|
|       Fort Lauderdale|
|       Fort Lauderdale|
|       Fort Lauderdale|
|       Fort Lauderdale|
|             Hollywood|
+----------------------+
only showing top 25 rows
