In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one is already running) the Spark session that the
# rest of the notebook reads data through.
spark = (
    SparkSession.builder
    .appName("Read Inside Airbnb data")
    .getOrCreate()
)

In [2]:
# Load the gzipped listings export into a DataFrame.
# The first row is the header and column types are inferred from the data.
# Fields are comma-separated and double-quoted; the quote character is also
# used as the escape character, and multiLine lets a quoted field span
# several physical lines.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)

In [3]:
# Print every column of `listings` with its data type and nullability.
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [6]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
# Keep only the URL column, discard rows where it is null, and print a
# single surviving value without truncating it.
(
    listings
    .select(listings["picture_url"])
    .na.drop()
    .show(n=1, truncate=False)
)

+----------------------------------------------------------------------------------------------------------+
|picture_url                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
+----------------------------------------------------------------------------------------------------------+
only showing top 1 row


In [9]:
# 2. Get number of properties that get more than 10 reviews per month
# Rows where reviews_per_month is null do not satisfy the comparison and are
# therefore not counted.
prop_more_than_10_rpm = (
    listings
    .where(listings["reviews_per_month"] > 10)
    .count()
)
print(prop_more_than_10_rpm)


57


In [13]:
# 3. Get properties that have more bathrooms than bedrooms
# Compare the two columns row by row, then show a handful of identifying
# fields for the matching listings.
(
    listings
    .where(listings["bathrooms"] > listings["bedrooms"])
    .select('id', 'price', 'name', 'bathrooms', 'bedrooms')
    .show()
)

+------+-------+--------------------+---------+--------+
|    id|  price|                name|bathrooms|bedrooms|
+------+-------+--------------------+---------+--------+
|266037| $62.00|Central London wi...|      1.5|       1|
|268398| $66.00|Also five minutes...|      1.5|       1|
| 24328|$213.00|Battersea live/wo...|      1.5|       1|
|432841|$104.00|Large double bedr...|      1.5|       1|
|433867| $52.00|Bedroom In Great ...|      1.5|       1|
|283569| $94.00|Spacious luxury 2...|      1.5|       1|
|437722|$500.00|Very Central! Bay...|      2.0|       1|
|442457| $40.00|Room in London wi...|      1.5|       1|
|445346| $76.00|Stunning large ro...|      1.5|       1|
| 56229| $78.00|Cosy Double studi...|      1.5|       1|
|449399| $63.00|Cosy single room ...|      1.5|       1|
| 81951|$190.00|LONDON DETACHED H...|      2.0|       1|
| 84223| $67.00|Designer room Par...|      1.5|       1|
| 84532| $42.00|Cosy Double room ...|      1.5|       1|
| 85191| $75.00|Maisonette in C

In [19]:
# 4. Get properties where the price is greater than 5,000. Collect the result as a Python list
# Remember to convert a price into a number first!
from pyspark.sql.functions import regexp_replace

# "price" is a string such as "$8,000.00". Strip the currency symbol and the
# thousands separators, then cast to a number. Double precision is used
# because a 32-bit float cannot represent most prices with cents exactly.
# Rows with a null price keep a null price_num.
price_num_df = listings.withColumn(
    'price_num',
    regexp_replace('price', '[$,]', '').cast('double'),
)

# collect() brings every matching Row to the driver as a Python list.
list_price_greater_5000 = (
    price_num_df
    .filter(price_num_df.price_num > 5000)
    .select('id', 'price', 'name')
    .collect()
)
# Print at most the first 20 rows, one per line, to keep the output short.
print(*list_price_greater_5000[:20], sep='\n')

Row(id=9470827, price='$8,000.00', name='Room in a cosy flat. Central, clean')
Row(id=10475894, price='$6,308.00', name='Spacious Private Ground Floor Room')
Row(id=13254774, price='$53,588.00', name='No Longer Available')
Row(id=13841484, price='$74,100.00', name='Bright & airy DoubleBed with EnSuite in Zone 2!')
Row(id=17709189, price='$7,360.00', name='Stunning home overlook canary wharf')
Row(id=36304540, price='$7,377.00', name='The Apartments by The Sloane Club, L 2 Bedroom Apt')
Row(id=38538876, price='$7,796.00', name='Kensington- Luxury 2 bedroom ground floor flat')
Row(id=40518546, price='$5,034.00', name='Spacious London Flat')
Row(id=40881056, price='$5,700.00', name='Single room. 7ft x 9ft - Over looking garden')
Row(id=42920521, price='$5,372.00', name='Luxury modern apartment in Dulwich Village')
Row(id=48043486, price='$6,000.00', name='Beautiful 2 BR flat in Kilburn with free parking')
Row(id=48118935, price='$7,007.00', name='Semi-detached mews house in Knightsbridge.

In [62]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator
# Each criterion is built as its own Column expression and the three are
# combined with "&", so a row must satisfy all of them.
is_cheap = price_num_df["price_num"] < 150
has_many_reviews = price_num_df["number_of_reviews"] > 20
is_highly_rated = price_num_df["review_scores_rating"] > 4.5

(
    price_num_df
    .where(is_cheap & has_many_reviews & is_highly_rated)
    .select('id', 'name', 'price', 'number_of_reviews', 'review_scores_rating')
    .show()
)

+------+--------------------+-------+-----------------+--------------------+
|    id|                name|  price|number_of_reviews|review_scores_rating|
+------+--------------------+-------+-----------------+--------------------+
|264777|One Bedroom Apart...| $98.00|               24|                4.58|
|264779|Refurbished Two B...|$144.00|               36|                4.64|
|266037|Central London wi...| $62.00|              532|                 4.9|
|268398|Also five minutes...| $66.00|              563|                4.63|
|270600|Patio Apartment i...| $73.00|               88|                4.64|
|427348|Lovely 2 bedroom ...|$135.00|               23|                4.77|
|427584|Hackney Stylish &...|$129.00|              152|                4.65|
|427936|Boutique Room w/ ...| $70.00|              208|                4.98|
| 13913|Holiday London DB...| $72.00|               54|                4.85|
| 15400|Bright Chelsea  A...|$120.00|               97|                 4.8|

In [23]:
# 6. Get a list of properties with the following characteristics:
# * price < 150 OR more than one bathroom
# Use the "|" operator to implement the OR operator
# A row is kept when at least one of the two conditions holds.
(
    price_num_df
    .where(
        (price_num_df["price_num"] < 150)
        | (price_num_df["bathrooms"] > 1)
    )
    .select('id', 'name', 'price', 'bathrooms')
    .show()
)

+------+--------------------+-------+---------+
|    id|                name|  price|bathrooms|
+------+--------------------+-------+---------+
|264776|Huge Four Bedroom...|$297.00|      2.0|
|264777|One Bedroom Apart...| $98.00|      1.0|
|264778|Two Bedroom Newly...|$148.00|      1.0|
|264779|Refurbished Two B...|$144.00|      1.0|
|264780|Spacious refurbis...|$157.00|      2.0|
|264781|Two Bedrooms Gard...|$148.00|      2.0|
|264782|One Bedroom Garde...|$120.00|      1.0|
|264783|Four Bedroom Gard...|$216.00|      2.0|
|264789|Huge Three Bedroo...|$238.00|      2.0|
|266037|Central London wi...| $62.00|      1.5|
|268398|Also five minutes...| $66.00|      1.5|
|270600|Patio Apartment i...| $73.00|      1.0|
|426351|2 Bed Edwardian M...|$134.00|      1.0|
|427348|Lovely 2 bedroom ...|$135.00|      2.0|
|427584|Hackney Stylish &...|$129.00|      1.0|
|427936|Boutique Room w/ ...| $70.00|      1.0|
| 13913|Holiday London DB...| $72.00|      1.0|
| 15400|Bright Chelsea  A...|$120.00|   

In [33]:
# 7. Get the highest listing price in this dataset
# Consider using the "max" function from "pyspark.sql.functions"
# NOTE: this import replaces Python's builtin `max` with the Spark column
# aggregate for every cell executed after this one.
from pyspark.sql.functions import max

# Aggregate the whole DataFrame down to a single row holding the maximum.
(
    price_num_df
    .agg(max(price_num_df["price_num"]))
    .show()
)

+--------------+
|max(price_num)|
+--------------+
|       74100.0|
+--------------+



In [65]:
# 8. Get the name and the price of the property with the highest price
# Try to use "collect" method to get the highest price first, and then use it in a "filter" call
# The functions module is imported under an alias here so that this cell does
# not depend on the builtin `max` having been shadowed by another cell.
from pyspark.sql import functions as F

# agg() yields a one-row, one-column result; [0][0] extracts the scalar.
max_value = price_num_df.agg(F.max(price_num_df.price_num)).collect()[0][0]

# Filter while price_num is still part of the DataFrame, and only then project
# down to the display columns. Every listing tied for the top price is shown.
(
    price_num_df
    .filter(price_num_df.price_num == max_value)
    .select('name', 'price')
    .show()
)


+--------------------+----------+
|                name|     price|
+--------------------+----------+
|Bright & airy Dou...|$74,100.00|
+--------------------+----------+



In [54]:
# 9. Get the number of hosts in the dataset
# A host can own several listings, so count distinct host_id values rather
# than rows.
num_of_hosts = (
    listings
    .select(listings["host_id"])
    .distinct()
    .count()
)
print(num_of_hosts)

55804


In [66]:
# 10. Get listings with a first review in 2024
# Consider using the "year" function from "pyspark.sql.functions"
from pyspark.sql.functions import year

# Derive the calendar year of the first review as its own column, then keep
# only the rows where that year is 2024.
df_with_year = listings.withColumn(
    "first_review_year",
    year("first_review"),
)
(
    df_with_year
    .where(df_with_year["first_review_year"] == 2024)
    .select('id', 'name', 'price', 'first_review')
    .show()
)

+-------+--------------------+-------+------------+
|     id|                name|  price|first_review|
+-------+--------------------+-------+------------+
|  52624|Close to Wimbledo...|   NULL|  2024-08-11|
| 476456|Bridgerton inspir...| $91.00|  2024-09-14|
| 228389|one Double bed ro...| $50.00|  2024-03-21|
| 649929|Sm double room  w...|   NULL|  2024-06-04|
|1796751|Superlux flat in ...|$540.00|  2024-01-01|
|1573381|Central, modern p...|$189.00|  2024-11-29|
|1286670|Stunning Bright C...|$187.00|  2024-09-21|
|2498749|Victorian 2-bedro...|$110.00|  2024-12-09|
|2304024|The Pink House, N...|$112.00|  2024-07-14|
|2345520|Stylish garden fl...|$165.00|  2024-09-15|
|2358305|Luxurious Flat in...|$190.00|  2024-06-19|
|2435076|Double Standard R...|$112.00|  2024-09-01|
|2436958|Single En-suite Room| $76.00|  2024-08-28|
|2437103|Superior Single R...| $82.00|  2024-06-21|
|2737874|Charmant appart t...|$152.00|  2024-04-28|
|2856972|Small 1 bed apart...|$156.00|  2024-10-07|
|3334699|Sty