In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below (getOrCreate returns the
# active session if one already exists).
spark = (
    SparkSession.builder
    .appName("Spark aggregation functions")
    .getOrCreate()
)

In [2]:
# Load the gzipped listings CSV. The double quote is configured as both the
# quote and the escape character, and multiLine is enabled so that quoted
# fields may contain line breaks. The schema is inferred from the data.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [3]:
# Load the gzipped reviews CSV with the same reader settings as the listings
# file: header row, inferred schema, double quote as quote and escape
# character, and multi-line quoted fields allowed.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/reviews.csv.gz")
)
reviews.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [8]:
# 1. Count the number of reviews per listing using the "reviews" dataset
import pyspark.sql.functions as F

# One row per listing_id found in the reviews data, with the number of
# non-null review ids for that listing.
reviews_count = (
    reviews
    .groupBy("listing_id")
    .agg(F.count("id").alias("num_reviews"))
)

# The left join keeps listings that have no reviews; their num_reviews comes
# back NULL and is replaced with 0. The fill is restricted to "num_reviews":
# an unrestricted fillna(0) would also rewrite a NULL listing id to 0.
listings_with_counts = (
    listings
    .join(reviews_count, listings.id == reviews_count.listing_id, "left")
    .select(listings.id, listings.name, reviews_count.num_reviews)
    .fillna(0, subset=["num_reviews"])
)

listings_with_counts.orderBy(F.desc("num_reviews")).show(truncate=False)

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|47408549|Double Room+ Ensuite                              |1855       |
|30760930|Double Garden View room - London House Hotel***   |1682       |
|43120947|Private double room with en suite facilities      |1615       |
|19670926|Locke Studio Apartment at Leman Locke             |1436       |
|45006692|Budget Double Room In Colliers Hotel.             |1433       |
|1436172 |Cosy Double in Kings Cross Houseshare nr Eurostar |1195       |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1122       |
|1436177 |En-suite Double in Kings Cross Houseshare Eurostar|1005       |
|47438714|KX Basic- Small Double- shared bathroom           |978        |
|3855375 |Double in Kings Cross Houseshare nr Eurostar      |973        |
|46233904|Superior Studio, avg size 23

In [18]:
# 2. Compute the total number of listings and average review score per host
# The average rating is rounded to two decimals; hosts are ordered from the
# most listings to the fewest.
listings_hosts = (
    listings
    .groupBy("host_id")
    .agg(
        F.count("id").alias("num_listings"),
        F.round(F.avg("review_scores_rating"), 2).alias("avg_review_score"),
    )
    .orderBy(F.col("num_listings").desc())
)

listings_hosts.show()

+---------+------------+----------------+
|  host_id|num_listings|avg_review_score|
+---------+------------+----------------+
|446820235|         495|            4.54|
|314162972|         420|            4.37|
| 28820321|         285|            4.58|
|  1432477|         246|             4.4|
|156158778|         213|            4.89|
| 33889201|         197|            4.63|
| 47609036|         142|            4.75|
|228928499|         132|            4.85|
|124359784|         128|             4.4|
| 83740964|         123|            4.42|
|439074505|         121|            4.38|
|185324181|         118|            4.09|
|590452007|         116|            4.11|
|215357262|         115|            4.51|
| 30253178|         115|            4.48|
| 74167394|         109|            4.09|
| 89355192|         104|            4.65|
|105564995|         104|            4.43|
|575347835|          98|            NULL|
|  9063235|          97|            4.44|
+---------+------------+----------

In [19]:
# 3: Find the top ten listings with the highest number of reviews
# Reuses the per-listing review counts computed for task 1.
(
    listings_with_counts
    .orderBy(F.col("num_reviews").desc())
    .show(n=10, truncate=False)
)

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|47408549|Double Room+ Ensuite                              |1855       |
|30760930|Double Garden View room - London House Hotel***   |1682       |
|43120947|Private double room with en suite facilities      |1615       |
|19670926|Locke Studio Apartment at Leman Locke             |1436       |
|45006692|Budget Double Room In Colliers Hotel.             |1433       |
|1436172 |Cosy Double in Kings Cross Houseshare nr Eurostar |1195       |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1122       |
|1436177 |En-suite Double in Kings Cross Houseshare Eurostar|1005       |
|47438714|KX Basic- Small Double- shared bathroom           |978        |
|3855375 |Double in Kings Cross Houseshare nr Eurostar      |973        |
+--------+----------------------------

In [23]:
# 4. Find the top five neighborhoods with the most listings
# Rows containing a NULL in any column are dropped after aggregation, then
# the neighbourhoods are ordered from most to fewest listings.
listings_neighborhoods = (
    listings
    .groupBy("neighbourhood_cleansed")
    .agg(F.count("id").alias("num_listings"))
    .na.drop()
    .orderBy(F.col("num_listings").desc())
)

listings_neighborhoods.show(n=5, truncate=False)

+----------------------+------------+
|neighbourhood_cleansed|num_listings|
+----------------------+------------+
|Westminster           |11367       |
|Tower Hamlets         |7566        |
|Camden                |6564        |
|Kensington and Chelsea|6348        |
|Hackney               |6279        |
+----------------------+------------+
only showing top 5 rows


In [26]:
# 5. Get a data frame with the following four columns:
# * Listing's ID
# * Listing's name
# * Reviewer's name
# * Review's comment
# Use "join" to combine data from two datasets

# Inner join: only listings that have at least one matching review remain.
listings_reviews = listings.join(
    reviews,
    on=listings.id == reviews.listing_id,
    how='inner',
)

# Columns are selected through their parent frames because both inputs
# contain an "id" column.
res_df = listings_reviews.select(
    listings.id,
    listings.name,
    reviews.reviewer_name,
    reviews.comments,
)

res_df.show()

+-----+--------------------+-------------+--------------------+
|   id|                name|reviewer_name|            comments|
+-----+--------------------+-------------+--------------------+
|13913|Holiday London DB...|      Michael|My girlfriend and...|
|13913|Holiday London DB...|      Mathias|Alina was a reall...|
|13913|Holiday London DB...|      Kristin|Alina is an amazi...|
|13913|Holiday London DB...|      Camilla|Alina's place is ...|
|13913|Holiday London DB...|        Jorik|Nice location in ...|
|13913|Holiday London DB...|         Vera|I'm very happy to...|
|13913|Holiday London DB...|         Honi|I stayed with Ali...|
|13913|Holiday London DB...|   Alessandro|Alina was a perfe...|
|13913|Holiday London DB...|         Oleh|Alina's flat is e...|
|13913|Holiday London DB...|           Mo|The House is a pi...|
|13913|Holiday London DB...|            A|Was great base fo...|
|13913|Holiday London DB...|       Daniel|Alina was an amaz...|
|13913|Holiday London DB...|      Belind

In [37]:
# 6. Get the top five listings with the highest average review comment length.
# Only return listings with at least 5 reviews.
# Use the "length" function from "pyspark.sql.functions" to get the length of a review.
# Note: num_reviews is count("comments"), so only non-null comments are counted.
(
    res_df
    .groupBy("id", "name")
    .agg(
        F.count("comments").alias("num_reviews"),
        F.round(F.avg(F.length("comments")), 2).alias("avg_review_length"),
    )
    .where(F.col("num_reviews") >= 5)
    .orderBy(F.col("avg_review_length").desc())
    .show(n=5)
)

+------------------+--------------------+-----------+-----------------+
|                id|                name|num_reviews|avg_review_length|
+------------------+--------------------+-----------+-----------------+
|618608352812465378|Beautiful Georgia...|          6|          1300.17|
|          28508447|The warm and cosy...|          6|          1089.33|
|627425975703032358|Superb loft beaut...|          9|           951.78|
|           2197681|Luxurious apartme...|          5|            939.2|
|          13891813|Beautiful 2 Bedro...|          5|            905.0|
+------------------+--------------------+-----------+-----------------+
only showing top 5 rows


In [49]:
# 7. Using the "join" operator find listings without reviews.
# Hint: Use "left_join" or "left_anti" join type when implementing this

# After the left join, listings with no matching row in reviews_count have a
# NULL num_reviews; filtering on that NULL keeps the listings without reviews.
listings_without_reviews = (
    listings
    .join(reviews_count, listings.id == reviews_count.listing_id, "left")
    .filter(F.col("num_reviews").isNull())
    .select(listings.id, listings.name, reviews_count.num_reviews)
)

# show() only prints and returns None, so it is called on its own line;
# chaining it into the assignment would bind the variable to None.
listings_without_reviews.show()

+--------+--------------------+-----------+
|      id|                name|num_reviews|
+--------+--------------------+-----------+
|  770408|Large Cosy Apartm...|       NULL|
|  606217|SPARE ROOM TO LET...|       NULL|
| 4713410|Large 1 bedroom a...|       NULL|
| 4775781|Sunny bedroom bes...|       NULL|
| 5134081|Spacious Double R...|       NULL|
| 5813434|Twickenham For Ru...|       NULL|
| 5510087|Cosy Flat in Hamm...|       NULL|
| 7080965|       Bright double|       NULL|
| 8093650|Double room in ce...|       NULL|
| 9000678|Airy apartment on...|       NULL|
| 9248608|Beautifully renov...|       NULL|
|10262172|comfortable londo...|       NULL|
|11586308|Lovely terraced h...|       NULL|
|11517011|Beautiful quiet a...|       NULL|
|12693281|Peaceful flat in ...|       NULL|
|12847415|Modern self conta...|       NULL|
|14117333|Lovely Double Roo...|       NULL|
|14224191|5* Luxury Huge Fa...|       NULL|
|14726795|Large Double Room...|       NULL|
|15373142|Private room with...| 