In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine, text
import json
import config
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
"""
Exploratory Data Analysis (EDA) for Capital Crashpad Listings
This notebook ingests, combines, and inspects Airbnb listings data across multiple quarterly snapshots.
- Scans all available quarter directories and parses them into sortable time indices.
- Loads each quarter’s listings_detailed.csv, annotates with quarter and index, and concatenates into a single DataFrame.
- Provides initial checks on data shape, column consistency, and missing columns across quarters.
- Lays the foundation for further cleaning, normalization, and time-series analysis of the DC Airbnb market.
"""

# root folder holding one sub-directory per quarterly snapshot
DATA_DIR = Path("../resources/data/raw_data")
# names of every sub-directory, in plain alphabetical order
quarters = sorted(entry.name for entry in DATA_DIR.iterdir() if entry.is_dir())


def parse_quarter(folder_name):
    """Turn a snapshot folder name into a quarter label.

    Example: '2024_sep' -> '2024_Q3'. The month part is matched
    case-insensitively against mar/jun/sep/dec.

    Raises ValueError if the name does not split into exactly two parts on
    '_', and KeyError if the month part is not one of the four known months.
    """
    year_part, month_part = folder_name.split("_")
    quarter_by_month = dict(mar="Q1", jun="Q2", sep="Q3", dec="Q4")
    quarter_label = quarter_by_month[month_part.lower()]
    return "_".join((year_part, quarter_label))


# map month to quarter number
month_to_qnum = {"mar": 1, "jun": 2, "sep": 3, "dec": 4}

# list of (folder_name, year, qnum) tuples
# Folders that do not look like '<year>_<mar|jun|sep|dec>' (for example a
# stray '.ipynb_checkpoints') are reported and skipped instead of crashing
# the cell with a KeyError/ValueError.
quarter_tuples = []
for p in DATA_DIR.iterdir():
    if not p.is_dir():
        continue
    # partition keeps everything after the first '_' together, so a name with
    # extra underscores simply fails the month lookup below
    year, _, month = p.name.partition("_")
    qnum = month_to_qnum.get(month.lower())
    if qnum is None or not year.isdecimal():
        print(f"Skipping non-quarter folder: {p.name}")
        continue
    quarter_tuples.append((p.name, int(year), qnum))

# sort quarters by year, then quarter number
quarters_sorted = [t[0] for t in sorted(quarter_tuples, key=lambda x: (x[1], x[2]))]

# walk the quarters in chronological order; every snapshot that has a
# listings_detailed.csv is read and tagged with its quarter label and its
# position in the timeline, and snapshots without the file are reported
listings_raw = []
for quarter_pos, folder in enumerate(quarters_sorted):
    source_csv = DATA_DIR / folder / "listings_detailed.csv"
    if source_csv.exists():
        frame = pd.read_csv(source_csv)
        frame["quarter"] = parse_quarter(folder)
        frame["quarter_index"] = quarter_pos
        listings_raw.append(frame)
        print(f"Loaded {folder} with {len(frame)} rows.")
    else:
        print(f"Missing: {source_csv}")

# concatenate all quarters into one DataFrame
# fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when no quarter could be loaded
if len(listings_raw) == 0:
    raise ValueError(f"No listings_detailed.csv files were loaded from {DATA_DIR}")
listings_raw = pd.concat(listings_raw, ignore_index=True)

# inspect shape and columns
print("Shape:", listings_raw.shape)
print("Columns:", listings_raw.columns.tolist())

# 'quarter' and 'quarter_index' are added by this notebook and never exist in
# the raw CSVs; leave them out so the report shows only real schema drift
added_columns = {"quarter", "quarter_index"}
expected_columns = set(listings_raw.columns) - added_columns

print("Missing columns by quarter:")
for q_folder in quarters_sorted:
    csv_path = DATA_DIR / q_folder / "listings_detailed.csv"
    if csv_path.exists():
        # nrows=0 reads just the header row, which is all that is needed here
        df = pd.read_csv(csv_path, nrows=0)
        # sorted list gives a stable, readable order (set order varies per run)
        print(f"{q_folder}: {sorted(expected_columns - set(df.columns))}")

Loaded 2023_jun with 6541 rows.
Loaded 2023_sep with 6705 rows.
Loaded 2023_dec with 6853 rows.
Loaded 2024_mar with 6705 rows.
Loaded 2024_jun with 4928 rows.
Loaded 2024_sep with 5454 rows.
Loaded 2024_dec with 5964 rows.
Loaded 2025_mar with 6257 rows.
Loaded 2025_jun with 6423 rows.
Loaded 2025_sep with 6374 rows.
Shape: (62204, 81)
Columns: ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_

In [3]:
# Print per-column null counts and dtypes without row truncation.
# option_context restores the previous display.max_rows value on exit, even if
# one of the prints raises, and does not clobber a value set elsewhere.
with pd.option_context("display.max_rows", None):
    print(listings_raw.isna().sum())
    print("#################\n\n\n#################")
    print(listings_raw.dtypes)

id                                                  0
listing_url                                         0
scrape_id                                           0
last_scraped                                        0
source                                              0
name                                                0
description                                      7442
neighborhood_overview                           20823
picture_url                                         0
host_id                                             0
host_url                                            0
host_name                                           8
host_since                                          8
host_location                                   11852
host_about                                      22265
host_response_time                               7883
host_response_rate                               7883
host_acceptance_rate                             6111
host_is_superhost           

In [4]:
"""
Adding minimal geographic context to neighbourhood names
"""

# Lookup from the original neighbourhood name to the same name with a short
# area prefix in front ("SE", "NW-mid", ...). Each name is written once and the
# prefixed value is derived from it, so key and value cannot drift apart.
# Entry order is kept as originally listed.
neighbourhoods_dict = {
    name: f"{area} {name}"
    for area, name in [
        ("SE", "Historic Anacostia"),
        ("NE/NW", "Edgewood, Bloomingdale, Truxton Circle, Eckington"),
        ("SE", "Capitol Hill, Lincoln Park"),
        ("NE", "Eastland Gardens, Kenilworth"),
        ("NW-mid", "Kalorama Heights, Adams Morgan, Lanier Heights"),
        ("NW-mid", "Brightwood Park, Crestwood, Petworth"),
        ("NW-far", "Spring Valley, Palisades, Wesley Heights, Foxhall Crescent, Foxhall Village, Georgetown Reservoir"),
        ("NW-far", "Cathedral Heights, McLean Gardens, Glover Park"),
        ("NE/NW", "Lamont Riggs, Queens Chapel, Fort Totten, Pleasant Hill"),
        ("NW-mid", "Shaw, Logan Circle"),
        ("NW-mid", "Howard University, Le Droit Park, Cardozo/Shaw"),
        ("NW-mid", "Takoma, Brightwood, Manor Park"),
        ("NW-mid", "Colonial Village, Shepherd Park, North Portal Estates"),
        ("NW-mid", "Dupont Circle, Connecticut Avenue/K Street"),
        ("SE", "Capitol View, Marshall Heights, Benning Heights"),
        ("NW-mid", "Downtown, Chinatown, Penn Quarters, Mount Vernon Square, North Capitol Street"),
        ("NE", "Union Station, Stanton Park, Kingman Park"),
        ("NW-far", "Georgetown, Burleith/Hillandale"),
        ("NW-mid", "Columbia Heights, Mt. Pleasant, Pleasant Plains, Park View"),
        ("SE", "Douglas, Shipley Terrace"),
        ("NW-far", "Cleveland Park, Woodley Park, Massachusetts Avenue Heights, Woodland-Normanstone Terrace"),
        ("NE/SE", "River Terrace, Benning, Greenway, Dupont Park"),
        ("NW-far", "Friendship Heights, American University Park, Tenleytown"),
        ("NW-mid", "West End, Foggy Bottom, GWU"),
        ("SW", "Southwest Employment Area, Southwest/Waterfront, Fort McNair, Buzzard Point"),
        ("NW-far", "Hawthorne, Barnaby Woods, Chevy Chase"),
        ("NE", "North Michigan Park, Michigan Park, University Heights"),
        ("NW-far", "North Cleveland Park, Forest Hills, Van Ness"),
        ("NE", "Brookland, Brentwood, Langdon"),
        ("SE", "Twining, Fairlawn, Randle Highlands, Penn Branch, Fort Davis Park, Fort Dupont"),
        ("NE", "Mayfair, Hillbrook, Mahaning Heights"),
        ("NE", "Ivy City, Arboretum, Trinidad, Carver Langston"),
        ("SE", "Fairfax Village, Naylor Gardens, Hillcrest, Summit Park"),
        ("SE", "Near Southeast, Navy Yard"),
        ("SE", "Congress Heights, Bellevue, Washington Highlands"),
        ("SE", "Sheridan, Barry Farm, Buena Vista"),
        ("NE", "Woodridge, Fort Lincoln, Gateway"),
        ("SE", "Woodland/Fort Stanton, Garfield Heights, Knox Hill"),
        ("NE", "Deanwood, Burrville, Grant Park, Lincoln Heights, Fairmont Heights"),
    ]
}

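Note the flags defined in the next cell. I'm defining a listing as "likely commercial" when it meets all three of these criteria:

1. It is an entire home/apartment
1. AND the host has 2 or more listings
1. AND it's available more than 180 days/year.  

These thresholds are provisional. I will probably vary them, just to see how much the results change.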

In [5]:
"""
Clean and normalize key columns in the Airbnb listings DataFrame.

- Cleans the 'price' column by removing currency symbols and commas, converting to float.
- Standardizes neighborhood names using a mapping dictionary.
- Cleans and categorizes the 'license' column into standardized categories.
- Adds boolean columns for common analysis flags:
    - is_entire_home: True if listing is an entire home/apartment.
    - is_multi_listing_host: True if host has 2 or more listings.
    - is_high_availability: True if listing is available more than 180 days/year.
    - likely_commercial: True if listing meets all three criteria above.
- Prints summary information about cleaned columns and unique values.
"""

# clean price column: strip the currency symbol and thousands separators,
# then convert to float.
# regex=False makes both replacements literal on every pandas version; "$" is
# otherwise a regex end-of-string anchor and the default for `regex` has
# changed between pandas releases.
# NOTE: this overwrites the raw text in place, so the cell can only be run once
# per load (a second run hits `.str` on a float column and raises).
listings_raw["price"] = (
    listings_raw["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# write the area-prefixed names into 'neighbourhood'; values without an entry
# in neighbourhoods_dict pass through unchanged
# NOTE(review): the listings_clean cell selects 'neighbourhood_cleansed' and
# renames it to 'neighbourhood', so these prefixed names do not reach that
# frame - confirm whether that is intended.
prefixed_names = listings_raw["neighbourhood_cleansed"].replace(neighbourhoods_dict)
listings_raw["neighbourhood"] = prefixed_names


# categorizing license status
# e.g. "Hosted License: 5007242201001033" => "Licensed"
def clean_license_column(series):
    """Collapse raw license text into 'Licensed', 'Exempt' or 'No License'.

    Only the text before the first ':' is inspected, after trimming and
    lower-casing:
      - 'hosted license' / 'unhosted license' -> 'Licensed'
      - 'exempt'                              -> 'Exempt'
      - anything else, blank or missing       -> 'No License'

    The function's own output labels are accepted as input as well, so applying
    it to an already-categorized column returns that column unchanged instead
    of turning every 'Licensed' into 'No License'.

    Parameters
    ----------
    series : pandas.Series
        Raw (or already categorized) license values.

    Returns
    -------
    pandas.Series
        One category label per input element, same index as ``series``.
    """
    # 'licensed' is this function's own output label (keeps it idempotent)
    licensed_labels = {"hosted license", "unhosted license", "licensed"}

    def categorize_license(value):
        if pd.isna(value):
            return "No License"
        # blank strings produce an empty label and fall through to "No License"
        label = str(value).split(":")[0].strip().lower()
        if label in licensed_labels:
            return "Licensed"
        if label == "exempt":
            return "Exempt"
        return "No License"

    return series.apply(categorize_license)


# replace license column with cleaned/categorized values
# NOTE: the raw license text is overwritten here and is not recoverable
# from listings_raw afterwards
listings_raw["license"] = clean_license_column(listings_raw["license"])

# thresholds behind the "likely commercial" flag, named so they can be varied
# in one place
MIN_MULTI_LISTING_COUNT = 2  # host counts as multi-listing at >= this many listings
HIGH_AVAILABILITY_DAYS = 180  # listing is high-availability above this many days/year

# add booleans for likely commercial listings
listings_raw["is_entire_home"] = listings_raw["room_type"] == "Entire home/apt"
listings_raw["is_multi_listing_host"] = (
    listings_raw["calculated_host_listings_count"] >= MIN_MULTI_LISTING_COUNT
)
listings_raw["is_high_availability"] = (
    listings_raw["availability_365"] > HIGH_AVAILABILITY_DAYS
)
# a listing is flagged only when all three conditions hold
listings_raw["likely_commercial"] = (
    listings_raw["is_entire_home"]
    & listings_raw["is_high_availability"]
    & listings_raw["is_multi_listing_host"]
)

# quick sanity check of the cleaned columns
# (the neighbourhood lines report the original 'neighbourhood_cleansed' values)
distinct_neighbourhoods = listings_raw.neighbourhood_cleansed.unique()
print(f"Price column dtype: {listings_raw.price.dtype}\n")
print(f"Neighbourhoods: {distinct_neighbourhoods}\n")
print(f"License types: {listings_raw.license.unique()}\n")
print(f"Number of unique neighbourhoods: {len(distinct_neighbourhoods)}")

Price column dtype: float64

Neighbourhoods: ['Historic Anacostia' 'Edgewood, Bloomingdale, Truxton Circle, Eckington'
 'Columbia Heights, Mt. Pleasant, Pleasant Plains, Park View'
 'Brightwood Park, Crestwood, Petworth' 'Capitol Hill, Lincoln Park'
 'Takoma, Brightwood, Manor Park'
 'Ivy City, Arboretum, Trinidad, Carver Langston'
 'Friendship Heights, American University Park, Tenleytown'
 'Kalorama Heights, Adams Morgan, Lanier Heights' 'Shaw, Logan Circle'
 'Spring Valley, Palisades, Wesley Heights, Foxhall Crescent, Foxhall Village, Georgetown Reservoir'
 'Cathedral Heights, McLean Gardens, Glover Park'
 'Congress Heights, Bellevue, Washington Highlands'
 'West End, Foggy Bottom, GWU'
 'Colonial Village, Shepherd Park, North Portal Estates'
 'Brookland, Brentwood, Langdon'
 'Lamont Riggs, Queens Chapel, Fort Totten, Pleasant Hill'
 'Union Station, Stanton Park, Kingman Park'
 'Dupont Circle, Connecticut Avenue/K Street'
 'Howard University, Le Droit Park, Cardozo/Shaw'
 'Georgetow

In [6]:
# Build the trimmed analysis frame: keep only the identifier, time, location,
# price, availability, license and flag columns, then rename 'id' and
# 'neighbourhood_cleansed' in the same expression.
# NOTE(review): 'neighbourhood' here is the original neighbourhood_cleansed
# value, not the area-prefixed name stored in listings_raw['neighbourhood'] -
# confirm that is the intended column.
listings_clean = listings_raw.loc[
    :,
    [
        "id",
        "host_id",
        "quarter",
        "quarter_index",
        "neighbourhood_cleansed",
        "latitude",
        "longitude",
        "price",
        "room_type",
        "minimum_nights",
        "availability_365",
        "calculated_host_listings_count",
        "license",
        "is_entire_home",
        "is_multi_listing_host",
        "is_high_availability",
        "likely_commercial",
    ],
].rename(columns={"id": "listing_id", "neighbourhood_cleansed": "neighbourhood"})