# Download Data from InsideAirbnb
This notebook downloads, cleans, and formats the Santa Clara County listings data from InsideAirbnb. The final dataframe is saved as `scc_airbnbs.csv` in this directory.

In [1]:
import pandas as pd

## Load Listings

In [2]:
# InsideAirbnb listings export for Santa Clara County (snapshot path dated 2024-12-23).
url = (
    'https://data.insideairbnb.com/united-states/ca/santa-clara-county/'
    '2024-12-23/data/listings.csv.gz'
)

# Read the gzip-compressed CSV straight from the URL and key each row by its listing id.
df = pd.read_csv(url, compression='gzip').set_index('id')

## Reduce the Number of Features

In [3]:
# Columns retained for the cleaned dataset, grouped by theme.

# Listing identification
identification_cols = ['name', 'picture_url']

# Host descriptors (without identification)
host_cols = [
    'host_since',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
]

# Listing descriptors
listing_cols = [
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'minimum_nights',
    'maximum_nights',
]

# Availability
availability_cols = [
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
]

# Reviews
review_cols = [
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'reviews_per_month',
]

# Only keep the columns we need, in the order listed above.
df = df[identification_cols + host_cols + listing_cols + availability_cols + review_cols]

## Remove Listings Without Price or Review Data

In [4]:
# Remove listings with no price
df = df.dropna(subset=['price'])

# Remove listings with no review scores rating (based on analysis of missing values, all other review scores are missing if this is missing)
df = df.dropna(subset=['review_scores_rating'])

# At this point, only a few (10-15) entries contain missing values, we can just drop them.
# dropna() keeps a row only when *every* column is non-null (same as notna().all(axis=1)).
# Filtering on notna().any(axis=1) instead would keep any row with at least one
# non-null value, i.e. it would not remove the incomplete rows at all.
df = df.dropna()

## Fix Data Types and Remove Price Outliers

In [5]:
# Convert price to float by stripping the '$' and ',' characters from the string values.
# regex=False forces a literal match on every pandas version: as a regular
# expression '$' is the end-of-string anchor, and the default value of `regex`
# differs between pandas releases.
price_as_float = (
    df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
# assign() returns a new frame instead of writing into a frame that was itself
# produced by row filtering, which avoids SettingWithCopyWarning.
df = df.assign(price=price_as_float)


# Remove outliers above mean + 3 * standard deviation
threshold = df['price'].mean() + 3 * df['price'].std()
# .copy() detaches the filtered rows so the column assignment below operates on
# an independent frame rather than on a slice of the unfiltered one.
df = df[df['price'] < threshold].copy()


# Replace single quotes with double quotes to make it valid JSON
# (assumes the values are Python-style list strings without embedded quotes -- TODO confirm).
df['host_verifications'] = df['host_verifications'].str.replace("'", '"', regex=False)

## Export to CSV

In [6]:
# Write the cleaned listings to CSV; the path is relative to the current working
# directory and the index (listing id) is written as the first column.
output_path = 'scc_airbnbs.csv'
df.to_csv(output_path)