In [None]:
import pandas as pd, numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json
from datetime import datetime as dt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
import plotly.express as px
from math import radians # to convert latitude/longitude
import dataframe_image as dfi

from Project1 import *

import plotly.io as pio
# Make 'iframe' the default plotly renderer for every figure shown in this notebook.
pio.renderers.default = 'iframe'

# Boston Airbnb Dataset
https://www.kaggle.com/datasets/airbnb/boston?resource=download

Three questions/topics to explore:
- How does **geography** influence Airbnb prices in Boston?
    - What areas/zip codes/neighborhoods are more expensive than others?
- How do **property characteristics** influence Airbnb prices in Boston?
    - Number of bathrooms, bedrooms, beds, square footage, reviews, etc.
- How does **time (seasonality)** influence Airbnb prices in Boston?
    - Period of the week (days), period of the month (weeks), period of the year (months)?
    - We only have about 2 years of data to work with, but we'll see what comes out.

# First, load and clean the data

In [None]:
# Load the three raw tables; listings are indexed by their `id` column.
boston_listings = pd.read_csv('./data/airbnb_boston/listings.csv', index_col='id')
boston_calendar = pd.read_csv('./data/airbnb_boston/calendar.csv')
boston_reviews = pd.read_csv('./data/airbnb_boston/reviews.csv')

# Run the Project1 converters element-wise (DataFrame.map) on every table.
# The calendar and reviews tables previously sent `convert_dollars_to_float` through
# DataFrame.apply, which calls the function once per *column* (with a Series) instead of
# once per cell, unlike the listings table and unlike the other two converters.
# NOTE(review): this presumes the converters are scalar functions, as their use with
# `.map` on the listings table suggests — confirm against Project1.
boston_listings = (
    boston_listings
    .map(convert_dollars_to_float)
    .map(convert_percentages_to_float)
    .map(convert_string_date_to_dt)
)
boston_calendar = (
    boston_calendar
    .map(convert_dollars_to_float)
    .map(convert_percentages_to_float)
    .map(convert_string_date_to_dt)
)
boston_reviews = (
    boston_reviews
    .map(convert_dollars_to_float)
    .map(convert_percentages_to_float)
    .map(convert_string_date_to_dt)
)

# estimate missing monthly_price/weekly_price fields using regression (based on price field)
boston_listings = estimate_y_from_X(data=boston_listings, y_label='monthly_price', X_labels='price')['filled_data']
boston_listings = estimate_y_from_X(data=boston_listings, y_label='weekly_price', X_labels='price')['filled_data']

# estimate missing zip codes using KNN classification (based on latitude and longitude)
boston_listings = classify_y_based_on_X(
    data=get_cleaned_zipcodes(boston_listings),
    y_label='zipcode_cleaned',
    X_labels=['latitude', 'longitude'],
)['filled_data']

In [None]:
# Classify the columns of each table by type, using the Project1 helper.
boston_listing_col_types, boston_calendar_col_types, boston_review_col_types = (
    get_columns_and_types(table)
    for table in (boston_listings, boston_calendar, boston_reviews)
)

# Question 1: How does geography influence Airbnb rental prices?

## Add a choropleth (geographical map) and overlay the listings prices from the dataset.
- Thank you for the geojson data: https://github.com/codeforgermany/click_that_hood/blob/main/public/data/boston.geojson?short_path=46589b4
- Use log scale for the prices (so the colorbar isn't too compressed) - or change the scale of the colorbar.

A quick visual inspection of the map allows us to see that more expensive listings tend to be further north, and in the following neighborhoods (in no particular order):
- West End, North End, South End, Downtown, Leather District, Chinatown, South Boston Waterfront, Fenway.

- What if we run a regression-type model of price on latitude+longitude?

In [None]:
# Map of Boston neighborhoods (base layer) with individual listings drawn on top,
# coloured by the log of their price.
with open('./data/airbnb_boston/boston.geojson', 'r') as geojson_file:
    geojson = json.load(geojson_file)

# Keep only listings with a complete location/price record and add the log price.
geoplot_data = (
    boston_listings[['latitude', 'longitude', 'zipcode_cleaned', 'price']]
    .dropna()
    .assign(log_price=lambda frame: frame['price'].apply(np.log))
)

# Base layer: one region per GeoJSON feature, matched on the feature's `name` property.
neighborhood_names = [feature['properties']['name'] for feature in geojson['features']]
fig = px.choropleth(
    data_frame={'name': neighborhood_names},
    geojson=geojson,
    locations='name',
    featureidkey="properties.name",
    title='Boston Neighborhoods and Airbnb Prices<br>Colorbar is Log Scale',
)
fig.update_geos(fitbounds="locations", visible=False)

# Overlay: build a scatter figure only to borrow its (single) trace.
hover_formats = {
    'latitude': ':.2f',
    'longitude': ':.2f',
    'price': ':.2f',
    'log_price': ':.2f',
}
listing_points = px.scatter_geo(
    data_frame=geoplot_data,
    lat='latitude',
    lon='longitude',
    color='log_price',
    hover_data=hover_formats,
)
fig.add_trace(listing_points.data[0])

# relabel the colorbar (as showing log values is confusing): two qualitative ticks
# placed at the 1% and 99.9% quantiles of the log price.
colorbar_tick_positions = geoplot_data['log_price'].quantile([0.01, 0.999]).values
fig.update_coloraxes(colorbar={
    'title': 'Price',
    'tickvals': colorbar_tick_positions,
    'ticktext': ['Cheaper', 'Pricier'],
})
fig.update_layout(showlegend=False)

fig.show()

# Save a static copy of the map next to the notebook.
pio.write_image(fig, 'prices_and_neighborhoods.png')

In [None]:
# Mean and median listing price per neighborhood, most expensive (by median) first.
prices_by_neighborhood = (
    boston_listings
    .groupby('neighbourhood_cleansed')['price']
    .agg(['mean', 'median'])
    .rename(columns={'mean': 'Mean Price', 'median': 'Median Price'})
    .sort_values(by='Median Price', ascending=False)
)

# Format as whole dollars, caption the table, export it as an image and display it.
prices_by_neighborhood_styled = (
    prices_by_neighborhood.style
    .format('${:.0f}')
    .set_caption('Mean and Median Price by Neighborhood<br>Sorted by Median')
)
dfi.export(prices_by_neighborhood_styled, 'prices_by_neighborhood_table.png')
prices_by_neighborhood_styled

In [None]:
# Bar chart of the median price per neighborhood (already sorted, highest first).
fig, ax = plt.subplots(figsize=(15, 5))
prices_by_neighborhood['Median Price'].plot.bar(
    ax=ax,
    cmap='viridis',
    title='Sorted Bar Plot of Median Price by Neighborhood',
)

# Faint guide lines: a vertical one at x=11.5 and a horizontal one at a price of 150.
guide_style = {'color': 'black', 'alpha': 0.2}
ax.axvline(11.5, **guide_style)
ax.axhline(150, **guide_style)

fig.tight_layout()
plt.savefig('prices_by_prices_by_neighborhood_barplot.png')

# Let's look at how well a regression can fit price on latitude and longitude
- This gives a very poor fit, even on the training set. Why?
    - It could be that latitude and longitude - which reflect locations on a sphere (the Earth) - do not align well with 2d locations on a map.
    - Therefore, the link between price and lat/lon may not be linear, which means regression is not the right tool.
    - Let's keep exploring.

In [None]:
# Regress price on the raw coordinates: 60% training split, fixed seed, and no
# constant term requested from the Project1 helper.
lat_lon_fit_options = {
    'y_label': 'price',
    'X_labels': ['latitude', 'longitude'],
    'train_size': 0.6,
    'random_state': 42,
    'add_constant': False,
}
reg_price_on_lat_lon = estimate_y_from_X(data=boston_listings, **lat_lon_fit_options)

In [None]:
# Training prices against the model's fitted values; a good fit would hug the diagonal.
fitted_vs_actual = pd.concat(
    {
        'y_train': reg_price_on_lat_lon['y_train'],
        'y_fitted': reg_price_on_lat_lon['model'].fittedvalues,
    },
    axis=1,
)
sns.scatterplot(data=fitted_vs_actual, x='y_train', y='y_fitted')

# Question 2: How do characteristics influence Airbnb listing prices?
- Here, we can structure a regression on categorical variables against price.
    - But we should ensure that numeric categoricals - like number of bedrooms - appear here!
    - Quantitative variables - like square footage - should also appear!
- Remember to create dummies from the categoricals.
- From a manual review of categories, these look like they may be informative (not free text, not arbitrary values, etc)
    - room_type
- TODO: clean up AMENITIES

In [None]:
# List the distinct room_type values to check that the column is a usable categorical.
boston_listings['room_type'].unique()

In [None]:
# Listing columns that get_columns_and_types filed under the 'int' key.
boston_listing_col_types['int']

In [None]:
# Peek at the accommodates column.
boston_listings['accommodates']

In [None]:
# Columns skipped when looking for candidate characteristics.
ignore = [
    'name', 'summary', 'space', 'description', 'neighborhood_overview',
    'host_verifications', 'neighbourhood', 'host_about', 'notes',
    'transit', 'host_location', 'access', 'host_about', 'interaction',
    'zipcode', 'smart_location', 'house_rules',
    'neighbourhood_cleansed', 'city', 'street', 'host_neighbourhood', 'host_name',
]

# Object- and int-typed columns that are neither ignored nor URL fields.
candidate_columns = [
    column
    for column in boston_listing_col_types['object'] + boston_listing_col_types['int']
    if column not in ignore and '_url' not in column
]

# Distinct non-missing values per candidate column.
distinct_values = {
    column: boston_listings[column].dropna().unique()
    for column in candidate_columns
}

# Keep only the columns that actually vary (more than one distinct value).
unique_characteristics = {
    column: values
    for column, values in distinct_values.items()
    if len(values) > 1
}

In [None]:
# Pairwise relationships between price and the room-count columns (complete rows only).
pairplot_columns = ['price', 'bathrooms', 'bedrooms', 'beds']
sns.pairplot(data=boston_listings[pairplot_columns].dropna())

In [None]:
# Price against square footage, using only listings where both values are present.
price_vs_square_feet = boston_listings[['price', 'square_feet']].dropna()
sns.scatterplot(data=price_vs_square_feet, x='square_feet', y='price')

In [None]:
# Show the last listing's values for the columns filed under the 'object' key.
boston_listings[boston_listing_col_types['object']].iloc[-1]

# Question 3: How does time influence Airbnb listing prices?
- Day of week.
- Week of month.
- Month of year.

In [None]:
# Make sure `price` is numeric: strip '$' and thousands separators from string entries
# (format assumed to be like $1,234.00); entries that are not strings are left untouched
# by the regex replace and simply cast to float.
boston_calendar['price'] = (
    boston_calendar['price']
    .replace(r'[$,]', '', regex=True)
    .astype(float)
)

# convert date to datetime (no-op for values that already are timestamps)
boston_calendar['date'] = pd.to_datetime(boston_calendar['date'])

# add weekday, month and year columns for seasonality analysis, using the vectorised
# `.dt` accessor instead of Python-level loops over every calendar row.
# weekday follows the pandas convention: Monday=0 ... Sunday=6, matching `day_map`.
day_map = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
boston_calendar['weekday'] = boston_calendar['date'].dt.weekday
boston_calendar['month'] = boston_calendar['date'].dt.month
boston_calendar['year'] = boston_calendar['date'].dt.year

In [None]:
# Calendar rows with a log-price column added; rows with any missing value are dropped.
boxplot_data = boston_calendar.assign(log_price=lambda x: np.log(x['price'])).dropna()

# Months 9-11 are highlighted in salmon, every other month is drawn in blue.
highlighted_months = [9, 10, 11]
month_palette = {
    month: ('salmon' if month in highlighted_months else 'dodgerblue')
    for month in boxplot_data.month.unique()
}

ax = sns.boxplot(
    data=boxplot_data,
    x='month',
    y='log_price',
    hue='month',
    palette=month_palette,
    legend=False,
    showfliers=False,
)
ax.set_title("Distribution of Log Listing Prices by Calendar Month");
plt.savefig('price_by_month_boxplot.png')

In [None]:
# Weekdays 4 and 5 (Fri and Sat in `day_map`) are highlighted in salmon, the rest in blue.
highlighted_weekdays = [4, 5]
weekday_palette = {
    weekday: ('salmon' if weekday in highlighted_weekdays else 'dodgerblue')
    for weekday in boxplot_data.weekday.unique()
}

ax = sns.boxplot(
    data=boxplot_data,
    x='weekday',
    y='log_price',
    hue='weekday',
    palette=weekday_palette,
    legend=False,
    showfliers=False,
)
ax.set_title("Distribution of Log Listing Prices by Day of the Week");

# Pin the tick positions, then swap the numeric labels for day names.
tick_positions = ax.get_xticks()
ax.set_xticks(tick_positions);
ax.set_xticklabels([day_map[position] for position in ax.get_xticks()]);
plt.savefig('price_by_weekday_boxplot.png')

What other numerical values can we use (and what needs to be filled before we can)?
- host_response_rate (missing around 400 entries)
- host_acceptance_rate (missing around 400 entries)
- square_feet (missing a lot)
- extra people?
- cleaning fee?
- security_deposit?

In [None]:
# Last listing's values for the 'float' columns, skipping columns that are entirely missing.
boston_listings[boston_listing_col_types['float']].dropna(how='all', axis=1).iloc[-1]

# Assemble a model
- Just realizing that the price in boston_listings doesn't say the date for which that listing is active (unless it is just for the "last_scraped" date, but the whole dataset is for a single last_scraped date).
- So... that means we can't use the day/week/month info since it would be a constant if we're only modeling a single date.
- Using the various categorical variables gives a pretty meh fit.

In [None]:
# Target column of the model.
y_label = 'price'

# Characteristics that are one-hot encoded; the numeric counts (bathrooms, bedrooms,
# beds, accommodates) are treated as categories too.
categorical_vars_to_use = [
    'bathrooms', 'bedrooms', 'beds', 'accommodates',
    'property_type', 'room_type', 'cancellation_policy', 'host_identity_verified',
]
# Not used for now: the numeric `square_feet` column and calendar-month dummies.

# Dummy-encode the characteristics and the neighborhood, dropping the first level of each.
characteristic_dummies = pd.get_dummies(
    boston_listings[categorical_vars_to_use],
    columns=categorical_vars_to_use,
    drop_first=True,
)
neighbourhood_dummies = pd.get_dummies(boston_listings.neighbourhood_cleansed, drop_first=True)

# Model matrix: the target first, followed by all dummy columns.
model_data = pd.concat(
    [boston_listings[[y_label]], characteristic_dummies, neighbourhood_dummies],
    axis=1,
)


In [None]:
# sm.OLS(endog=model_data[y_label], exog=model_data.drop(y_label, axis=1)).fit().summary()

In [None]:
# Fit price on every dummy column: two thirds of the rows for training, fixed seed.
feature_labels = list(model_data.columns.drop(y_label))
out = estimate_y_from_X(
    model_data,
    y_label=y_label,
    X_labels=feature_labels,
    train_size=2/3,
    random_state=42,
)

In [None]:
# Calendar rows whose `available` flag equals "t".
boston_calendar[boston_calendar['available'] == "t"]