In [22]:
import pandas as pd
import altair as alt
import altair_viewer
import numpy as np
import datetime
import re
# Make "vegafusion" the active Altair data transformer for every chart built
# in the cells below (the call echoes the registry state as the cell output).
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [24]:
# Load the listings table. The path is relative, so the CSV must sit in the
# notebook's working directory. Later cells read the columns last_review,
# neighbourhood_group, host_id, host_name, number_of_reviews and price.
df = pd.read_csv("listings (1).csv")

In [26]:
# Parse the review dates (unparseable values become NaT via errors="coerce"),
# then keep only rows that have every column the dashboard relies on.
required_columns = ["last_review", "neighbourhood_group", "host_id", "host_name"]
df = (
    df
    .assign(last_review=pd.to_datetime(df["last_review"], errors="coerce"))
    .dropna(subset=required_columns)
)

# Earliest / latest review date as "YYYY-MM-DD" strings.
review_dates = df["last_review"]
date_min, date_max = (
    bound.strftime("%Y-%m-%d")
    for bound in (review_dates.min(), review_dates.max())
)

# --- Interactive controls ---------------------------------------------------
# Two date inputs whose initial values are date_min / date_max, i.e. the full
# span of review dates found in the data.
start_date = alt.param(
    name="start_date",
    bind=alt.binding(input="date", name="Start Date: "),
    value=date_min,
)

end_date = alt.param(
    name="end_date",
    bind=alt.binding(input="date", name="End Date: "),
    value=date_max,
)

# Slider from 0 to 500 in steps of 10, starting at 0.
min_reviews = alt.param(
    name="min_reviews",
    bind=alt.binding_range(min=0, max=500, step=10, name="Min Reviews: "),
    value=0,
)

# Click-driven point selection keyed on host_id. `empty=True` makes an empty
# selection behave as "everything selected". The boolean replaces the legacy
# string "all", which Altair 5 still maps to True but only with a deprecation
# warning.
host_selection = alt.selection_point(
    fields=["host_id"],
    empty=True,
    on="click",
)


# Base chart used by the host charts below: keep listings whose last review lies
# between the two date params and whose review count is strictly greater than
# the slider value, then collapse to one row per (host_id, host_name) carrying
# the listing count and the mean price.
host_filter = (
    (alt.datum.last_review >= alt.expr.toDate(start_date))
    & (alt.datum.last_review <= alt.expr.toDate(end_date))
    & (alt.datum.number_of_reviews > min_reviews)
)

agg_data = alt.Chart(df).transform_filter(host_filter).transform_aggregate(
    groupby=["host_id", "host_name"],
    num_listings="count()",
    avg_price="mean(price)",
)


# Horizontal bars: the ten hosts with the most listings after filtering.
# Bars matched by host_selection are drawn orange, the others steelblue.
bar_chart = (
    agg_data
    .transform_window(
        # row_number() rather than rank(): rank() gives tied hosts the same
        # rank, so "rank <= 10" could let through far more than ten bars
        # (e.g. every host tied on a single listing). host_id is a secondary
        # sort key so the pick among ties is stable.
        rank="row_number()",
        sort=[
            alt.SortField("num_listings", order="descending"),
            alt.SortField("host_id", order="ascending"),
        ]
    )
    .transform_filter("datum.rank <= 10")
    .mark_bar()
    .encode(
        x=alt.X('num_listings:Q', title='Number of Listings'),
        y=alt.Y('host_name:N', sort='-x', title='Host'),
        tooltip=['host_id:N', 'host_name:N', 'num_listings:Q', 'avg_price:Q'],
        color=alt.condition(
            host_selection,
            alt.value("orange"),
            alt.value("steelblue")
        )
    )
    .add_params(start_date, end_date, min_reviews, host_selection)
    .properties(title="Top 10 Hosts by Number of Listings", width=400, height=300)
)


# Horizontal bars: the ten hosts with the highest mean listing price after
# filtering. All bars share one fixed colour.
price_chart = (
    agg_data
    .transform_window(
        # row_number() rather than rank(): hosts tied on avg_price share a
        # rank under rank(), which can push the chart past ten bars.
        # host_id is a secondary sort key so the pick among ties is stable.
        rank="row_number()",
        sort=[
            alt.SortField("avg_price", order="descending"),
            alt.SortField("host_id", order="ascending"),
        ]
    )
    .transform_filter("datum.rank <= 10")
    .mark_bar()
    .encode(
        x=alt.X('avg_price:Q', title='Average Price'),
        y=alt.Y('host_name:N', sort='-x', title='Host'),
        tooltip=['host_name:N', 'num_listings:Q', 'avg_price:Q'],
        color=alt.value("steelblue")
    )
    .add_params(start_date, end_date, min_reviews)
    .properties(title="Top 10 Hosts by Average Price", width=400, height=300)
)


# Listings per neighbourhood group. The date / review-count filter is applied
# first, then host_selection narrows the rows to the clicked host (an empty
# selection keeps every host).
neigh_chart = (
    alt.Chart(df)
    .transform_filter(
        (alt.datum.last_review >= alt.expr.toDate(start_date)) &
        (alt.datum.last_review <= alt.expr.toDate(end_date)) &
        (alt.datum.number_of_reviews > min_reviews)
    )
    .transform_filter(host_selection)
    # Aggregate per neighbourhood group only. Also grouping by host produced
    # one stacked segment per host whenever no host was selected, so the
    # tooltip showed a single host's count instead of the bar's total. With a
    # host selected the result is unchanged.
    .transform_aggregate(
        count='count()',
        groupby=['neighbourhood_group']
    )
    .mark_bar()
    .encode(
        x=alt.X('count:Q', title='Number of Listings'),
        y=alt.Y('neighbourhood_group:N', sort='-x', title='Neighborhood Group'),
        color=alt.Color(
            'neighbourhood_group:N',
            legend=alt.Legend(title="Neighborhood Group", orient="right")
        ),
        tooltip=['neighbourhood_group:N', 'count:Q']
    )
    .add_params(start_date, end_date, min_reviews, host_selection)
    .properties(title="Listings by Neighborhood Group (Selected Host)", width=800, height=200)
)


# Layout: the two host charts sit side by side on the top row and the
# neighbourhood chart spans the row underneath.
top_row = bar_chart | price_chart
layout = top_row & neigh_chart

# Independent colour legends per view, plus legend / view sizing tweaks.
dashboard = (
    layout
    .resolve_legend(color="independent")
    .configure_legend(labelLimit=0, columns=1, symbolLimit=500)
    .configure_view(continuousWidth=800)
)

dashboard.show()

In [21]:
# Export the dashboard to an HTML file in the working directory; the next
# cell reads this file back and edits it.
dashboard.save('systemB_part2.html')

In [23]:

# Post-process the exported page: insert the task descriptions right after the
# opening <body> tag so they appear above the charts.
with open('systemB_part2.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

tasks_content = '''
    <h3>Tasks</h3>
    <ol start="3">
        <li><b>Identify the top three host names based on the number of their listings that received more than 50 reviews during Q4 of 2024 (October 1 - December 31).</b><br></li>

        <li><b>Display the geographical distribution of listings for the host with the highest number of listings identified in Task 3.</b><br></li>
    </ol>
'''

body_tag = '<body>'
if tasks_content in html_content:
    # Already injected (cell re-run without re-saving): do not stack a copy.
    modified_html = html_content
elif body_tag in html_content:
    # Plain str.replace limited to the first match. re.sub would treat the
    # replacement as a template (backslash escapes / group references) and
    # would rewrite every "<body>" occurrence in the page.
    modified_html = html_content.replace(body_tag, body_tag + tasks_content, 1)
else:
    # Fail visibly instead of rewriting the file unchanged.
    raise ValueError("No <body> tag found in systemB_part2.html; tasks were not inserted")

with open('systemB_part2.html', 'w', encoding='utf-8') as f:
    f.write(modified_html)
