In [1]:
import pandas as pd
import altair as alt
import re

alt.data_transformers.disable_max_rows()

def preprocess_data(file_path):
    """Load a listings CSV and return only the rows usable by the dashboard.

    Steps, in order:
      1. Read the CSV at ``file_path`` (anything ``pd.read_csv`` accepts).
      2. Strip ``$`` and ``,`` from ``price`` and convert it to a number;
         values that cannot be parsed become NaN.
      3. Parse ``last_review`` as a datetime; unparseable values become NaT.
      4. Drop rows with a missing value in any required column.
      5. Keep only rows whose price is strictly positive.

    Returns:
        A pandas DataFrame containing the surviving rows.
    """
    required_columns = [
        'price', 'room_type', 'number_of_reviews', 'neighbourhood_group',
        'neighbourhood', 'last_review', 'host_id', 'longitude', 'latitude',
    ]

    listings = pd.read_csv(file_path)

    # Currency symbols and thousands separators are removed before the numeric cast.
    price_text = listings['price'].replace('[$,]', '', regex=True)
    listings['price'] = pd.to_numeric(price_text, errors='coerce')
    listings['last_review'] = pd.to_datetime(listings['last_review'], errors='coerce')

    listings = listings.dropna(subset=required_columns)
    has_positive_price = listings['price'] > 0
    return listings[has_positive_price]

# Load and clean the listings export; every chart below reads from `data`.
data = preprocess_data("listings (1).csv")
# Display the column index of the cleaned frame as the cell output.
data.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

In [3]:
WIDTH, HEIGHT = 800, 1000
# Date selection: calendar-day bounds of the review dates (start of the first
# day, end of the last day).
# NOTE(review): WIDTH, HEIGHT, min_date and max_date are not referenced by any
# chart in the visible cells; they are kept in case another cell relies on them.
min_date = data['last_review'].min().replace(hour=0, minute=0, second=0, microsecond=0)
max_date = data['last_review'].max().replace(hour=23, minute=59, second=59, microsecond=999999)

# Define selections
# Clicking a legend entry selects one neighbourhood group.
neighbourhood_selection = alt.selection_point(fields=['neighbourhood_group'], bind='legend')

# Drag-to-select rectangle on the chart that owns this param.
brush = alt.selection_interval(encodings=['x','y'])

# Price range sliders (min and max)
# The slider ceiling is rounded up to the next multiple of the step so that the
# most expensive listing (including a fractional price) is always reachable.
PRICE_STEP = 10
price_ceiling = (int(data['price'].max()) // PRICE_STEP + 1) * PRICE_STEP
price_min_slider = alt.binding_range(min=0, max=price_ceiling, step=PRICE_STEP, name='Minimum Price (USD):')
price_max_slider = alt.binding_range(min=0, max=price_ceiling, step=PRICE_STEP, name='Maximum Price (USD):')
# Every widget-bound param needs an explicit initial value: without one the
# underlying signal starts undefined, every comparison against it is false, and
# the filtered charts render empty until the user touches each widget.
price_min = alt.param(value=0, bind=price_min_slider)
price_max = alt.param(value=price_ceiling, bind=price_max_slider)
# Room type selector ('All' means no room-type filtering)
selection_room_type = alt.param(
    value='All',
    bind=alt.binding_select(
        options=['All'] + sorted(data['room_type'].unique().tolist()),
        name='Select Room Type:'
    ),
    name='room_type'
)

# Clicking a legend entry selects one host name.
host_selection = alt.selection_point(fields=['host_name'], bind='legend')

# Lower bound on a listing's review count; starts at 0.
reviews_min_slider = alt.binding_range(min=0, max=300, step=1, name='Minimum Number of Reviews:')
reviews_min = alt.param(value=0, bind=reviews_min_slider)

# Neighbourhood-group dropdown ('All' means no filtering)
neighborhood_dropdown = alt.binding_select(options=['All'] + sorted(data['neighbourhood_group'].unique().tolist()), name='Select Neighborhood Group:')
neighborhood_select = alt.param(value='All', bind=neighborhood_dropdown)

# Free-text host search. With empty=False nothing matches until a host name is
# entered, so a chart filtered by this selection starts out empty by design.
search_input = alt.selection_point(
    fields=['host_name'],
    empty=False,
    bind=alt.binding(
        input='search',
        placeholder="Enter host name",
        name='Search Host: '
    )
)


# Mean listing price per last-review date, coloured by neighbourhood group.
# Despite the variable name, the marks are points. This chart also owns all the
# interactive params, so the widgets are declared here.
price_tooltips = [
    alt.Tooltip('neighbourhood_group:N', title='Neighbourhood Group'),
    alt.Tooltip('last_review:T', title='Review Date', format='%Y-%m-%d'),
    alt.Tooltip('mean(price):Q', title='Average Price', format='$.2f'),
]
# Keep only rows whose price lies between the two slider values (inclusive).
within_price_range = (alt.datum.price >= price_min) & (alt.datum.price <= price_max)

barchart = (
    alt.Chart(data)
    .properties(
        title='Daily Average Price for Selected Time Range (Point Selection Mode)',
        width=600,
        height=500,
    )
    .mark_point()
    .encode(
        x=alt.X('last_review:T', title='Last Review Date'),
        y=alt.Y('mean(price):Q', title='Average Price (USD)', scale=alt.Scale(zero=False)),
        color=alt.Color(
            'neighbourhood_group:N',
            legend=alt.Legend(orient='right', title='Neighbourhood Group'),
        ),
        tooltip=price_tooltips,
    )
    .add_params(
        neighbourhood_selection,
        brush,
        price_min,
        price_max,
        selection_room_type,
        neighborhood_select,
        reviews_min,
        host_selection,
    )
    .transform_filter(neighbourhood_selection)
    .transform_filter(within_price_range)
)

# Both reference lines share the same encoding and the brush filter; they differ
# only in styling and in whether the legend selection is applied as well.
mean_rule_base = (
    alt.Chart(data)
    .encode(
        y='mean(price):Q',  # the rule sits at the mean price of the remaining rows
        size=alt.SizeValue(3),
    )
    .transform_filter(brush)
)

# Solid red rule: mean price of the brushed rows across all neighbourhood groups.
overall_mean_rule_ng = mean_rule_base.mark_rule(color='firebrick')

# Dashed blue rule: mean price of the brushed rows in the selected neighbourhood group.
selected_mean_rule_ng = (
    mean_rule_base
    .mark_rule(color='darkblue', strokeDash=[4, 2])
    .transform_filter(neighbourhood_selection)
)

# Stack the point chart and both rules into a single layered view.
price_layers = [barchart, overall_mean_rule_ng, selected_mean_rule_ng]
point_chart = alt.layer(*price_layers).properties(
    title='Monthly Average Housing Price Bar Chart (Red: Overall Mean, Blue: Selected Mean)'
)

# Pie chart: share of listings per price bucket.
# Rows are first narrowed by the brush, the room-type dropdown, the
# neighbourhood-group dropdown and the legend selection; the survivors are then
# bucketed by price, counted per bucket, and each count is divided by the total.
pie_chart = alt.Chart(data).mark_arc().encode(
    theta=alt.Theta('count:Q', stack=True),  # Calculate angle based on listing count
    color=alt.Color('price_category:N', legend=alt.Legend(
        title='Price Range',
        orient='bottom',  
        columns=2
        )), 
    tooltip=[
        alt.Tooltip('price_category:N', title='Price Range'),
        alt.Tooltip('count:Q', title='House Count'),
        alt.Tooltip('total_count:Q', title='Total House Count'),  
        alt.Tooltip('percentage:Q', title='Percentage', format='.2%')  
    ]
).transform_filter(
    # Keep only rows inside the brushed interval.
    brush 
).transform_filter(
    # 'All' disables the room-type filter; otherwise keep rows of the chosen room type.
    (alt.expr.if_(selection_room_type == 'All', True, alt.datum.room_type == selection_room_type))
).transform_filter(
    # 'All' disables the dropdown filter; otherwise keep rows of the chosen neighbourhood group.
    (alt.expr.if_(neighborhood_select == 'All', True, alt.datum.neighbourhood_group == neighborhood_select))
).transform_filter(
    # Also honour the neighbourhood group picked through the legend.
    neighbourhood_selection
).transform_calculate(
    # Label each row with its price bucket; upper bounds are inclusive.
    price_category=''' 
        datum.price <= 200 ? "0-200" :
        (datum.price > 200 && datum.price <= 400) ? "200-400" :
        (datum.price > 400 && datum.price <= 600) ? "400-600" :
        (datum.price > 600 && datum.price <= 800) ? "600-800" :
        (datum.price > 800 && datum.price <= 1000) ? "800-1000" :
        "1000+"
    '''
).transform_aggregate(
    # One output row per price bucket holding its listing count.
    count='count()',  
    groupby=['price_category']
).transform_joinaggregate(
    # Attach the grand total to every bucket row so a share can be computed.
    total_count='sum(count)'  
).transform_calculate(
    percentage='datum.count / datum.total_count'  
).properties(
    title='Room Type Price Range Distribution',
    width= 300,
    height=200
)

# Bubble chart: hosts ranked by how many of their listings survive the filters.
# It is derived from `barchart`, so it starts from that chart's params, filters
# and properties, then overrides the mark, the x/y/color/tooltip encodings, the
# title and the size, and appends its own transforms.
# NOTE(review): the title mentions 2024 Q4, but the only date restriction
# visible here is the brush — confirm the intended time window.
bubble_chart = barchart.mark_circle().encode(
    x=alt.X('host_name:N', title='Host', sort='-y',
            axis=alt.Axis(labelAngle=-45, labelAlign="right", labelBaseline="middle")),
    y=alt.Y('total_listings:Q', title='Listing Count'),
    size=alt.Size('total_listings:Q', scale=alt.Scale(range=[50, 500]), title='Listing Count'),
    color=alt.Color('host_name:N', legend=alt.Legend(title='Host')),
    tooltip=[
        alt.Tooltip('host_name:N', title='Host'),
        alt.Tooltip('total_listings:Q', title='House Count')
    ]
).transform_filter(
    # Keep only rows inside the brushed interval.
    brush
).transform_filter(
    # Strictly more reviews than the slider value.
    (alt.datum.number_of_reviews > reviews_min)
).transform_aggregate(
    # One row per host with the number of remaining listings.
    total_listings='count()',
    groupby=['host_id', 'host_name']
).transform_window(
    # Rank hosts from most to fewest listings.
    rank='rank()',
    sort=[alt.SortField('total_listings', order='descending')]
).transform_filter(
    # Keep ranks 1-10. NOTE(review): tied hosts presumably share a rank, so more
    # than ten hosts may be shown — confirm this is acceptable.
    alt.datum.rank <= 10
).properties(
    title='2024 Q4 Active Hosts (by Listing Count)',
    width= 600,
    height= 300
)

# Listing points: one circle per listing at its longitude/latitude, restricted
# to the rows matched by the host search selection.
listing_tooltips = [
    'name:N',
    'room_type:N',
    alt.Tooltip('price:Q', format='$,.0f'),
    alt.Tooltip('number_of_reviews:Q', format=',d'),
    'neighbourhood_group:N',
    'host_name:N',
]

map_chart = (
    alt.Chart(data)
    .properties(
        title='Geographical distribution of property selected by landlord',
        width=300,
        height=500,
    )
    .mark_circle(size=60, opacity=0.7)
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        color=alt.Color(
            'neighbourhood_group:N',
            legend=alt.Legend(orient='right', title='Neighbourhood Group'),
        ),
        tooltip=listing_tooltips,
    )
    .add_params(search_input)
    .transform_filter(search_input)
)

# Each row keeps its own colour scale and its own legends, because the charts in
# a row colour by different fields.
independent_legends = dict(color='independent', size='independent')

# Top row: layered price chart next to the price-range pie.
top_row = alt.hconcat(point_chart, pie_chart)
pic = top_row.resolve_legend(**independent_legends).resolve_scale(color='independent')

# Bottom row: host bubble chart next to the listing map.
bottom_row = alt.hconcat(bubble_chart, map_chart, spacing=50)
pic2 = bottom_row.resolve_legend(**independent_legends).resolve_scale(color='independent')

# Full dashboard: the two rows stacked vertically with shared view/axis styling.
system = (
    alt.vconcat(pic, pic2, spacing=50)
    .configure_view(strokeWidth=0, clip=False)
    .configure_axisX(labelAngle=-30, labelLimit=100)
)

# Write the standalone HTML, report it, and show the dashboard as the cell output.
system.save('System_C.html')
print("System C dashboard saved as 'System_C.html'")
system

System C dashboard saved as 'System_C.html'


In [5]:

# Read the saved dashboard HTML.
with open('System_C.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

# Task description (English) displayed above the dashboard.
# NOTE(review): task 3 labels September 1 - December 31 as "Q4"; a calendar Q4
# starts on October 1 — confirm the intended date range before changing the text.
tasks_content = '''
    <h3>Tasks</h3>
    <ol>
    <li><b>Find the neighbourhood_group with the highest average housing price in 2024 (from January 1, 2024, to December 31, 2024).</b><br></li>

    <li><b>Find the proportion of “Private room” listings priced between 0-200 in the highest-priced neighbourhood_group of 2024.</b><br>
    </li>

    <li><b>Identify the top three host names based on the number of their listings that received more than 50 reviews during Q4 of 2024 (September 1 - December 31).</b><br></li>

    <li><b>Display the geographical distribution of listings for the host with the highest number of listings identified in Task 3.</b><br></li>
    </ol>
'''

# Insert the task list right after the opening <body> tag.
body_tag = '<body>'
if tasks_content in html_content:
    # Re-running this cell must not stack a second copy of the task list.
    modified_html = html_content
    print("Tasks already present in System_C.html; nothing changed.")
elif body_tag in html_content:
    # Plain string replacement is used instead of a regex substitution so that
    # the inserted HTML is taken literally (a regex replacement template would
    # interpret backslashes and group references). Only the first tag is used.
    modified_html = html_content.replace(body_tag, body_tag + tasks_content, 1)

    # Write the modified HTML back to the same file.
    with open('System_C.html', 'w', encoding='utf-8') as f:
        f.write(modified_html)

    print("Tasks added to System_C.html successfully!")
else:
    # Fail loudly instead of reporting success when nothing could be inserted.
    raise ValueError("No <body> tag found in System_C.html; tasks were not inserted.")

Tasks added to system_C.html successfully!
