# Airbnb Data Exploration

Exploratory analysis of Airbnb listings in London.

In [4]:
# imports
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px

In [14]:
# Load the intermediate listings extract. Every column is read with the
# 'unicode' dtype; the columns needed as numbers are cast in a later cell.
listings_df = pd.read_csv(
    "../data/02_intermediate/listings.csv",
    dtype='unicode',
)
listings_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,13913,Holiday London DB Room Let-on going,54730,Alina,,Islington,51.56802,-0.11121,Private room,65,1,21,2020-02-22,0.18,2,359
1,15400,Bright Chelsea Apartment. Chelsea!,60302,Philippa,,Kensington and Chelsea,51.48796,-0.16898,Entire home/apt,100,10,89,2020-03-16,0.71,1,232
2,17402,Superb 3-Bed/2 Bath & Wifi: Trendy W1,67564,Liz,,Westminster,51.52195,-0.14094,Entire home/apt,300,3,42,2019-11-02,0.38,15,307
3,17506,Boutique Chelsea/Fulham Double bed 5-star ensuite,67915,Charlotte,,Hammersmith and Fulham,51.47935,-0.19743,Private room,150,3,0,,,2,362
4,25023,All-comforts 2-bed flat near Wimbledon tennis,102813,Amanda,,Wandsworth,51.44687,-0.21874,Entire home/apt,65,21,35,2020-03-30,0.7,1,15


In [37]:
# Number of distinct hosts appearing in the listings.
len(listings_df['host_id'].unique())

53837

In [33]:
def check_uniqueness(df, column_name):
    """Print the row count of ``df`` next to the distinct-value count of one column.

    Equal numbers mean the column holds no repeated values (used to confirm
    there are no duplicate listings).

    Args:
        df: DataFrame to inspect.
        column_name: Name of the column whose distinct values are counted.

    Returns:
        None. The two counts are written to stdout.
    """
    total_rows = len(df)
    distinct_values = len(df[column_name].unique())
    print(f"Number of rows: {total_rows}")
    print(f"Number of distinct rows: {distinct_values}")

def convert_dtype(df, column_names, dtypes):
    """Cast the given columns of ``df`` to the given dtypes, in place.

    ``column_names[i]`` is cast to ``dtypes[i]``. The frame passed in is
    modified directly and the same object is returned, so the caller's
    original variable sees the converted columns as well.

    Args:
        df: DataFrame whose columns are converted (mutated in place).
        column_names: Iterable of column names to convert.
        dtypes: Iterable of target dtypes, one per entry in ``column_names``.

    Returns:
        The same ``df`` object, after conversion.

    Raises:
        ValueError: If ``column_names`` and ``dtypes`` differ in length.
            Raised before any column is touched.
    """
    column_names = list(column_names)
    dtypes = list(dtypes)
    # zip() would silently stop at the shorter sequence and leave some
    # columns unconverted, so reject mismatched inputs up front.
    if len(column_names) != len(dtypes):
        raise ValueError(
            "column_names and dtypes must have the same length, got "
            + str(len(column_names)) + " and " + str(len(dtypes))
        )
    for col_name, dtype in zip(column_names, dtypes):
        df[col_name] = df[col_name].astype(dtype)
    return df

def custom_bar(df_series, x_label, y_label):
    """Show a dark-themed bar chart of a Series, sorted from largest to smallest.

    The Series is handed to ``px.bar`` as-is, which draws the index along the
    x axis and the values along the y axis; ``x_label`` and ``y_label`` title
    those axes respectively.

    Args:
        df_series: Series to plot; its index supplies the bar categories.
        x_label: Title for the x (category) axis.
        y_label: Title for the y (value) axis.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    fig = px.bar(df_series.sort_values(ascending=False), template='plotly_dark')
    fig.update_layout(
        showlegend=False,
        autosize=False,
        width=1200,
        height=500,
        # Each axis gets its own label (they were previously crossed over).
        # The title font is set through `title.font`, the supported
        # replacement for the deprecated `titlefont` attribute.
        xaxis=dict(
            title=dict(text=x_label, font=dict(size=14)),
            tickmode="array",
        ),
        yaxis=dict(
            title=dict(text=y_label, font=dict(size=14)),
            tickmode="array",
        ),
    )
    fig.show()

def custom_scatter(df, x_col, y_col, color_col, text_col, size_col):
    """Show a dark-themed, labelled scatter plot of ``df``.

    Args:
        df: DataFrame holding the columns to plot.
        x_col: Column plotted on the x axis; also used as the x axis title.
        y_col: Column plotted on the y axis; also used as the y axis title.
        color_col: Column that determines marker colour.
        text_col: Column whose values are drawn as labels above each marker.
        size_col: Column that determines marker size (capped at ``size_max=30``).

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Plot the frame that was passed in, not a notebook-level global, so the
    # helper works for any DataFrame and on a fresh kernel.
    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        color=color_col,
        text=text_col,
        size=size_col,
        size_max=30,
        template='plotly_dark',
    )

    fig.update_traces(textposition='top center')

    fig.update_layout(
        showlegend=False,
        autosize=False,
        width=1200,
        height=800,
        # `title.font` is the supported replacement for the deprecated
        # `titlefont` attribute.
        yaxis=dict(
            title=dict(text=y_col, font=dict(size=14)),
            tickmode="array",
        ),
        xaxis=dict(
            title=dict(text=x_col, font=dict(size=14)),
            tickmode="array",
        ),
    )

    fig.show()

## Host Listings

In [16]:
# Each listing id should appear exactly once.
check_uniqueness(df=listings_df, column_name='id')

Number of rows: 86358
Number of distinct rows: 86358


In [22]:
# convert_dtype casts the columns on listings_df itself and returns that same
# frame, so `df` is just another name for listings_df after this cell.
int_columns = ['calculated_host_listings_count', 'price', 'availability_365']
df = convert_dtype(listings_df, int_columns, ['int'] * len(int_columns))

In [29]:
# Per-neighbourhood averages of the three numeric columns. The columns are
# selected explicitly because the rest of the frame is text, and recent pandas
# versions no longer silently drop non-numeric columns from groupby means.
numeric_columns = ['price', 'calculated_host_listings_count', 'availability_365']
neighbourhood_avgs = (
    listings_df
    .groupby('neighbourhood')[numeric_columns]
    .mean()
    .rename(columns={
        'price': 'Avg. Listing Price (£)',
        'calculated_host_listings_count': 'Avg. No of Host Owned Listings',
        'availability_365': 'Yearly Availability (out 365 days)',
    })
)
# Expose the index as a regular column so plots can reference it by name.
neighbourhood_avgs['neighbourhood'] = neighbourhood_avgs.index
neighbourhood_avgs.head()

Unnamed: 0_level_0,Avg. Listing Price (£),Avg. No of Host Owned Listings,Yearly Availability (out 365 days),neighbourhood
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Barking and Dagenham,59.833333,2.515152,136.941919,Barking and Dagenham
Barnet,93.64012,11.7,137.138922,Barnet
Bexley,57.452107,2.233716,145.83908,Bexley
Brent,102.219194,18.886372,119.416123,Brent
Bromley,65.461774,2.102446,131.181957,Bromley


In [30]:
# Average listing price per neighbourhood.
custom_bar(
    neighbourhood_avgs['Avg. Listing Price (£)'],
    x_label="Airbnb London Neighbourhood",
    y_label="Avg. Listing Price (£)",
)

In [31]:
# Average number of listings owned by the hosts in each neighbourhood.
custom_bar(
    neighbourhood_avgs['Avg. No of Host Owned Listings'],
    x_label="Airbnb London Neighbourhood",
    y_label="Avg. No of Host Owned Listings",
)

In [32]:
# Average yearly availability per neighbourhood.
custom_bar(
    neighbourhood_avgs['Yearly Availability (out 365 days)'],
    x_label="Airbnb London Neighbourhood",
    y_label="Yearly Availability (out 365 days)",
)

In [34]:
# Host-owned listings vs. availability per neighbourhood, with marker size
# driven by the average listing price.
custom_scatter(
    df=neighbourhood_avgs,
    x_col="Avg. No of Host Owned Listings",
    y_col="Yearly Availability (out 365 days)",
    color_col="neighbourhood",
    text_col="neighbourhood",
    size_col='Avg. Listing Price (£)',
)

In [154]:
# Preview the listings again before breaking them down by room type.
listings_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,13913,Holiday London DB Room Let-on going,54730,Alina,,Islington,51.56802,-0.11121,Private room,65,1,21,2020-02-22,0.18,2,359
1,15400,Bright Chelsea Apartment. Chelsea!,60302,Philippa,,Kensington and Chelsea,51.48796,-0.16898,Entire home/apt,100,10,89,2020-03-16,0.71,1,232
2,17402,Superb 3-Bed/2 Bath & Wifi: Trendy W1,67564,Liz,,Westminster,51.52195,-0.14094,Entire home/apt,300,3,42,2019-11-02,0.38,15,307
3,17506,Boutique Chelsea/Fulham Double bed 5-star ensuite,67915,Charlotte,,Hammersmith and Fulham,51.47935,-0.19743,Private room,150,3,0,,,2,362
4,25023,All-comforts 2-bed flat near Wimbledon tennis,102813,Amanda,,Wandsworth,51.44687,-0.21874,Entire home/apt,65,21,35,2020-03-30,0.7,1,15


In [155]:
# List the distinct room-type labels present in the listings.
listings_df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Hotel room', 'Shared room'],
      dtype=object)

In [159]:
# Keep only the two columns needed for the room-type breakdown.
listing_type_df = listings_df.loc[:, ['neighbourhood', 'room_type']]
listing_type_df.head()

Unnamed: 0,neighbourhood,room_type
0,Islington,Private room
1,Kensington and Chelsea,Entire home/apt
2,Westminster,Entire home/apt
3,Hammersmith and Fulham,Private room
4,Wandsworth,Entire home/apt


In [162]:
# Build one indicator column per room type, then attach those columns to the
# neighbourhood/room_type frame by row index.
onehot_encoding = pd.get_dummies(listing_type_df.loc[:, 'room_type'])
listing_type_encoded = listing_type_df.join(onehot_encoding, how='left')
listing_type_encoded.head()

In [189]:
# Count listings of each room type per neighbourhood. The text column
# 'room_type' is dropped before aggregating: only the indicator columns should
# be summed, and newer pandas no longer discards non-numeric columns on its
# own, which would break the row-wise total below.
listing_type_bars_df = (
    listing_type_encoded
    .drop(columns='room_type')
    .groupby('neighbourhood')
    .sum()
)
# Total listings per neighbourhood = sum across the room-type columns.
listing_type_bars_df['total_listings'] = listing_type_bars_df.sum(axis=1)
listing_type_bars_df = listing_type_bars_df.sort_values('total_listings', ascending=False)
listing_type_bars_df.head()

Unnamed: 0_level_0,Entire home/apt,Hotel room,Private room,Shared room,total_listings
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Westminster,7389.0,113.0,2371.0,70.0,9943.0
Tower Hamlets,4067.0,40.0,4153.0,61.0,8321.0
Camden,4125.0,98.0,1977.0,43.0,6243.0
Hackney,3382.0,13.0,2786.0,29.0,6210.0
Kensington and Chelsea,4853.0,182.0,988.0,56.0,6079.0


In [195]:
# Listing counts per neighbourhood, one bar trace per room type. The
# 'total_listings' column is left out so it is not drawn as its own trace.
fig = px.bar(
    listing_type_bars_df[['Entire home/apt', 'Hotel room', 'Private room', 'Shared room']],
    template='plotly_dark',
)
fig.update_layout(
    autosize=False,
    width=1400,
    height=500,
    # `title.font` is the supported replacement for the deprecated
    # `titlefont` attribute.
    yaxis=dict(
        title=dict(text="Listing Count", font=dict(size=14)),
        tickmode="array",
    ),
    xaxis=dict(
        title=dict(text="Airbnb London Neighbourhoods", font=dict(size=14)),
        tickmode="array",
    ),
)
fig.show()

In [197]:
# Attach the room-type counts to the neighbourhood averages; both frames are
# indexed by neighbourhood.
merged_df = neighbourhood_avgs.join(listing_type_bars_df, how='left')
merged_df.head()

Unnamed: 0_level_0,Avg. Listing Price (£),Avg. No of Host Owned Listings,Yearly Availability (out 365 days),neighbourhood,Entire home/apt,Hotel room,Private room,Shared room,total_listings
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Barking and Dagenham,59.833333,2.515152,136.941919,Barking and Dagenham,110.0,3.0,279.0,4.0,396.0
Barnet,93.64012,11.7,137.138922,Barnet,719.0,32.0,905.0,14.0,1670.0
Bexley,57.452107,2.233716,145.83908,Bexley,76.0,2.0,180.0,3.0,261.0
Brent,102.219194,18.886372,119.416123,Brent,1227.0,20.0,1323.0,35.0,2605.0
Bromley,65.461774,2.102446,131.181957,Bromley,239.0,15.0,394.0,6.0,654.0


In [202]:
# Compare the average number of listings each host owns with the number of
# entire-home listings in each neighbourhood.
fig = px.scatter(
    merged_df,
    x="Avg. No of Host Owned Listings",
    y="Entire home/apt",
    color="neighbourhood",
    text="neighbourhood",
    template='plotly_dark',
)
fig.update_traces(textposition='top center')
fig.update_layout(
    showlegend=False,
    autosize=False,
    width=1200,
    height=500,
    # The y axis plots the "Entire home/apt" counts, so it is titled to match
    # (the previous title described yearly availability, which is not plotted).
    # `title.font` replaces the deprecated `titlefont` attribute.
    yaxis=dict(
        title=dict(text="No. of Entire home/apt Listings", font=dict(size=14)),
        tickmode="array",
    ),
    xaxis=dict(
        title=dict(text="Avg. No of Host Owned Listings", font=dict(size=14)),
        tickmode="array",
    ),
)

fig.show()