In [2]:
import pandas as pd
import numpy as np

# plot
import plotly.express as px
import plotly.graph_objects as go

#Import machine learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# import xgboost

from sklearn.model_selection import train_test_split #split
from sklearn.metrics import r2_score, mean_squared_error #metrics

In [3]:
# Primary tables, read from the relative ../data directory:
#   listings_df - info about the house, location, host, etc.
#   reviews_df  - details of the reviews for the entries in listings_df
#   calendar_df - calendar (house availability)
listings_df = pd.read_csv("../data/listings.csv")
reviews_df = pd.read_csv("../data/reviews.csv")
calendar_df = pd.read_csv("../data/calendar.csv")

In [4]:
# Extended tables (note the space in the file names):
#   listings2_df - more details for the host listings
#   reviews2_df  - more details for the reviews
listings2_df = pd.read_csv("../data/listings 2.csv")
reviews2_df = pd.read_csv("../data/reviews 2.csv")

## Cleaning the Data

1. Cleaning listings dataset

In [7]:
# Show the column labels available in the extended listings table.
listings2_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_searched', 'last_scraped',
       'name', 'description', 'neighborhood_overview', 'picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
       'availability_

In [10]:
# Preview the first rows of the location/region columns (plus reviews_per_month)
# to compare the free-text neighbourhood with the structured region fields.
listings2_df.loc[
    :,
    [
        'neighbourhood',
        'region_id',
        'region_name',
        'region_parent_id',
        'region_parent_name',
        'region_parent_parent_id',
        'region_parent_parent_name',
        'reviews_per_month',
    ],
].head()

Unnamed: 0,neighbourhood,region_id,region_name,region_parent_id,region_parent_name,region_parent_parent_id,region_parent_parent_name,reviews_per_month
0,"Orange, New South Wales, Australia",16150,Orange,1,New South Wales,,,5.29
1,"Orange, New South Wales, Australia",16150,Orange,1,New South Wales,,,1.4
2,,16200,Parkes,1,New South Wales,,,1.93
3,"Parramatta, New South Wales, Australia",16250,Parramatta,1,New South Wales,,,0.65
4,,16250,Parramatta,1,New South Wales,,,


In [11]:
# Column groups used to slice the listings table by theme.

# Listing / host identification and stay-length limits.
accom_host_col = [
    'id',
    'name',
    'host_id',
    'host_name',
    'host_neighbourhood',
    'minimum_nights',
    'maximum_nights',
]

# Region, coordinates, property characteristics and price.
accom_info_col = [
    'region_id',
    'region_name',
    'region_parent_id',
    'region_parent_name',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
]

# Review volume, recency and the individual review scores.
accom_review_col = [
    'number_of_reviews',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

In [6]:
# Fraction of missing values in each column (0.0 = none missing, 1.0 = all missing).
listings2_df.isna().mean()

id                           0.000000
listing_url                  0.000000
scrape_id                    0.000000
last_searched                0.247606
last_scraped                 0.000000
                               ...   
region_parent_id             0.000000
region_parent_name           0.000000
region_parent_parent_id      1.000000
region_parent_parent_name    1.000000
reviews_per_month            0.168133
Length: 80, dtype: float64

In [31]:
# Strip leading/trailing whitespace from the neighbourhood names.
# This overwrites the column on listings_df itself; re-running the cell is harmless.
listings_df['neighbourhood'] = listings_df['neighbourhood'].str.strip()

Unnamed: 0,neighbourhood_group,accom_cnts
1,New South Wales,63530
7,Victoria,38770
4,Queensland,25815
8,Western Australia,11945
5,South Australia,7009
6,Tasmania,4921
0,Australian Capital Territory,1147
2,Northern Territory,755
3,Other Territories,22


In [None]:
# New South Wales listings with a derived price_per_day column
# (price divided by minimum_nights).
# Built with .loc + .assign so the new column is added to an independent frame
# rather than assigned onto a filtered slice of listings_df, which raises
# SettingWithCopyWarning and may not stick.
# NOTE(review): rows with minimum_nights == 0 would produce inf here - confirm
# the data never contains them.
listings_df_1 = (
    listings_df
    .loc[listings_df['neighbourhood_group'] == 'New South Wales']
    .assign(price_per_day=lambda nsw: nsw['price'] / nsw['minimum_nights'])
)

In [36]:
# Mean price per day and mean coordinates for each neighbourhood in
# listings_df_1, most expensive neighbourhoods first.
# The aggregations are given as the string 'mean' (via named aggregation)
# instead of np.mean: passing the NumPy callable to groupby.agg emits a
# FutureWarning on recent pandas and its meaning is scheduled to change.
(
    listings_df_1
    .groupby('neighbourhood')
    .agg(
        mean_price_per_day=('price_per_day', 'mean'),
        latitude=('latitude', 'mean'),
        longitude=('longitude', 'mean'),
    )
    .reset_index()
    .sort_values('mean_price_per_day', ascending=False)
)

Unnamed: 0,neighbourhood,mean_price_per_day,latitude,longitude
144,Wollondilly,328.682639,-34.137358,150.631163
116,Singleton,323.512766,-32.698130,151.197655
108,Port Stephens,302.030687,-32.729392,152.122679
30,Cessnock,287.910638,-32.809762,151.313971
102,Palerang,279.260789,-35.335802,149.662611
...,...,...,...,...
21,Burwood,60.052270,-33.877686,151.102466
2,Ashfield,59.237469,-33.887473,151.128652
11,Blacktown,58.551380,-33.748125,150.888201
87,Marrickville,58.180776,-33.902482,151.163693


In [41]:
# Number of listings (non-null ids) per neighbourhood_group, largest first.
(
    listings_df
    .groupby('neighbourhood_group')['id']
    .count()
    .reset_index(name='accom_cnts')
    .sort_values('accom_cnts', ascending=False)
)

Unnamed: 0,neighbourhood_group,accom_cnts
1,New South Wales,63530
7,Victoria,38770
4,Queensland,25815
8,Western Australia,11945
5,South Australia,7009
6,Tasmania,4921
0,Australian Capital Territory,1147
2,Northern Territory,755
3,Other Territories,22


In [40]:
# All listings with a derived price_per_day column (price / minimum_nights).
# .assign returns a new frame; the previous `listings_df_2 = listings_df` was
# only an alias, so adding the column silently modified listings_df as well.
# NOTE(review): rows with minimum_nights == 0 would produce inf here - confirm
# the data never contains them.
listings_df_2 = listings_df.assign(
    price_per_day=listings_df['price'] / listings_df['minimum_nights']
)

# Mean price per day for each neighbourhood_group, highest first.
(
    listings_df_2
    .groupby('neighbourhood_group')['price_per_day']
    .mean()
    .reset_index(name='mean_price_per_day')
    .sort_values('mean_price_per_day', ascending=False)
)

Unnamed: 0,neighbourhood_group,mean_price_per_day
3,Other Territories,171.901218
1,New South Wales,142.062455
7,Victoria,135.475943
6,Tasmania,134.911724
5,South Australia,126.899885
4,Queensland,126.06353
8,Western Australia,106.212791
2,Northern Territory,103.739835
0,Australian Capital Territory,96.670127


In [None]:
# df.corr()