In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import gmaps
import scipy.stats as stats
from scipy.stats import linregress

from config import api_key

In [3]:
listings_df = pd.read_csv('./listings.csv')
listings_df.describe()

Unnamed: 0,id,scrape_id,host_id,host_response_rate,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,jurisdiction_names,calculated_host_listings_count,reviews_per_month
count,3585.0,3585.0,3585.0,3114.0,3585.0,3585.0,0.0,3585.0,3585.0,3585.0,...,2762.0,2767.0,2765.0,2767.0,2763.0,2764.0,0.0,0.0,3585.0,2829.0
mean,8440875.0,20200000000000.0,24923110.0,0.949891,58.902371,58.902371,,42.340032,-71.084818,3.041283,...,9.431571,9.258041,9.646293,9.646549,9.414043,9.168234,,,12.733891,1.970908
std,4500787.0,0.0,22927810.0,0.125177,171.119663,171.119663,,0.024403,0.031565,1.778929,...,0.931863,1.168977,0.762753,0.735507,0.903436,1.011116,,,29.415076,2.120561
min,3353.0,20200000000000.0,4240.0,0.0,0.0,0.0,,42.235942,-71.171789,1.0,...,2.0,2.0,2.0,4.0,2.0,2.0,,,1.0,0.01
25%,4679319.0,20200000000000.0,6103425.0,0.97,1.0,1.0,,42.329995,-71.105083,2.0,...,9.0,9.0,9.0,9.0,9.0,9.0,,,1.0,0.48
50%,8577620.0,20200000000000.0,19281000.0,1.0,2.0,2.0,,42.345201,-71.078429,2.0,...,10.0,10.0,10.0,10.0,10.0,9.0,,,2.0,1.17
75%,12789530.0,20200000000000.0,36221470.0,1.0,7.0,7.0,,42.354685,-71.062155,4.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,,6.0,2.72
max,14933460.0,20200000000000.0,93854110.0,1.0,749.0,749.0,,42.389982,-71.0001,16.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,,136.0,19.15


In [4]:
listings_df_clean = listings_df[['id', 'host_id', 'host_since', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
                                'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified',
                                'neighbourhood_cleansed', 'zipcode', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
                                'bedrooms', 'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
                                'maximum_nights', 'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 
                                 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                                 'review_scores_value', 'requires_license', 'instant_bookable', 'cancellation_policy', 'reviews_per_month']]
listings_df_clean['price'].describe()

count    3585.000000
mean      173.925802
std       148.331321
min        10.000000
25%        85.000000
50%       150.000000
75%       220.000000
max      4000.000000
Name: price, dtype: float64

In [5]:
ratings_clean = listings_df_clean.dropna(axis = 0, subset= ['review_scores_rating','host_response_rate'])
ratings_clean.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,cancellation_policy,reviews_per_month
1,3075044,2572247,6/7/2012,within an hour,1.0,100%,f,Roslindale,1,1,...,10.0,9.0,10.0,10.0,9.0,9.0,f,t,moderate,1.3
2,6976,16701,5/11/2009,within a few hours,1.0,88%,t,Roslindale,1,1,...,10.0,9.0,10.0,10.0,9.0,10.0,f,f,moderate,0.47
3,1436513,6031442,4/21/2013,within a few hours,1.0,50%,f,,1,1,...,10.0,10.0,10.0,10.0,10.0,10.0,f,f,moderate,1.0
4,7651065,15396970,5/11/2014,within an hour,1.0,100%,t,Roslindale,1,1,...,10.0,10.0,10.0,10.0,9.0,10.0,f,f,flexible,2.25
5,12386020,64200298,3/23/2016,within a few hours,1.0,95%,t,Roslindale,2,2,...,10.0,10.0,10.0,10.0,9.0,10.0,f,f,flexible,1.7


In [29]:
# Property types pie chart
prop = ratings_clean['property_type'].unique()
prop_type = ratings_clean.dropna(axis = 0, subset= ['property_type'])

# Collapsing smallest four property types into other
prop_type = prop_type.replace({'Villa': 'Other', 'Dorm': 'Other', 'Entire Floor': 'Other', 'Guesthouse': 'Other' })
prop_count = prop_type['property_type'].value_counts()
count = list(prop_count)
count
prop_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2550 entries, 1 to 3583
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           2550 non-null   int64  
 1   host_id                      2550 non-null   int64  
 2   host_since                   2550 non-null   object 
 3   host_response_time           2550 non-null   object 
 4   host_response_rate           2550 non-null   float64
 5   host_acceptance_rate         2550 non-null   object 
 6   host_is_superhost            2550 non-null   object 
 7   host_neighbourhood           2323 non-null   object 
 8   host_listings_count          2550 non-null   int64  
 9   host_total_listings_count    2550 non-null   int64  
 10  host_has_profile_pic         2550 non-null   object 
 11  host_identity_verified       2550 non-null   object 
 12  neighbourhood_cleansed       2550 non-null   object 
 13  zipcode           

In [7]:
neighboorhood_ratings = prop_type[['neighbourhood_cleansed', 'review_scores_rating']]

In [8]:
jamaica_plain = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Jamaica Plain']
south_end = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'South End']
back_bay = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Back Bay']
dorchester= neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Dorchester']
fenway = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Fenway']
beacon_hill = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Beacon Hill']
allston = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Allston']
south_boston = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'South Boston']
east_boston = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'East Boston']
brighton = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Brighton']
downtown = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Downtown']
roxbury = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Roxbury']
north_end = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'North End']
mission_hill = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Mission Hill']
charlestown = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'Charlestown']
south_boston_waterfront = neighboorhood_ratings['review_scores_rating'].loc[neighboorhood_ratings['neighbourhood_cleansed'] == 'South Boston Waterfront']
stats.f_oneway(jamaica_plain, south_end, back_bay, dorchester, fenway, beacon_hill, allston, south_boston, east_boston, brighton, downtown, roxbury, north_end, mission_hill, charlestown, south_boston_waterfront)

F_onewayResult(statistic=7.43940323692705, pvalue=2.1997476184772181e-16)

In [9]:
final_data = prop_type[['id', 'price', 'host_response_rate', 'neighbourhood_cleansed', 'review_scores_rating', 'cancellation_policy',
                        'property_type', 'bedrooms']]
final_data.head()

Unnamed: 0,id,price,host_response_rate,neighbourhood_cleansed,review_scores_rating,cancellation_policy,property_type,bedrooms
1,3075044,65,1.0,Roslindale,94.0,moderate,Apartment,1.0
2,6976,65,1.0,Roslindale,98.0,moderate,Apartment,1.0
3,1436513,75,1.0,Roslindale,100.0,moderate,House,1.0
4,7651065,79,1.0,Roslindale,99.0,flexible,House,1.0
5,12386020,75,1.0,Roslindale,100.0,flexible,Condominium,1.0


In [54]:
avg_neighborhood = final_data.groupby('neighbourhood_cleansed', as_index = False).agg({'review_scores_rating':'mean'})
#avg_price = avg_neighborhood.sort_values('price', ascending = False)
avg_rating = avg_neighborhood.sort_values('review_scores_rating', ascending = False)
avg_rating

#avg_neighborhood.sort_values('review_scores_rating', ascending = False)

Unnamed: 0,neighbourhood_cleansed,review_scores_rating
13,Leather District,97.5
24,West Roxbury,95.354839
18,Roslindale,95.285714
21,South Boston Waterfront,94.303571
12,Jamaica Plain,94.158672
20,South Boston,94.0
14,Longwood Medical Area,94.0
5,Charlestown,93.828571
3,Beacon Hill,93.602484
17,North End,93.371429


In [56]:
final_data_1 = final_data.loc[final_data['neighbourhood_cleansed'] != 'Leather District']
final_data_2 = final_data_1.loc[final_data_1['neighbourhood_cleansed'] != 'Longwood Medical Area']
final_data_3 = final_data_2.loc[final_data_2['neighbourhood_cleansed'] != 'Bay Village']
final_data_4 = final_data_3.loc[final_data_3['neighbourhood_cleansed'] != 'Mattapan']
final_data_5 = final_data_4.loc[final_data_4['neighbourhood_cleansed'] != 'Hyde Park']
final_data_6 = final_data_5.loc[final_data_5['neighbourhood_cleansed'] != 'West End']
final_data_7 = final_data_6.loc[final_data_6['neighbourhood_cleansed'] != 'West Roxbury']
final_data_8 = final_data_7.loc[final_data_7['neighbourhood_cleansed'] != 'Chinatown']
final_data_9 = final_data_8.loc[final_data_8['neighbourhood_cleansed'] != 'Roslindale']

final_data_9

Unnamed: 0,id,price,host_response_rate,neighbourhood_cleansed,review_scores_rating,cancellation_policy,property_type,bedrooms
57,594693,70,1.00,Jamaica Plain,93.0,strict,Apartment,1.0
58,2384581,95,0.93,Jamaica Plain,93.0,flexible,House,1.0
60,4262255,80,1.00,Jamaica Plain,99.0,moderate,Apartment,1.0
61,4000384,120,0.70,Jamaica Plain,97.0,flexible,House,1.0
62,6693305,65,1.00,Jamaica Plain,100.0,flexible,Apartment,1.0
...,...,...,...,...,...,...,...,...
3573,14504583,65,1.00,Charlestown,80.0,flexible,Apartment,1.0
3575,5280827,69,0.96,Charlestown,70.0,strict,Apartment,1.0
3578,14536322,85,1.00,Allston,100.0,strict,House,1.0
3580,8373729,69,0.96,Charlestown,90.0,strict,Apartment,1.0


In [55]:
avg_neighborhood = final_data_9.groupby('neighbourhood_cleansed', as_index = False).agg({'review_scores_rating':'mean'})
#avg_price = avg_neighborhood.sort_values('price', ascending = False)
avg_rating = avg_neighborhood.sort_values('review_scores_rating', ascending = False)
avg_rating


Unnamed: 0,neighbourhood_cleansed,review_scores_rating
14,South Boston Waterfront,94.303571
9,Jamaica Plain,94.158672
13,South Boston,94.0
4,Charlestown,93.828571
2,Beacon Hill,93.602484
11,North End,93.371429
15,South End,93.144033
3,Brighton,92.866667
6,Downtown,92.154545
1,Back Bay,91.730594


In [None]:
final_data_9['avg_rating'] = ""
for index, row in final_data_9.iterrows():
    if 

In [61]:
#ratings_box = final_data_9.boxplot("review_scores_rating", by= "neighbourhood_cleansed", figsize= (36,42), fontsize= 28)
final_try = final_data_9.groupby('neighbourhood_cleansed')
final_data_9['review_scores_rating'].median()
final_try.sort_values(ascending = False, inplace = True)
final_data_9 = final_data_9[final_try.index]
final_data_9.boxplot("review_scores_rating", by= "neighbourhood_cleansed", figsize= (36,42), fontsize= 28)
plt.xticks(rotation=30, horizontalalignment='right')
plt.xlabel("")
plt.ylabel("")
plt.title("Rating by Neighborhood", fontsize= 32)

AttributeError: 'float' object has no attribute 'sort'

In [63]:
neighboorhood_price = prop_type[['neighbourhood_cleansed', 'price']]

In [64]:
jamaica_plain = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Jamaica Plain']
south_end = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'South End']
back_bay = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Back Bay']
dorchester= neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Dorchester']
fenway = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Fenway']
beacon_hill = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Beacon Hill']
allston = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Allston']
south_boston = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'South Boston']
east_boston = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'East Boston']
brighton = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Brighton']
downtown = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Downtown']
roxbury = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Roxbury']
north_end = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'North End']
mission_hill = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Mission Hill']
charlestown = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'Charlestown']
south_boston_waterfront = neighboorhood_price['price'].loc[neighboorhood_price['neighbourhood_cleansed'] == 'South Boston Waterfront']
stats.f_oneway(jamaica_plain, south_end, back_bay, dorchester, fenway, beacon_hill, allston, south_boston, east_boston, brighton, downtown, roxbury, north_end, mission_hill, charlestown, south_boston_waterfront)

F_onewayResult(statistic=40.41000142103462, pvalue=1.8521855938323337e-105)

In [None]:
prop = prop_price.boxplot("price", by= "neighbourhood_cleansed", figsize= (36,42), fontsize= 28)
#prop_price(lwd=3)
#plt.gca('linew',2)
prop.spines['left'].set_linewidth(5)
#prop.spines['right'].set_linewidth(5)
#lines.linewidth: 5
plt.xticks(rotation=30, horizontalalignment='right')
plt.xlabel("")
plt.ylabel("")
plt.title("Price By Neighborhood", fontsize= 32)