In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import gmaps
import scipy.stats as stats
from scipy.stats import linregress



In [3]:
# Read the raw listings export and summarise its numeric columns.
listings_path = './listings.csv'
listings_df = pd.read_csv(listings_path)
listings_df.describe()

Unnamed: 0,id,scrape_id,host_id,host_response_rate,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,jurisdiction_names,calculated_host_listings_count,reviews_per_month
count,3585.0,3585.0,3585.0,3114.0,3585.0,3585.0,0.0,3585.0,3585.0,3585.0,...,2762.0,2767.0,2765.0,2767.0,2763.0,2764.0,0.0,0.0,3585.0,2829.0
mean,8440875.0,20200000000000.0,24923110.0,0.949891,58.902371,58.902371,,42.340032,-71.084818,3.041283,...,9.431571,9.258041,9.646293,9.646549,9.414043,9.168234,,,12.733891,1.970908
std,4500787.0,0.0,22927810.0,0.125177,171.119663,171.119663,,0.024403,0.031565,1.778929,...,0.931863,1.168977,0.762753,0.735507,0.903436,1.011116,,,29.415076,2.120561
min,3353.0,20200000000000.0,4240.0,0.0,0.0,0.0,,42.235942,-71.171789,1.0,...,2.0,2.0,2.0,4.0,2.0,2.0,,,1.0,0.01
25%,4679319.0,20200000000000.0,6103425.0,0.97,1.0,1.0,,42.329995,-71.105083,2.0,...,9.0,9.0,9.0,9.0,9.0,9.0,,,1.0,0.48
50%,8577620.0,20200000000000.0,19281000.0,1.0,2.0,2.0,,42.345201,-71.078429,2.0,...,10.0,10.0,10.0,10.0,10.0,9.0,,,2.0,1.17
75%,12789530.0,20200000000000.0,36221470.0,1.0,7.0,7.0,,42.354685,-71.062155,4.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,,6.0,2.72
max,14933460.0,20200000000000.0,93854110.0,1.0,749.0,749.0,,42.389982,-71.0001,16.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,,136.0,19.15


In [4]:
# Columns carried forward into the analysis, in their original order.
analysis_columns = [
    # host attributes
    'id', 'host_id', 'host_since', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhood',
    'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic',
    'host_identity_verified',
    # location and property details
    'neighbourhood_cleansed', 'zipcode', 'latitude', 'longitude', 'property_type',
    'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    # pricing and availability
    'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'availability_365',
    # reviews
    'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    # booking policy
    'requires_license', 'instant_bookable', 'cancellation_policy', 'reviews_per_month',
]
listings_df_clean = listings_df[analysis_columns]
listings_df_clean['price'].describe()

count    3585.000000
mean      173.925802
std       148.331321
min        10.000000
25%        85.000000
50%       150.000000
75%       220.000000
max      4000.000000
Name: price, dtype: float64

In [5]:
# Keep only the listings that have an overall review score.
ratings_clean = listings_df_clean.dropna(subset=['review_scores_rating'])
ratings_clean.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,cancellation_policy,reviews_per_month
1,3075044,2572247,6/7/2012,within an hour,1.0,100%,f,Roslindale,1,1,...,10.0,9.0,10.0,10.0,9.0,9.0,f,t,moderate,1.3
2,6976,16701,5/11/2009,within a few hours,1.0,88%,t,Roslindale,1,1,...,10.0,9.0,10.0,10.0,9.0,10.0,f,f,moderate,0.47
3,1436513,6031442,4/21/2013,within a few hours,1.0,50%,f,,1,1,...,10.0,10.0,10.0,10.0,10.0,10.0,f,f,moderate,1.0
4,7651065,15396970,5/11/2014,within an hour,1.0,100%,t,Roslindale,1,1,...,10.0,10.0,10.0,10.0,9.0,10.0,f,f,flexible,2.25
5,12386020,64200298,3/23/2016,within a few hours,1.0,95%,t,Roslindale,2,2,...,10.0,10.0,10.0,10.0,9.0,10.0,f,f,flexible,1.7


In [6]:
# Property types: drop listings without a type, then fold the rare types into 'Other'.
# (No chart is drawn in this cell; it only prepares prop_type and the per-type counts.)
prop = ratings_clean['property_type'].unique()
prop_type = ratings_clean.dropna(axis=0, subset=['property_type'])

# Collapsing smallest four property types into other.
# The nested-dict form restricts the replacement to the property_type column,
# so an identical string in any other column is left untouched.
rare_property_types = ['Villa', 'Dorm', 'Entire Floor', 'Guesthouse']
prop_type = prop_type.replace(
    {'property_type': {name: 'Other' for name in rare_property_types}}
)
prop_count = prop_type['property_type'].value_counts()
count = list(prop_count)
prop_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2770 entries, 1 to 3583
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           2770 non-null   int64  
 1   host_id                      2770 non-null   int64  
 2   host_since                   2770 non-null   object 
 3   host_response_time           2550 non-null   object 
 4   host_response_rate           2550 non-null   float64
 5   host_acceptance_rate         2550 non-null   object 
 6   host_is_superhost            2770 non-null   object 
 7   host_neighbourhood           2525 non-null   object 
 8   host_listings_count          2770 non-null   int64  
 9   host_total_listings_count    2770 non-null   int64  
 10  host_has_profile_pic         2770 non-null   object 
 11  host_identity_verified       2770 non-null   object 
 12  neighbourhood_cleansed       2770 non-null   object 
 13  zipcode           

In [7]:
# Narrow the frame to the two columns used for the rating-by-neighbourhood comparison.
rating_columns = ['neighbourhood_cleansed', 'review_scores_rating']
neighboorhood_ratings = prop_type[rating_columns]

In [8]:
# One-way ANOVA of review score across the neighbourhoods listed below.
# The groups are built from a single list rather than one hand-copied line per
# neighbourhood, so adding or removing a neighbourhood is a one-word change.
anova_neighbourhoods = [
    'Jamaica Plain', 'South End', 'Back Bay', 'Dorchester', 'Fenway',
    'Beacon Hill', 'Allston', 'South Boston', 'East Boston', 'Brighton',
    'Downtown', 'Roxbury', 'North End', 'Mission Hill', 'Charlestown',
    'South Boston Waterfront',
]
# Maps neighbourhood name -> Series of review scores for that neighbourhood.
ratings_by_neighbourhood = {
    name: neighboorhood_ratings.loc[
        neighboorhood_ratings['neighbourhood_cleansed'] == name,
        'review_scores_rating',
    ]
    for name in anova_neighbourhoods
}
stats.f_oneway(*ratings_by_neighbourhood.values())

F_onewayResult(statistic=7.198170678083899, pvalue=9.37496282188307e-16)

In [9]:
# Trim to the columns used by the remaining cells.
final_columns = [
    'id',
    'price',
    'host_response_rate',
    'neighbourhood_cleansed',
    'review_scores_rating',
    'cancellation_policy',
    'property_type',
    'bedrooms',
]
final_data = prop_type[final_columns]
final_data.head()

Unnamed: 0,id,price,host_response_rate,neighbourhood_cleansed,review_scores_rating,cancellation_policy,property_type,bedrooms
1,3075044,65,1.0,Roslindale,94.0,moderate,Apartment,1.0
2,6976,65,1.0,Roslindale,98.0,moderate,Apartment,1.0
3,1436513,75,1.0,Roslindale,100.0,moderate,House,1.0
4,7651065,79,1.0,Roslindale,99.0,flexible,House,1.0
5,12386020,75,1.0,Roslindale,100.0,flexible,Condominium,1.0


In [11]:
# Write the trimmed analysis frame to final_data.json (path is relative to the working directory).
final_data.to_json(r'final_data.json')

In [None]:
# Mean review score per neighbourhood, highest first.
avg_neighborhood = (
    final_data
    .groupby('neighbourhood_cleansed', as_index=False)
    .agg({'review_scores_rating': 'mean'})
)
avg_rating = avg_neighborhood.sort_values(by='review_scores_rating', ascending=False)
avg_rating

In [None]:
# Neighbourhoods left out of the per-neighbourhood comparisons.
# NOTE(review): presumably excluded for having too few rated listings - confirm.
excluded_neighbourhoods = [
    'Leather District',
    'Longwood Medical Area',
    'Bay Village',
    'Mattapan',
    'Hyde Park',
    'West End',
    'West Roxbury',
    'Chinatown',
    'Roslindale',
]
# One isin() mask replaces nine chained "!=" filters and their throwaway frames.
# Rows with a missing neighbourhood are kept, as they were before.
final_data_9 = final_data.loc[
    ~final_data['neighbourhood_cleansed'].isin(excluded_neighbourhoods)
]

final_data_9

In [None]:
# Review score per remaining neighbourhood, highest first.
# Despite the avg_ prefix, the aggregate used here is the median.
avg_neighborhood = (
    final_data_9
    .groupby('neighbourhood_cleansed', as_index=False)
    .agg({'review_scores_rating': 'median'})
)
avg_rating = avg_neighborhood.sort_values(by='review_scores_rating', ascending=False)
avg_rating


In [None]:
def boxplot_sorted(final_data_9, by, column, rot=0):
    """Draw one box per group of ``by``, ordered by ascending group median.

    Parameters
    ----------
    final_data_9 : DataFrame
        Frame to plot. The parameter shadows the notebook-level variable of
        the same name; inside the function it is whatever the caller passes.
    by : label, or one-element list/tuple holding a label
        Column to group on.
    column : label
        Column whose values are drawn in each box.
    rot : int, default 0
        Tick-label rotation forwarded to ``DataFrame.boxplot``.

    Returns
    -------
    The value returned by ``DataFrame.boxplot(..., return_type="axes")``.
    """
    # With a one-element list as ``by``, pandas >= 2.0 yields 1-tuples as group
    # keys, which would end up as the column labels of df2 and so as the box
    # labels. Unwrap to a scalar key so the labels are the plain group names.
    if isinstance(by, (list, tuple)) and len(by) == 1:
        by = by[0]
    df2 = pd.DataFrame({col: vals[column] for col, vals in final_data_9.groupby(by)})
    meds = df2.median().sort_values()
    return df2[meds.index].boxplot(rot=rot, return_type="axes")


axes = boxplot_sorted(final_data_9, by="neighbourhood_cleansed", column="review_scores_rating")
plt.xticks(rotation=30, horizontalalignment='right')
plt.xlabel("Neighborhood")
plt.ylabel("Ratings")
plt.title("Rating By Neighborhood", fontsize= 32)
# savefig does not create missing directories.
os.makedirs("./Images", exist_ok=True)
plt.savefig("./Images/RatingByNeighborhoodBW.png")

In [None]:
# Neighbourhood and price only, renumbered from zero.
neighboorhood_price = (
    final_data_9[['neighbourhood_cleansed', 'price']]
    .reset_index(drop=True)
)

In [None]:
# One-way ANOVA of price across the neighbourhoods listed below.
# The groups live in their own dict instead of reusing the per-neighbourhood
# variable names that the rating ANOVA cell fills with review scores.
price_anova_neighbourhoods = [
    'Jamaica Plain', 'South End', 'Back Bay', 'Dorchester', 'Fenway',
    'Beacon Hill', 'Allston', 'South Boston', 'East Boston', 'Brighton',
    'Downtown', 'Roxbury', 'North End', 'Mission Hill', 'Charlestown',
    'South Boston Waterfront',
]
# Maps neighbourhood name -> Series of prices for that neighbourhood.
prices_by_neighbourhood = {
    name: neighboorhood_price.loc[
        neighboorhood_price['neighbourhood_cleansed'] == name,
        'price',
    ]
    for name in price_anova_neighbourhoods
}
stats.f_oneway(*prices_by_neighbourhood.values())

In [None]:
# Price distribution per neighbourhood, boxes ordered by median price.
# boxplot_sorted is defined once, in the rating-by-neighbourhood plotting cell
# earlier in the notebook; it is not redefined here.
# A scalar ``by`` keeps the group keys (and so the box labels) plain strings on
# every pandas version.
axes = boxplot_sorted(neighboorhood_price, by="neighbourhood_cleansed", column="price")
plt.xticks(rotation=30, horizontalalignment='right')
plt.xlabel("Neighborhood")
plt.ylabel("Price")
plt.title("Price By Neighborhood", fontsize= 32)
# savefig does not create missing directories.
os.makedirs("./Images", exist_ok=True)
plt.savefig("./Images/PriceByNeighborhoodBW.png")

In [None]:
# NOTE(review): api_key is not assigned in any cell visible in this notebook -
# confirm it is defined (e.g. loaded from a config module or the environment)
# before this cell runs, otherwise a fresh kernel raises NameError here.
gmaps.configure(api_key=api_key)

In [None]:
# Heat-map inputs: listing coordinates cast to float, and price.
coordinate_columns = ['latitude', 'longitude']
locations = prop_type[coordinate_columns].astype(float)
price = prop_type['price']

In [None]:
# Heat map of listing locations, each point weighted by its price.
fig = gmaps.figure()
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=price,
    dissipating=True,
    max_intensity=800,
    point_radius=5,
)
fig.add_layer(heat_layer)
fig

In [None]:
# Display the neighbourhood-filtered frame.
final_data_9

In [None]:
# South End listings only, restricted to rows with a known bedroom count.
is_south_end = final_data_9['neighbourhood_cleansed'] == 'South End'
south_end = final_data_9.loc[is_south_end]
south_end_clean = south_end.dropna(subset=['bedrooms'])
south_end_clean.info()

In [None]:
# Listings per bedroom count (ordered by bedroom count) and the sorted distinct counts.
south_end_bedrooms = south_end_clean['bedrooms']
room_count = south_end_bedrooms.value_counts().sort_index()
room_index = south_end_bedrooms.unique()
room_index_sort = np.sort(room_index)
print(room_count)
print(room_index_sort)

In [None]:
# Pie chart of South End listings by number of bedrooms.
# The preset offsets pull the 4th-6th slices out of the pie. matplotlib requires
# exactly one offset per slice, so the preset is padded with zeros or truncated
# to the number of slices actually present instead of assuming there are six.
explode_preset = (0, 0, 0, 0.3, 0.1, 0.3)
explode = tuple(
    explode_preset[i] if i < len(explode_preset) else 0
    for i in range(len(room_count))
)
labels = room_index_sort
# Series.plot.pie takes no y argument (that is for DataFrames), so none is passed.
room_pie = room_count.plot.pie(
    figsize=(12, 12),
    explode=explode,
    startangle=140,
    autopct="%1.1f%%",
    labeldistance=None,
    title="Number of Bedrooms in South End",
)
plt.xlabel("")
plt.ylabel("")
plt.legend(labels, loc = "upper right")

# savefig does not create missing directories.
os.makedirs("./Images", exist_ok=True)
plt.savefig("./Images/BedroomsPie.png")
plt.show()

In [None]:
# One-way ANOVA of South End price across 0, 1, 2 and 3 bedrooms.
south_end_price = south_end_clean[['bedrooms', 'price']]
zero_bed = south_end_price.loc[south_end_price['bedrooms'] == 0, 'price']
one_bed = south_end_price.loc[south_end_price['bedrooms'] == 1, 'price']
two_bed = south_end_price.loc[south_end_price['bedrooms'] == 2, 'price']
three_bed = south_end_price.loc[south_end_price['bedrooms'] == 3, 'price']
stats.f_oneway(zero_bed, one_bed, two_bed, three_bed)

In [None]:
# Per-bedroom-count means for South End listings with at most three bedrooms.
south_end_3 = south_end_clean.loc[south_end_clean['bedrooms'] <= 3]
# The frame also holds string columns (neighbourhood, cancellation policy,
# property type). numeric_only=True averages the numeric columns only; without
# it pandas >= 2.0 raises TypeError on the string columns.
south_end_group_3 = south_end_3.groupby('bedrooms').mean(numeric_only=True)
south_end_group_3

In [None]:
# Bar chart: mean South End price for listings with 0 to 3 bedrooms.
xbp = south_end_group_3.index
ybp = south_end_group_3['price']
plt.bar(
    xbp,
    ybp,
    align='center',
    alpha=0.7,
    color=['royalblue', 'darkorange', 'green', 'red'],
)
plt.xticks([0, 1, 2, 3])
plt.xlabel("Number of Bedrooms")
plt.ylabel("Price")
plt.title("Price by Number of Bedrooms in South End")
plt.savefig("./Images/PriceByBedrooms.png")

In [None]:
# One-way ANOVA of South End review score across 0, 1, 2 and 3 bedrooms.
south_end_ratings = south_end_clean[['bedrooms', 'review_scores_rating']]
zero_bedr = south_end_ratings.loc[south_end_ratings['bedrooms'] == 0, 'review_scores_rating']
one_bedr = south_end_ratings.loc[south_end_ratings['bedrooms'] == 1, 'review_scores_rating']
two_bedr = south_end_ratings.loc[south_end_ratings['bedrooms'] == 2, 'review_scores_rating']
three_bedr = south_end_ratings.loc[south_end_ratings['bedrooms'] == 3, 'review_scores_rating']
stats.f_oneway(zero_bedr, one_bedr, two_bedr, three_bedr)

In [None]:
# Bar chart: mean South End review score for listings with 0 to 3 bedrooms.
xbr = south_end_group_3.index
ybr = south_end_group_3['review_scores_rating']
plt.bar(
    xbr,
    ybr,
    align='center',
    alpha=0.7,
    color=['royalblue', 'darkorange', 'green', 'red'],
)
# The y axis starts at 80, so bar heights show differences rather than absolute size.
plt.ylim(80, 100)
plt.xticks([0, 1, 2, 3])
plt.xlabel("Number of Bedrooms")
plt.ylabel("Rating")
plt.title("Rating by Number of Bedrooms in South End")
plt.savefig("./Images/RatingByBedrooms.png")

In [None]:
# Pie chart of the four most common property types in the South End.
# value_counts() sorts by descending frequency, so the first four rows are the top four.
y_axis = south_end_clean["property_type"].value_counts()
y_axis = y_axis.iloc[0:4]
# Series.plot.pie takes no y argument (that is for DataFrames), so none is passed.
# Title typo fixed: "Propery" -> "Property".
y_axis.plot.pie(
    figsize=(12, 12),
    startangle=140,
    autopct="%1.1f%%",
    labeldistance=None,
    title="South End Property Type",
)
plt.xlabel("")
plt.ylabel("")
Ptype = y_axis.index
plt.legend(labels = Ptype, loc = "upper right")

# savefig does not create missing directories.
os.makedirs("./Images", exist_ok=True)
plt.savefig("./Images/PropertyTypePie.png")
plt.show()

In [None]:
# One-way ANOVA of South End review score across four property types.
south_end_pratings = south_end_clean[['property_type', 'review_scores_rating']]
aptr = south_end_pratings.loc[south_end_pratings['property_type'] == 'Apartment', 'review_scores_rating']
condor = south_end_pratings.loc[south_end_pratings['property_type'] == 'Condominium', 'review_scores_rating']
houser = south_end_pratings.loc[south_end_pratings['property_type'] == 'House', 'review_scores_rating']
townr = south_end_pratings.loc[south_end_pratings['property_type'] == 'Townhouse', 'review_scores_rating']
stats.f_oneway(aptr, condor, houser, townr)

In [None]:
# Per-property-type means for the South End.
# numeric_only=True averages the numeric columns only; without it pandas >= 2.0
# raises TypeError on the string columns of south_end_clean.
sptype = south_end_clean.groupby("property_type").mean(numeric_only=True)
# Select the rows by label so the choice does not depend on which property
# types happen to be present, or on their sort order. These are the four types
# compared in the South End property-type ANOVA cells.
# NOTE(review): this replaces positional rows [0, 2, 3, 6]; confirm those
# positions were these four types in the original run.
compared_property_types = ['Apartment', 'Condominium', 'House', 'Townhouse']
sptype1 = sptype.loc[compared_property_types, :]
sptype1

In [None]:
# Bar chart: mean South End review score for the selected property types.
x_axis1 = sptype1.index
y_axis1 = sptype1['review_scores_rating']
plt.bar(
    x_axis1,
    y_axis1,
    align='center',
    alpha=0.7,
    color=['royalblue', 'darkorange', 'green', 'red'],
)
# The y axis starts at 80, so bar heights show differences rather than absolute size.
plt.ylim(80, 100)
plt.xlabel("Property Type")
plt.ylabel("Rating")
plt.title("Rating by Property Type in South End")
plt.savefig("./Images/RatingsbyPropertyType.png")

In [None]:
# One-way ANOVA of South End price across four property types.
south_end_pprice = south_end_clean[['property_type', 'price']]
apt = south_end_pprice.loc[south_end_pprice['property_type'] == 'Apartment', 'price']
condo = south_end_pprice.loc[south_end_pprice['property_type'] == 'Condominium', 'price']
house = south_end_pprice.loc[south_end_pprice['property_type'] == 'House', 'price']
town = south_end_pprice.loc[south_end_pprice['property_type'] == 'Townhouse', 'price']
stats.f_oneway(apt, condo, house, town)

In [None]:
# Bar chart: mean South End price for the selected property types.
x_axis2 = sptype1.index
y_axis2 = sptype1['price']
plt.bar(
    x_axis2,
    y_axis2,
    align='center',
    alpha=0.7,
    color=['royalblue', 'darkorange', 'green', 'red'],
)
plt.xlabel("Property Type")
plt.ylabel("Price")
plt.title("Price by Property Type in South End")
plt.savefig("./Images/PricebyPropertyType.png")