In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# load necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# folium for visualizing geospatial data
import folium
from folium import plugins
# the following sklearn packages are used to convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Read the three Boston tables (listings, reviews, calendar) in one pass
listings, reviews, calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/boston/listings.csv',
        '/kaggle/input/boston/reviews.csv',
        '/kaggle/input/boston/calendar.csv',
    )
)

* **Listings, including full descriptions and average review score.**
* **Reviews, including unique id for each reviewer and detailed comments.**
* **Calendar, including listing id and the price and availability for that day.**

# **Data exploration**

In [6]:
# (number of rows, number of columns) of the listings table
listings.shape

There are 3585 listings in the data set, with 95 features.

**Get the percentage of missing values in the listings dataframe**

In [7]:
%matplotlib inline
listings.isnull().mean().plot.bar(figsize=(40,3));
# from the above cell, we see that some of the column values are missing
# here we list the columns with most missing values
most_missing_cols = set(listings.columns[listings.isnull().mean() > 0.75])
most_missing_cols

**Let's also see what the other dataframes look like**

In [8]:
## calendar holds availability and price information for each of the listings for the entire year
# A bare `calendar.shape` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is displayed), so print it explicitly.
print(calendar.shape)
calendar.head()

In [9]:
## reviews holds the reviews (comments and reviewer names) and the review dates for the different listings
reviews.head()

# **How are the listing prices distributed? Which price ranges are the most common?**

To see the distribution of listing prices, we look at the price-related columns in the listings df. In particular, we look at the weekly price column and prepare a bar plot of its value counts.

In [10]:
# One row per distinct weekly price, with the number of listings that charge it
weekly_price = (
    listings['weekly_price']
    .value_counts()
    .rename_axis('price')
    .reset_index(name='counts')
)
weekly_price.plot.bar(x='price', y='counts', figsize=(90,4))

We can see from the above plot that some prices are more frequent than others and that the range of prices is very large.
To better visualize the distribution of prices, we can draw a histogram of the weekly price distribution.

In [11]:
# Strip the "$" sign and the thousands separators so the price strings can be
# converted to floats. The pattern is a raw string: "\$" inside a plain string
# literal is an invalid escape sequence and triggers a warning on recent Pythons.
weekly_price['price'] = weekly_price['price'].replace(r'[\$,]', '', regex=True).astype(float)
# Order the table from the cheapest to the most expensive weekly price
weekly_price = weekly_price.sort_values(by='price')
print('minimum weekly price ($): ', weekly_price['price'].min())
print('maximum weekly price ($): ', weekly_price['price'].max())
weekly_price.head()

#  Plot histogram of weekly prices

In [12]:
# weekly_price holds one row per *distinct* price together with the number of
# listings that charge it, so each price is weighted by its `counts` value.
# Without the weights the histogram would count distinct price values, not listings.
# (`by=` was dropped: it names a DataFrame column to group by and has no meaning
# for a Series; the unused hand-built `bins` list was dropped as well.)
ax = weekly_price['price'].plot.hist(
    bins=25,
    weights=weekly_price['counts'].to_numpy(),
    alpha=0.9,
    figsize=(8,4),
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('Weekly price in $', fontsize = 14)
plt.ylabel('Frequency', fontsize = 14)
plt.title("Histogram of weekly price in ($)", fontsize = 16);

# Expensive month?

In [13]:
# Size and first rows of the calendar table, the basis for the monthly price analysis
print(calendar.shape)
calendar.head()

In [14]:
# get the fraction of missing values in each column of the calendar dataframe
%matplotlib inline
calendar.isnull().mean().plot.bar(figsize=(6,4), fontsize = 12);

 We can see that about 50% of the calendar rows (one row per listing and date) have no price information.
 To continue our analysis of expensive months, we first drop the rows with missing price information.

In [15]:
# Keep only the rows that have a price. The explicit .copy() makes the result an
# independent frame, so the column assignments done on `calendar` afterwards do not
# operate on a slice of the original (which raises pandas' SettingWithCopyWarning).
calendar = calendar[calendar['price'].notna()].copy()
calendar.head()

In [16]:
# Further preparation of the calendar data:
#  - extract the month number from the date column
#  - convert the price strings to floats (strip "$" and thousands separators)
#  - recode availability from 'f' / 't' to 0 / 1 for ease of analysis
calendar['month'] = pd.DatetimeIndex(calendar['date']).month
# Raw string: "\$" in a plain literal is an invalid escape sequence
calendar['price'] = calendar['price'].replace(r'[\$,]', '', regex=True).astype(float)
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place call acts on an intermediate object and is not
# guaranteed to update `calendar` (it does nothing under pandas Copy-on-Write).
calendar['available'] = calendar['available'].replace({'f': 0, 't': 1})
calendar.head()

In [17]:
# Average price for each month: group the prices by month and take the mean,
# then show the twelve averages as a bar chart
price_by_month = calendar.groupby('month')['price'].mean().reset_index()
price_by_month.plot(kind='bar', x='month', y='price', figsize=(12,4));
plt.xlabel('Month', fontsize = 16)
plt.ylabel('Avg. price', fontsize = 16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("Average price per month in ($)", fontsize = 16);

In [18]:
# Lowest and highest daily price in the cleaned calendar data, one per line
print(calendar['price'].min(), calendar['price'].max(), sep='\n')

In [19]:
## Plot histogram of prices
# Every calendar row is one listing on one date, so each row counts once.
# density=True normalises the bars so that the histogram integrates to 1.
# (Removed the unused `bins` list, which `range(0, 10, 1700)` reduced to [0], and the
# `by=` argument, which names a DataFrame column to group by and has no meaning for a Series.)
ax = calendar['price'].plot.hist(bins=25, density=True, alpha=0.9, figsize=(8,4))
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('Price in $', fontsize = 14)
plt.ylabel('Freq. (probability)', fontsize = 14)
plt.title("Histogram of prices in ($)", fontsize = 16);

Conclusions: We can see that month 9 (September) is the most expensive, followed closely by October. The mild autumn weather makes touring around on foot a joy. The above histogram of the daily prices also confirms the empirical distribution shown for weekly prices earlier.

# **Which months are the busiest to visit Boston? Does a busy month also mean an expensive one?**

In [20]:
## Here we also make use of the reviews dataset: print its size and show the first rows
print(reviews.shape)
reviews.head()

In [21]:
# get the fraction of missing values in each column of the reviews dataframe
%matplotlib inline
reviews.isnull().mean().plot.bar(figsize=(6,4), fontsize = 12);

In [22]:
## We can see that some of the comments are missing.
## To continue our analysis, we first drop the rows with missing comments,
## then add the month information by extracting it from the date column.
# .copy() makes the filtered frame independent, so adding the `month` column
# does not assign into a slice of the original (SettingWithCopyWarning).
reviews = reviews[reviews['comments'].notna()].copy()
reviews['month'] = pd.DatetimeIndex(reviews['date']).month
reviews.head()

In [23]:
# Share of all reviews written in each month.
# value_counts() sorts by frequency, so the bars run from the busiest month down.
reviews_per_month = reviews['month'].value_counts()
(reviews_per_month/reviews.shape[0]).plot(kind="bar", figsize=(12,4));
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# The bars are fractions of the total, not raw counts, so label the axes explicitly
plt.xlabel('Month', fontsize = 16)
plt.ylabel('Share of reviews', fontsize = 16)
plt.title("Reviews per month", fontsize = 16);

Conclusions: We can see that August is the busiest month (about 15% of reviews are made in August). Based on the above analysis, September is the most expensive month, followed closely by October. We can see there is no connection between the most expensive and the busiest months (as indicated by the reviews received); in this case the busiest months are in summer.

# **What is the distribution of the listings within the city borders? Are they evenly distributed or concentrated in a few neighbourhoods?**

In [24]:
# To answer these questions, we look once again at the listings dataframe
listings.head()

In [25]:
## 'neighbourhood_cleansed' is one of the columns carrying neighbourhood information;
## count the listings per neighbourhood into a two-column table (name, counts)
area_counts = (
    listings['neighbourhood_cleansed']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='counts')
)
area_counts.head()

In [26]:
## To visualize where listings are most common, take the latitude and longitude columns
listing_coordinates = listings.loc[:, ['latitude', 'longitude']]
listing_coordinates.values

In [28]:
## Draw an interactive folium heatmap from the listing coordinates
from folium import plugins
location_array = listing_coordinates.values
# start from a map centred on Boston
m = folium.Map([42.2826188 , -71.13306793], zoom_start=12)
# build the heat layer first, then attach it to the map
heat_layer = plugins.HeatMap(location_array, radius=20)
m.add_child(heat_layer)
m

# What can be said about the vibe of the neighbourhoods based on reviews?

In [29]:
## We continue our analysis using the listings df, in particular the neighborhood_overview and neighbourhood_cleansed columns
df = listings[['neighborhood_overview', 'neighbourhood_cleansed']]
df.head()

In [30]:
## We can observe from the cell above that some of the neighborhood_overview values are missing
## Our analysis in the subsequent cells is based on counting the top words/phrases in the neighborhood overviews
## Hence, in the next line, we drop every row that has a missing value in either column
df = df.dropna()
df.head()

In [31]:
## Let's look at three sample neighbourhood overviews (positions 0, 100 and 500 of the cleaned frame)
for label, position in (
    ('first overview:  ', 0),
    ('second overview:  ', 100),
    ('third overview:  ', 500),
):
    print(label, df['neighborhood_overview'].values[position])

In [32]:
## We prepare a df with one row per neighbourhood that holds all of its overview texts;
## the most frequent words/phrases in that text can give an indication of the neighbourhood's vibe.
# Join the overviews with a space. Summing the strings concatenates them with no
# separator, which glues the last word of one overview to the first word of the
# next and produces spurious tokens and bigrams.
description = df.groupby('neighbourhood_cleansed')['neighborhood_overview'].agg(' '.join)
# `common_words` starts out empty and is filled in by the bigram step further down
overview = pd.DataFrame({'description': description, 'common_words': ''})
overview['name'] = overview.index
overview.head()

In [33]:
# Here we use sklearn countvectorizer library to determine the most common bigrams in neighbourhood descriptions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
def get_top_n_bigram(corpus, n=None):
    '''
    Return the most frequent bigrams of a text corpus.

    Parameters
    ----------
    corpus : iterable of str, or str
        The raw text documents. A single string is treated as a
        one-document corpus.
    n : int, optional
        Number of bigrams to return. None (the default) returns all of them.

    Returns
    -------
    list of (bigram, count) tuples
        Bigrams with their total count over the whole corpus, most frequent
        first. English stop words are excluded (stop_words='english').

    Notes
    -----
    CountVectorizer raises ValueError when no bigram can be built from the
    corpus (for example when it contains only stop words).
    '''
    # CountVectorizer rejects a bare string, so wrap it into a one-document list
    if isinstance(corpus, str):
        corpus = [corpus]
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass instead of tokenizing the corpus twice (fit, then transform)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total number of occurrences of each bigram over all documents
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [34]:
# Get the top 5 bigrams of every neighbourhood description with get_top_n_bigram
# and store them as one "bigram; bigram; ..." string per neighbourhood.
# The whole column is assigned at once: this avoids chained-indexing assignment
# (overview['common_words'][i] = ...) and integer-position lookups on a Series
# indexed by neighbourhood name, and re-running the cell gives the same result
# instead of appending the bigrams a second time.
overview['common_words'] = overview['description'].map(
    lambda text: ''.join(word + '; ' for word, _freq in get_top_n_bigram([text], 5))
)

In [35]:
# Show the first neighbourhoods together with the top bigrams stored in `common_words`
overview.head()