## Import Packages

In [None]:
import numpy as np
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Raise the pandas display limits so long and wide frames are not truncated
# when a cell renders them.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 300

## Import Dataset

In [None]:
# Read the listings export into the notebook-wide `listing` frame.
listing = pd.read_csv("listings.csv", low_memory=False)

# Convert the host start date from string to datetime.
listing["host_since"] = pd.to_datetime(listing["host_since"])

# Convert `price` from a string to a float column named `price_per_day` by
# stripping the "$" sign and the "," separators.
# regex=False is deliberate: "$" is the end-of-string anchor in a regular
# expression, and the default of `regex` differs between pandas versions, so
# the literal replacement is requested explicitly.
listing["price_per_day"] = (
    listing["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [None]:
# Preview the first two rows of the listing table.
listing.head(2)

In [None]:
# Headline figures for the dataset: number of listing ids, mean daily price
# (truncated to an integer), distinct hosts and distinct neighbourhoods.
listing_total = listing["id"].count()
print("What is the total listing?", listing_total)

mean_daily_price = int(listing["price_per_day"].mean())
print("What is the average price per day?", mean_daily_price)

distinct_hosts = listing["host_id"].nunique()
print("How many hosts in NYC?", distinct_hosts)

distinct_neighbourhoods = listing["neighbourhood_cleansed"].nunique()
print("How many neighborhoods in NYC?", distinct_neighbourhoods)

## Create the Visualization Function - Bar Chart

In [None]:
def bar_chart(x, y, title, xlabel, ylabel, color, legend, weight, height, ymin, ymax, xticks_rotation):
    """Draw a bar chart with a value label above every bar and show it.

    Parameters
    ----------
    x, y
        Bar positions/categories and bar heights, passed to ``ax.bar``.
    title, xlabel
        Axes title and x-axis label.
    ylabel
        Accepted so callers can pass it, but not applied to the axes.
    color
        Bar colour (see the matplotlib named-colours gallery).
    legend
        Label attached to the bars and shown by the legend.
    weight, height
        Figure width and height, passed together as ``figsize``.
    ymin, ymax
        Lower and upper y-axis limits passed to ``set_ylim``.
    xticks_rotation
        Rotation of the x tick labels, e.g. "horizontal" or "vertical".
    """
    figure, axes = plt.subplots(figsize=(weight, height))
    bars = axes.bar(x, y, color=color, label=legend)

    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.legend()
    axes.set_ylim([ymin, ymax])

    # Annotate each bar with its own height, centred horizontally and nudged
    # 3 points above the top edge.
    for bar in bars:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        axes.annotate(
            '{}'.format(bar_top),
            xy=(bar_centre, bar_top),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    plt.xticks(rotation=xticks_rotation)
    figure.tight_layout()
    plt.show()

## Listing and Price Review by New York City Boroughs

In [None]:
# Per-borough summary: number of listing ids and mean daily price, with
# presentation-friendly column names.
boroughs = (
    listing
    .groupby(["neighbourhood_group_cleansed"])
    .agg({"id": "count", "price_per_day": "mean"})
    .reset_index()
    .rename(columns={
        "neighbourhood_group_cleansed": "Boroughs",
        "id": "Listing_Count",
        "price_per_day": "Average_Daily_Price",
    })
)

# Each borough's share of the summed listing counts.
borough_total = boroughs["Listing_Count"].sum()
boroughs["Listing_Pct"] = boroughs["Listing_Count"] / borough_total
boroughs.head(10)

In [None]:
# Order boroughs from most to fewest listings, then chart the counts.
boroughs.sort_values(by="Listing_Count", ascending=False, inplace=True)
bar_chart(
    x=boroughs["Boroughs"],
    y=boroughs["Listing_Count"],
    title="Total Listing by Boroughs",
    xlabel="Boroughs",
    ylabel="Listing",
    color="tab:blue",
    legend="Total Listing",
    weight=12,
    height=4,
    ymin=None,
    ymax=24000,
    xticks_rotation="horizontal",
)

In [None]:
# Order boroughs from highest to lowest mean daily price, then chart the
# averages. The values are cast to int so the bar labels show whole numbers.
boroughs.sort_values(["Average_Daily_Price"], ascending=False, inplace=True)
bar_chart(
    x=boroughs["Boroughs"],
    y=boroughs["Average_Daily_Price"].astype(int),
    legend="Avg Daily Price",
    color="tab:orange",
    title="Average Daily Price by Boroughs",
    xlabel="Boroughs",
    # The y values are prices, not listing counts.
    ylabel="Average Daily Price",
    weight=12,
    height=4,
    ymin=None,
    ymax=225,
    xticks_rotation="horizontal",
)

## Listing and Price Review by Neighborhood

In [None]:
# Listing count and mean daily price for every (borough, neighbourhood) pair.
neighborhood = (
    listing
    .groupby(["neighbourhood_group_cleansed", "neighbourhood_cleansed"])
    .agg({"id": "count", "price_per_day": "mean"})
    .reset_index()
)

neighborhood.rename(columns={"neighbourhood_cleansed": "Neighbourhood",
                             "id": "Listing_Count",
                             "price_per_day": "Average_Daily_Price"}, inplace=True)

# Share of listings per neighbourhood. The denominator is this frame's own
# total, so the shares always sum to 1 and the cell does not depend on the
# borough summary having been built (or left unmodified) beforehand.
neighborhood["Listing_Pct"] = (
    neighborhood["Listing_Count"] / neighborhood["Listing_Count"].sum()
)
neighborhood.sort_values(["Listing_Count"], ascending=False, inplace=True)
neighborhood.head(10)

In [None]:
# Order neighbourhoods from most to fewest listings and chart the top ten.
neighborhood.sort_values(by="Listing_Count", ascending=False, inplace=True)
bar_chart(
    x=neighborhood["Neighbourhood"].head(10),
    y=neighborhood["Listing_Count"].head(10),
    title="Total Listing by New York City Neighborhood",
    xlabel="Neighborhood",
    ylabel="Listing",
    color="tab:blue",
    legend="Total Listing",
    weight=12,
    height=5,
    ymin=None,
    ymax=4500,
    xticks_rotation="vertical",
)

In [None]:
# Order neighbourhoods from highest to lowest mean daily price and chart the
# ten most expensive ones.
neighborhood.sort_values(["Average_Daily_Price"], ascending=False, inplace=True)
bar_chart(
    x=neighborhood["Neighbourhood"].head(10),
    # Take the top ten first and only then cast to int: the descending sort
    # puts missing averages last, so a NaN further down the column cannot
    # break the integer conversion.
    y=neighborhood["Average_Daily_Price"].head(10).astype(int),
    legend="Average_Daily_Price",
    color="tab:orange",
    # This chart shows prices, so the title and y label say so.
    title="Average Daily Price by New York City Neighborhood",
    xlabel="Neighborhood",
    ylabel="Average Daily Price",
    weight=15,
    height=5,
    ymin=None,
    ymax=850,
    xticks_rotation="vertical",
)

## Listing and Price Review by Property Type

In [None]:
# Per-property-type summary: number of listing ids and mean daily price,
# with presentation-friendly column names.
property_type = (
    listing
    .groupby(["property_type"])
    .agg({"id": "count", "price_per_day": "mean"})
    .reset_index()
    .rename(columns={
        "property_type": "Property_Type",
        "id": "Listing_Count",
        "price_per_day": "Average_Daily_Price",
    })
)

# Each property type's share of the summed listing counts.
property_type_total = property_type["Listing_Count"].sum()
property_type["Listing_Pct"] = property_type["Listing_Count"] / property_type_total
property_type.head(10)

In [None]:
# Order property types from most to fewest listings and chart the top ten.
property_type.sort_values(by="Listing_Count", ascending=False, inplace=True)
bar_chart(
    x=property_type["Property_Type"].head(10),
    y=property_type["Listing_Count"].head(10),
    title="Total Listing by Property Type",
    xlabel="Property Type",
    ylabel="Listing",
    color="tab:blue",
    legend="Total Listing",
    weight=15,
    height=6,
    ymin=None,
    ymax=45000,
    xticks_rotation="vertical",
)

In [None]:
# Order property types from highest to lowest mean daily price and chart the
# ten most expensive ones.
property_type.sort_values(["Average_Daily_Price"], ascending=False, inplace=True)
bar_chart(
    x=property_type["Property_Type"].head(10),
    # Take the top ten first and only then cast to int: the descending sort
    # puts missing averages last, so a NaN further down the column cannot
    # break the integer conversion.
    y=property_type["Average_Daily_Price"].head(10).astype(int),
    legend="Average Daily Price",
    color="tab:orange",
    title="Average Daily Price by Property Type",
    xlabel="Property Type",
    # The y values are prices, not listing counts.
    ylabel="Average Daily Price",
    weight=15,
    height=6,
    ymin=None,
    ymax=1400,
    xticks_rotation="vertical",
)

## Host Review by Host Year

In [None]:
# Year of `host_since` as a 4-digit string; rows with no start date get None.
listing["host_year"] = np.where(
    listing["host_since"].isna(),
    None,
    listing["host_since"].dt.strftime('%Y'),
)

# Number of listing ids per host start year (stored as `host_count`), plus the
# running total across years.
listing_count = (
    listing
    .groupby(["host_year"])[["id"]]
    .count()
    .reset_index()
    .rename(columns={"id": "host_count"})
)
listing_count["host_in_total"] = listing_count["host_count"].cumsum()
listing_count

In [None]:
# Chart the per-year counts from `listing_count`.
bar_chart(
    x=listing_count["host_year"],
    y=listing_count["host_count"],
    title="Listing by Year",
    xlabel="Year",
    ylabel="Listing",
    color="tab:blue",
    legend="Listing",
    weight=12,
    height=4,
    ymin=None,
    ymax=9000,
    xticks_rotation="horizontal",
)

## Correlation between Price and Attributes

In [None]:
# Columns compared against the daily price. `.copy()` gives df_corr its own
# data, so the column assignment below does not write into a slice of
# `listing` (which triggers pandas' SettingWithCopyWarning).
df_corr = listing[[
                "price_per_day",
                "host_has_profile_pic",
                #"host_identity_verified",
                "accommodates",
                "bathrooms",
                "bedrooms",
                "beds",
                "square_feet",
                "number_of_reviews",
                #"review_scores_value",
                "cleaning_fee",
                "guests_included",
                "extra_people",
                ]].copy()

# NOTE(review): `price` needed "$" and "," stripped before it could be cast to
# float; `cleaning_fee` and `extra_people` may be stored the same way. Confirm
# their dtypes with df_corr.info() before trusting their correlations.

# Encode the "t" flag as 1; every other value (including missing) becomes 0.
df_corr["host_has_profile_pic"] = np.where(df_corr["host_has_profile_pic"] == "t", 1, 0)
# df_corr["host_identity_verified"] = np.where(df_corr["host_identity_verified"] == "t", 1, 0)

In [None]:
#df_corr.groupby(["host_identity_verified"])["host_identity_verified"].count()

In [None]:
# Print the dtype and non-null count of every column in df_corr.
df_corr.info()

In [None]:
# Pairwise correlation between the numeric attributes.
# From pandas 2.0, DataFrame.corr() raises on non-numeric columns
# (numeric_only defaults to False); older versions silently skipped them.
# Selecting the numeric/bool columns first gives the same matrix either way.
corr = df_corr.select_dtypes(include=["number", "bool"]).corr()

# Render the matrix with a colour gradient as the cell output.
corr.style.background_gradient()