## Functions for plotting
##### Since there are four cities in total and the dataframes share a unified format, using functions for the repetitive plotting is more efficient.

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as img
from PIL import Image
import requests
from io import BytesIO
import seaborn as sns
from scipy.stats import norm
from scipy import stats
%matplotlib inline

In [3]:
# import dataset
def import_df(web):
    """Load a listings CSV and convert its 'price' column to float.

    web: anything pandas.read_csv accepts (URL, local path or file-like object).
    Returns the loaded dataframe with 'price' stored as float.
    """
    df = pd.read_csv(web)
    price = df['price']
    # a price column that was parsed as numbers has no text to clean
    if not pd.api.types.is_numeric_dtype(price):
        # regex=False: '$' has to be matched literally; read as a regular
        # expression it is the end-of-string anchor, not the dollar sign
        price = price.str.replace('$', '', regex=False)
        price = price.str.replace(',', '', regex=False)
    df['price'] = price.astype(float)
    return df

In [4]:
# import the image
def import_img(web):
    """Fetch the resource at URL `web` and return it opened as a PIL image."""
    payload = requests.get(web).content
    # wrap the raw bytes in an in-memory buffer so PIL can read them like a file
    return Image.open(BytesIO(payload))

In [5]:
# get the max/min value of longitude and latitude
def area(df):
    """Return the bounding box of the listings.

    df: dataframe with 'longitude' and 'latitude' columns.
    Returns (min longitude, max longitude, min latitude, max latitude).
    """
    lon = df['longitude']
    lat = df['latitude']
    return lon.min(), lon.max(), lat.min(), lat.max()

In [6]:
# get the overall geographic location for all houses
def location(city,df,sqrt,colorsize,figsize1,figsize2):
    mpl.rcParams.update(mpl.rcParamsDefault)
    %matplotlib inline
    
    cmap = plt.cm.coolwarm
    n = mpl.colors.Normalize()
    fig,ax = plt.subplots(figsize=(figsize1,figsize2))
    df.plot.scatter(ax=ax,x='longitude',y='latitude',s=df['price']**sqrt,
                    color=cmap(n(df['price'].values)*colorsize))
    ax.set_xlim(df['longitude'].min()-0.01,df['longitude'].max()+0.01)
    ax.set_ylim(df['latitude'].min()-0.01,df['latitude'].max()+0.01)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title(city+' Airbnb Locations',size=14,fontweight='bold')
    return fig,ax

In [7]:
# get the geographic location under each room type category
def room_type_location(df,sqrt,colorsize,figsize1,figsize2):
    mpl.rcParams.update(mpl.rcParamsDefault)
    %matplotlib inline
    
    rtype = df.groupby(['id','room_type','latitude','longitude'],as_index=False)[['price']].mean()
    types = df['room_type'].unique()
    num = len(types)
    cmap = plt.cm.coolwarm
    n = mpl.colors.Normalize()
    fig,ax = plt.subplots(1,num,figsize=(figsize1*num,figsize2))
    for i in range(0,num):
        tprice = rtype.loc[rtype['room_type']==types[i],:]
        tprice.plot.scatter(ax=ax[i],x='longitude',y='latitude',s=tprice['price']**sqrt,
                            color=cmap(n(tprice['price'].values)*colorsize))
        ax[i].set_xlim(df['longitude'].min()-0.01,df['longitude'].max()+0.01)
        ax[i].set_ylim(df['latitude'].min()-0.01,df['latitude'].max()+0.01)
        ax[i].set_title(types[i],size=13,fontweight='bold')
        ax[i].set_xlabel('Longitude')
        ax[i].set_ylabel('Latitude')
    return fig,ax

In [8]:
# get the top 10 hosts with the most house listings
def top_hosts(city,df):
    """Bar chart of the 10 host ids that appear most often in df['host_id'].

    city: name used in the plot title.
    df: dataframe with a 'host_id' column.
    Draws on the current axes and returns nothing.
    """
    top_host = df['host_id'].value_counts()[:10]
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally
    ax = sns.barplot(x=top_host.index, y=top_host.values, order=top_host.index)
    ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
    ax.set_title('Hosts with the most listings in '+city,size=14,fontweight='bold')
    ax.set_ylabel('Count of listings')
    ax.set_xlabel('Host IDs')

In [9]:
# count number of listings in each neighbourhood
def count_nei(city,df):
    mpl.rcParams.update(mpl.rcParamsDefault)
    %matplotlib inline
    fig,ax = plt.subplots(figsize=(18,5))
    ax = sns.countplot(x='neighbourhood',data=df)
    ax.set_xlabel('Neighbourhood group')
    ax.set_ylabel('Count')
    ax.set_title('Counts of listings in neighbourhoods',fontsize=14,fontweight='bold')
    ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
    return fig,ax

In [10]:
# plot pie charts for number of listings in each neighbourhood
def nei_pie(city,df):
    """Pie chart of listing counts per neighbourhood.

    city: 'NYC' groups by 'neighbourhood_group' with a fixed five-slice
        explode pattern; any other value groups by 'neighbourhood' with
        every slice offset by 0.1.
    df: dataframe holding the column selected above.
    Returns the Axes containing the pie.
    """
    is_nyc = city == 'NYC'
    source = df.neighbourhood_group if is_nyc else df.neighbourhood
    counts = source.value_counts()
    labels = counts.index
    sizes = counts.values
    if is_nyc:
        explode = (0.1, 0.2, 0.3, 0.4, 0.6)
    else:
        explode = np.full(len(sizes),0.1)

    fig, ax = plt.subplots()
    pie_parts = ax.pie(sizes, explode=explode,labels=labels, autopct='%1.1f%%',
                       shadow=True, startangle=90)
    wedges = pie_parts[0]
    # equal aspect ratio so the pie is drawn as a circle
    ax.axis('equal')
    ax.set_title('Most rented neighbourhood group pie chart',size=12,fontweight='bold')
    # legend sits to the right of the pie
    ax.legend(wedges, labels,
              title='Neighbourhood groups',
              loc='center left',
              bbox_to_anchor=(1, 0, 0.5, 1))
    return ax

In [11]:
# get price distribution of each neighbourhood
def nei_stats(city,df):
    """Tabulate the price distribution of each neighbourhood.

    city: 'NYC' groups by 'neighbourhood_group'; any other value groups by
        'neighbourhood'.
    df: dataframe with a 'price' column and the grouping column above.
    Returns a dataframe indexed by 'Stats' (min, 25%, 50%, 75%, max) with
    one column per neighbourhood, in order of first appearance in df.
    """
    col = 'neighbourhood_group' if city == 'NYC' else 'neighbourhood'
    # sort=False keeps the neighbourhoods in the order they first occur
    summary = df.groupby(col, sort=False)['price'].describe(percentiles=[.25, .50, .75])
    # keep only the five-number summary, selected by label rather than position
    stats_df = summary[['min', '25%', '50%', '75%', 'max']].T
    stats_df.index.name = 'Stats'
    stats_df.columns.name = None
    return stats_df

In [12]:
# violin plot to show density and distribution of prices
def plot_distribution(city,df,dropv):
    """Violin plot of prices per neighbourhood, ignoring extreme prices.

    city: 'NYC' groups by 'neighbourhood_group' with horizontal tick labels;
        any other value groups by 'neighbourhood' with labels rotated 45 degrees.
    df: dataframe with a 'price' column and the grouping column above.
    dropv: only rows with price strictly below this value are plotted.
    Returns (fig, ax).
    """
    if city == 'NYC':
        col = 'neighbourhood_group'
        angle = 0
    else:
        col = 'neighbourhood'
        angle = 45
    # create a sub-dataframe without the extreme values (price >= dropv)
    no_extreme = df[df.price < dropv]
    ax = sns.violinplot(data=no_extreme,x=col,y='price')
    ax.set_xlabel('Neighbourhood group')
    ax.set_ylabel('Price')
    ax.set_title('Density and distribution of prices for each neighbourhood group',fontweight='bold')
    ax.set_xticklabels(ax.get_xticklabels(),rotation=angle)
    # no figure is created in this function, so take the one that owns the axes
    fig = ax.figure
    return fig,ax

In [13]:
# get top 10 neighborhoods with the most listings
def top_nei(city,df):
    """Horizontal bar chart of the 10 neighbourhoods with the most listings.

    city: name used in the plot title.
    df: dataframe with a 'neighbourhood' column.
    Returns (fig, ax).
    """
    nbhd = df.neighbourhood.value_counts()[:10]
    # bind the figure as `fig`, the name that is returned below
    fig,ax = plt.subplots(figsize=(6,4))
    names = list(nbhd.index)
    counts = list(nbhd.values)
    # counts on x and names on y give horizontal bars; x and y are passed by
    # keyword because recent seaborn releases no longer accept them positionally
    sns.barplot(x=counts,y=names,ax=ax)
    ax.set_title('Most Popular Neighbourhood in '+city,size=14,fontweight='bold')
    ax.set_ylabel("Neighbourhood area")
    ax.set_xlabel("Number of guest hosted in this Area")
    return fig,ax

In [14]:
# get the number of rooms under each room type in the top 10 neighborhoods
def top_nei_room_type(city,df):
    """Count listings per room type within the 10 busiest neighbourhoods.

    city: 'NYC' additionally colours the bars by 'neighbourhood_group'.
    df: dataframe with 'neighbourhood' and 'room_type' columns (plus
        'neighbourhood_group' for NYC).
    Returns the object produced by sns.catplot, one column per room type.
    """
    busiest = df.neighbourhood.value_counts()[:10].index
    subset = df.loc[df['neighbourhood'].isin(busiest)]
    plot_args = dict(x='neighbourhood',col='room_type',data=subset,kind='count')
    if city == 'NYC':
        plot_args['hue'] = 'neighbourhood_group'
    grid = sns.catplot(**plot_args)
    grid.set_xticklabels(rotation=45)
    return grid

In [15]:
# colormap to display availability_365
def ava365(city,df,figsize1,figsize2):
    """Scatter listings by location, coloured by their 'availability_365' value.

    city: name used in the plot title.
    df: dataframe with 'longitude', 'latitude' and 'availability_365' columns.
    figsize1, figsize2: the two values passed as the figure's figsize.
    Returns (fig, ax).
    """
    fig,ax = plt.subplots(figsize=(figsize1,figsize2))
    df.plot(
        kind='scatter',
        x='longitude',
        y='latitude',
        label='availability_365',
        c='availability_365',
        cmap='YlGnBu',
        alpha=0.6,
        colorbar=True,
        ax=ax,
    )
    ax.set_title('Availability of rooms in '+city,size=14,fontweight='bold')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # the colorbar already explains the colours, so drop the legend box
    legend = ax.legend()
    legend.remove()
    return fig,ax