# Pre: Data Preparation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd 
import descartes


## Read data
lists = pd.read_csv('listings.csv')
pd.set_option('display.max_columns', 500)  # show up to 500 columns when a frame is displayed


def _money_to_float(amounts):
    """Convert currency-formatted values such as '$1,250.00' to floats.

    Parameters
    ----------
    amounts : pandas.Series
        Values that may carry a leading '$' and ',' thousands separators.

    Returns
    -------
    pandas.Series
        Float series; missing inputs pass through the str cast as 'nan'
        and come back as NaN from the final float cast.
    """
    return (
        amounts.astype(str)
        # regex=False makes the literal match explicit: '$' is a regex
        # metacharacter and the default of `regex` differs between pandas
        # versions, so relying on the default is version-dependent.
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )


## Clean data - only for this case, using LA as example
lists['zipcode'] = (
    lists['zipcode'].astype(str)
    # Strip the known non-numeric prefixes/strings seen in this dataset.
    # regex=False so that the '.' in the address is not a wildcard.
    .str.replace('ca', '', regex=False)
    .str.replace('CA', '', regex=False)
    .str.replace('Near ', '', regex=False)
    .str.replace('139 S Valencia Ave, Glendora.', '', regex=False)
    # Removing a prefix can leave leading whitespace (e.g. 'CA 91765' ->
    # ' 91765'), which would make the anchored pattern below miss the zip.
    .str.strip()
    # expand=False returns a Series rather than a one-column DataFrame;
    # values without five leading digits become NaN.
    .str.extract(r'^(\d{5})', expand=False)
)

lists['price'] = _money_to_float(lists['price'])

# Filter out listings for free living. The explicit copy makes the result an
# independent frame so the column assignments below do not raise
# SettingWithCopyWarning.
lists = lists.query('price > 0').copy()
lists['cleaning_fee'] = _money_to_float(lists['cleaning_fee'])

# Year = text before the first '-' of the date string. Because of the cast to
# str, a missing date becomes the string 'nan'.
lists['host_since_year'] = lists['host_since'].astype(str).str.split('-').str[0]  # host join year

# The year of the first review is used as the listing upload year.
lists['list_since_year'] = lists['first_review'].astype(str).str.split('-').str[0]

## Summarize data
#lists.describe()

# Part I: Description of Data

In [None]:
def _plot_yearly_unique_counts(frame, year_col, id_col, count_col, title):
    """Count unique ids per year, draw the counts as a bar chart, return the table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source rows; must contain `year_col` and `id_col`.
    year_col : str
        Column holding the year as a string. Rows whose year is the string
        'nan' (a missing date after a cast to str) are excluded.
    id_col : str
        Column whose distinct values are counted within each year.
    count_col : str
        Name given to the resulting count column (also used as y-axis label).
    title : str
        Chart title.

    Returns
    -------
    pandas.DataFrame
        Two columns: `year_col` and `count_col`, one row per year.
    """
    counts = frame.groupby(year_col)[id_col].nunique().reset_index()
    counts = counts[counts[year_col] != 'nan'].rename(columns={id_col: count_col})
    ax = counts.plot(kind='bar', x=year_col, title=title)
    ax.set_ylabel(count_col)
    return counts


## Bar chart showing the number of new listings per year
## (year taken from list_since_year)
list_growth = _plot_yearly_unique_counts(
    lists, 'list_since_year', 'id', 'new_listing_count', 'New Listing Growth Trend'
)

## Bar chart showing the number of new hosts per year
## (year taken from host_since_year)
host_growth = _plot_yearly_unique_counts(
    lists, 'host_since_year', 'host_id', 'new_host_count', 'New Host Growth Trend'
)