# Exploration of Data Sources
* listings.csv.gz - Detailed listings data for New York City
* listings.csv - Summary information and metrics for listings in New York City
* calendar.csv.gz - Detailed calendar data for listings in New York City
* reviews.csv.gz - Detailed review data for listings in New York City
* reviews.csv - Summary review data and listing ID
* neighbourhoods.csv - Neighbourhood list for geo filter. Sourced from city or open source GIS files
* neighbourhoods.geojson - GeoJSON file of neighbourhoods of the city

For the purposes of this notebook, each file will be explored separately.

In [1]:
import pandas as pd 
import json
import os

def summarizeData(df):
    """Print a structural summary of a DataFrame.

    Prints, in order: the number of variables (columns), the number of
    observations (rows), how many columns there are of each dtype, and the
    column names grouped by dtype.

    param df: pandas DataFrame to summarize
    returns: None (everything is written to stdout)
    """
    n_observations, n_variables = df.shape
    print("Number of Variables: {:d}\n".format(n_variables))
    print("Number of Observations: {:d}\n".format(n_observations))
    print("Data types:")
    dtype_counts = df.dtypes.value_counts()
    print(dtype_counts)
    print("\nColumn Names by Type:\n")
    # Group the column labels by the dtype of the column they name.
    columns_by_dtype = df.columns.to_series().groupby(df.dtypes).groups
    for dtype_key, column_labels in columns_by_dtype.items():
        # Printed as a (dtype, Index) tuple, one line per dtype.
        print((dtype_key, column_labels))

## Available Data

In [2]:
# Data directory.
# Set the AIRBNB_DATA_DIR environment variable to point the notebook at another
# copy of the data; when it is unset, the original author's local path is used.
folder = os.environ.get(
    'AIRBNB_DATA_DIR',
    r'C:\Users\ke392d\Desktop\Master\LTP_Learning Together Program\LTP - Masters Data Science\MAR653\_team_project\data',
)
def absoluteFilePaths(directory):
    """Yield the absolute path of every file found under a directory.

    The directory is traversed with os.walk, so files in nested
    sub-directories are included as well.

    param directory: directory name to walk
    yields: absolute file paths (str), one per file
    """
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)
# Materialize the generator into a list holding the absolute path of every
# file under the data directory, then show it.
myGenerator = absoluteFilePaths(folder)
filelist = list(myGenerator)
print(filelist)

['C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\listings.csv', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\neighbourhoods.csv', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\neighbourhoods.geojson', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\reviews.csv', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\calendar_detailed\\calendar.csv.gz', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR653\\_team_project\\data\\listings_detailed\\listings.csv.gz', 'C:\\Users\\ke392d\\Desktop\\Master\\LTP_Learning Together Program\\LTP - Masters Data Science\\MAR6

## File: listings.csv.gz (detailed)
* Detailed Listings data for New York City
* Contains 50599 observations of 43 numeric and 63 object variables

In [3]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'listings_detailed', 'listings.csv.gz')
# Load the gzip-compressed detailed listings file.
df = pd.read_csv(fname, compression='gzip')
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,id,scrape_id,thumbnail_url,medium_url,xl_picture_url,host_id,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,50599.0,50599.0,0.0,0.0,0.0,50599.0,0.0,50036.0,50036.0,50599.0,...,39415.0,39385.0,39404.0,39381.0,39382.0,50599.0,50599.0,50599.0,50599.0,40379.0
mean,21373800.0,20191200000000.0,,,,78938540.0,,17.38872,17.38872,40.728743,...,9.273272,9.739647,9.743579,9.580585,9.383272,7.330916,5.61517,1.480009,0.128995,1.405862
std,12428640.0,12.21496,,,,90959440.0,,111.558158,111.558158,0.055004,...,1.079284,0.735555,0.751233,0.758823,0.928692,32.757485,32.293312,6.496156,1.027153,1.695551
min,2595.0,20191200000000.0,,,,2438.0,,0.0,0.0,40.49979,...,2.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.01
25%,10098910.0,20191200000000.0,,,,8925493.0,,1.0,1.0,40.68955,...,9.0,10.0,10.0,9.0,9.0,1.0,0.0,0.0,0.0,0.19
50%,21546420.0,20191200000000.0,,,,35726600.0,,1.0,1.0,40.72288,...,10.0,10.0,10.0,10.0,10.0,1.0,1.0,0.0,0.0,0.74
75%,32717800.0,20191200000000.0,,,,129333000.0,,2.0,2.0,40.76303,...,10.0,10.0,10.0,10.0,10.0,2.0,1.0,1.0,0.0,2.11
max,40584020.0,20191200000000.0,,,,314368200.0,,1767.0,1767.0,40.91686,...,10.0,10.0,10.0,10.0,10.0,310.0,310.0,118.0,24.0,52.63


In [4]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the detailed listings frame.
summarizeData(df)

Number of Variables: 106

Number of Observations: 50599

Data types:
object     63
float64    22
int64      21
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['id', 'scrape_id', 'host_id', 'accommodates', 'guests_included',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object'))
(dtype('float64'), Index(['thumbnail_url', 'medium_url', 'xl_picture_url', 'host_acceptance_rate',
       'host_listings_count', 'host_total_listings_count', 'latitude',
       'longitude', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
      

### Sample Observation

In [5]:
# Show the first rows of the detailed listings frame as a sample.
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2595,https://www.airbnb.com/rooms/2595,20191204162729,2019-12-07,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...","- Spacious (500+ft²), immaculate and nicely fu...","Beautiful, spacious skylit studio in the heart...",none,Centrally located in the heart of Manhattan ju...,...,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.39
1,3831,https://www.airbnb.com/rooms/3831,20191204162729,2019-12-07,Cozy Entire Floor of Brownstone,Urban retreat: enjoy 500 s.f. floor in 1899 br...,Greetings! We own a double-duplex brownst...,Urban retreat: enjoy 500 s.f. floor in 1899 br...,none,Just the right mix of urban center and local n...,...,f,f,moderate,f,f,1,1,0,0,4.67
2,5099,https://www.airbnb.com/rooms/5099,20191204162729,2019-12-06,Large Cozy 1 BR Apartment In Midtown East,My large 1 bedroom apartment has a true New Yo...,I have a large 1 bedroom apartment centrally l...,My large 1 bedroom apartment has a true New Yo...,none,My neighborhood in Midtown East is called Murr...,...,f,f,moderate,t,t,1,1,0,0,0.6
3,5121,https://www.airbnb.com/rooms/5121,20191204162729,2019-12-06,BlissArtsSpace!,,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,none,,...,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.38
4,5178,https://www.airbnb.com/rooms/5178,20191204162729,2019-12-05,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"You will use one large, furnished, private roo...",Please don’t expect the luxury here just a bas...,none,"Theater district, many restaurants around here.",...,f,f,strict_14_with_grace_period,f,f,1,0,1,0,3.52


## File: listings.csv (summary)
* Summary information and metrics for listings in New York City (good for visualizations)
* Contains 50599 observations of 10 numeric and 6 object variables

In [6]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'listings.csv')
# Load the summary listings file (replaces the previous contents of df).
df = pd.read_csv(fname)
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,40379.0,50599.0,50599.0
mean,21373800.0,78938540.0,40.728743,-73.950762,158.171782,7.531908,24.809225,1.405862,7.330916,112.785944
std,12428640.0,90959440.0,0.055004,0.047606,348.089562,22.454271,47.317882,1.695551,32.757485,135.891028
min,2595.0,2438.0,40.49979,-74.24787,0.0,1.0,0.0,0.01,1.0,0.0
25%,10098910.0,8925493.0,40.68955,-73.98293,69.0,1.0,1.0,0.19,1.0,0.0
50%,21546420.0,35726600.0,40.72288,-73.95505,105.0,3.0,6.0,0.74,1.0,42.0
75%,32717800.0,129333000.0,40.76303,-73.9342,175.0,5.0,25.0,2.11,2.0,225.0
max,40584020.0,314368200.0,40.91686,-73.71299,10000.0,1250.0,675.0,52.63,310.0,365.0


In [7]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the summary listings frame.
summarizeData(df)

Number of Variables: 16

Number of Observations: 50599

Data types:
int64      7
object     6
float64    3
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['id', 'host_id', 'price', 'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object'))
(dtype('float64'), Index(['latitude', 'longitude', 'reviews_per_month'], dtype='object'))
(dtype('O'), Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
       'room_type', 'last_review'],
      dtype='object'))


### Sample Observation

In [8]:
# Show the first rows of the summary listings frame as a sample.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,10,48,2019-11-04,0.39,1,1
1,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,295,2019-11-22,4.67,1,1
2,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,78,2019-10-13,0.6,1,19
3,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,29,49,2017-10-05,0.38,1,365
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,454,2019-11-21,3.52,1,242


## File: reviews.csv.gz (detailed)
* Detailed Review Data for listings in New York City
* Contains 1255322 observations of 3 numeric and 3 object variables

In [9]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'reviews_detailed', 'reviews.csv.gz')
# Load the gzip-compressed detailed reviews file (replaces the previous df).
df = pd.read_csv(fname, compression='gzip')
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,listing_id,id,reviewer_id
count,1255322.0,1255322.0,1255322.0
mean,13962110.0,291842200.0,83249340.0
std,10764780.0,173448600.0,78202630.0
min,2595.0,903.0,1.0
25%,4081800.0,141565700.0,18768310.0
50%,13156840.0,290004200.0,55921060.0
75%,21733540.0,452663700.0,132493400.0
max,40565660.0,573611200.0,314691700.0


In [10]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the detailed reviews frame.
summarizeData(df)

Number of Variables: 6

Number of Observations: 1255322

Data types:
int64     3
object    3
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id', 'id', 'reviewer_id'], dtype='object'))
(dtype('O'), Index(['date', 'reviewer_name', 'comments'], dtype='object'))


### Sample Observation

In [11]:
# Show the first rows of the detailed reviews frame as a sample.
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...


## File: reviews.csv (summary)
* Summary Review data and Listing ID (to facilitate time based analytics and visualizations linked to a listing)
* Contains 1255322 observations of 1 numeric and 1 object variables
* These are observations of review dates by listing ID.

In [12]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'reviews.csv')
# Load the summary reviews file (replaces the previous contents of df).
df = pd.read_csv(fname)
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,listing_id
count,1255322.0
mean,13962110.0
std,10764780.0
min,2595.0
25%,4081800.0
50%,13156840.0
75%,21733540.0
max,40565660.0


In [13]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the summary reviews frame.
summarizeData(df)

Number of Variables: 2

Number of Observations: 1255322

Data types:
int64     1
object    1
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id'], dtype='object'))
(dtype('O'), Index(['date'], dtype='object'))


### Sample Observation

In [14]:
# Show the first rows of the summary reviews frame as a sample.
df.head()

Unnamed: 0,listing_id,date
0,2595,2009-11-21
1,2595,2009-12-05
2,2595,2009-12-10
3,2595,2010-04-09
4,2595,2010-05-25


## File: calendar.csv.gz (detailed)
> Detailed Calendar Data for listings in New York City

In [15]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'calendar_detailed', 'calendar.csv.gz')
# Load the gzip-compressed detailed calendar file (replaces the previous df).
df = pd.read_csv(fname, compression='gzip')
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,listing_id,minimum_nights,maximum_nights
count,18470160.0,18469500.0,18469500.0
mean,21373650.0,8.528041,688694.8
std,12428610.0,30.29394,38411860.0
min,2595.0,1.0,1.0
25%,10098890.0,1.0,29.0
50%,21546420.0,3.0,1125.0
75%,32717890.0,5.0,1125.0
max,40584020.0,3456.0,2147484000.0


In [16]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the calendar frame.
# NOTE: price and adjusted_price are read as object dtype (strings such as
# '$105.00'), so they must be parsed before any numeric use.
summarizeData(df)

Number of Variables: 7

Number of Observations: 18470156

Data types:
object     4
float64    2
int64      1
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id'], dtype='object'))
(dtype('float64'), Index(['minimum_nights', 'maximum_nights'], dtype='object'))
(dtype('O'), Index(['date', 'available', 'price', 'adjusted_price'], dtype='object'))


### Sample Observation

In [17]:
# Show the first rows of the calendar frame as a sample.
df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,19812,2019-12-05,f,$105.00,$105.00,3.0,90.0
1,2595,2019-12-06,f,$175.00,$175.00,10.0,1125.0
2,2595,2019-12-07,f,$175.00,$175.00,10.0,1125.0
3,2595,2019-12-08,f,$175.00,$175.00,10.0,1125.0
4,2595,2019-12-09,f,$175.00,$175.00,10.0,1125.0


## File: neighbourhoods.csv
> Neighbourhood list for geo filter. Sourced from city or open source GIS files.

In [18]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'neighbourhoods.csv')
# Load the neighbourhood list (replaces the previous contents of df).
df = pd.read_csv(fname)
# Both columns are text, so describe() reports count/unique/top/freq.
df.describe()

Unnamed: 0,neighbourhood_group,neighbourhood
count,230,230
unique,5,230
top,Queens,Ditmars Steinway
freq,53,1


In [19]:
# Print variable/observation counts, dtype counts and column names by dtype
# for the neighbourhoods frame.
summarizeData(df)

Number of Variables: 2

Number of Observations: 230

Data types:
object    2
dtype: int64

Column Names by Type:

(dtype('O'), Index(['neighbourhood_group', 'neighbourhood'], dtype='object'))


### Sample Observation

In [23]:
# Show the first rows of the neighbourhoods frame as a sample.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# position in the notebook; it shows the right data only because df still
# held neighbourhoods.csv when it ran. Re-run the notebook top to bottom.
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood
0,Bronx,Allerton
1,Bronx,Baychester
2,Bronx,Belmont
3,Bronx,Bronxdale
4,Bronx,Castle Hill


## File: neighbourhoods.geojson
> GeoJSON file of neighbourhoods of the city

In [20]:
# Build the path with os.path.join so it does not depend on Windows-style
# backslash separators.
fname = os.path.join(folder, 'neighbourhoods.geojson')
# GeoJSON is UTF-8 encoded; state the encoding explicitly so parsing does not
# depend on the platform's default (e.g. cp1252 on Windows).
with open(fname, 'r', encoding='utf-8') as fin:
    # Parse straight from the file handle instead of reading into a string first.
    data = json.load(fin)

In [21]:
# Top-level keys of the parsed GeoJSON document.
data.keys()

dict_keys(['type', 'features'])

In [22]:
# GeoJSON object type declared by the document.
data['type']

'FeatureCollection'

In [24]:
# Report how many features the collection holds.
feature_count = len(data['features'])
print("There are {:d} feature sets in the collection".format(feature_count))

There are 233 feature sets in the collection


### Sample Observation

In [25]:
# Inspect the first feature: a dict with 'type', 'geometry' (the polygon
# coordinates) and 'properties' (neighbourhood and neighbourhood group names).
data['features'][0]

{'type': 'Feature',
 'geometry': {'type': 'MultiPolygon',
  'coordinates': [[[[-73.766708, 40.614911],
     [-73.768253, 40.614878],
     [-73.773977, 40.616004],
     [-73.773631, 40.616327],
     [-73.768735, 40.620901],
     [-73.767459, 40.620511],
     [-73.766978, 40.616919],
     [-73.766708, 40.614911]]]]},
 'properties': {'neighbourhood': 'Bayswater', 'neighbourhood_group': 'Queens'}}

Each feature holds the boundary of one neighbourhood as a polygon of `[longitude, latitude]` coordinate pairs, together with the `neighbourhood` and `neighbourhood_group` names in its properties.

In [26]:
# Collect the distinct neighbourhood groups named in the feature properties.
neighbourhoodgroups = {
    feat['properties']['neighbourhood_group'] for feat in data['features']
}
print(neighbourhoodgroups)

{'Brooklyn', 'Manhattan', 'Staten Island', 'Queens', 'Bronx'}


In [27]:
# List the neighbourhoods belonging to each neighbourhood group.
# One pass over the features buckets the names by group, keeping file order.
names_by_group = {}
for feat in data['features']:
    props = feat['properties']
    names_by_group.setdefault(props['neighbourhood_group'], []).append(props['neighbourhood'])

for group in neighbourhoodgroups:
    # Neighbourhood names for this group (empty list if the group has none).
    nnames = names_by_group.get(group, [])
    print('\n',"group:",group)
    print(" neighbourhoods:{:d}".format(len(nnames)),'\n')
    print(nnames)


 group: Brooklyn
 neighbourhoods:48 

['Bay Ridge', 'Gerritsen Beach', 'Bath Beach', 'Bedford-Stuyvesant', 'Bensonhurst', 'Bergen Beach', 'Boerum Hill', 'Borough Park', 'Brighton Beach', 'Prospect-Lefferts Gardens', 'Brooklyn Heights', 'Brownsville', 'Bushwick', 'Cypress Hills', 'Canarsie', 'Carroll Gardens', 'Clinton Hill', 'Cobble Hill', 'Crown Heights', 'Columbia St', 'Coney Island', 'DUMBO', 'Downtown Brooklyn', 'Dyker Heights', 'East Flatbush', 'East New York', 'Flatbush', 'Flatlands', 'Gowanus', 'Fort Greene', 'Fort Hamilton', 'Gravesend', 'Kensington', 'Greenpoint', 'Manhattan Beach', 'Midwood', 'Mill Basin', 'Navy Yard', 'Park Slope', 'Prospect Heights', 'Red Hook', 'Sea Gate', 'Sheepshead Bay', 'South Slope', 'Sunset Park', 'Vinegar Hill', 'Windsor Terrace', 'Williamsburg']

 group: Manhattan
 neighbourhoods:32 

['Battery Park City', 'Chinatown', 'Chelsea', 'Civic Center', 'East Harlem', 'East Village', 'Financial District', 'Flatiron District', 'Gramercy', 'Greenwich Villag

## Getting Zipcodes for Coordinates

In [28]:
from uszipcode import SearchEngine
from uszipcode import Zipcode
from uszipcode import SimpleZipcode
# Create the ZIP code search engine and try a first lookup near the second
# vertex of the Bayswater polygon. The arguments are named lat/lng because
# the GeoJSON stores each vertex as [longitude, latitude].
# NOTE(review): `result` is not used by any later cell shown in this notebook.
search = SearchEngine(simple_zipcode=True) # set simple_zipcode=False to use rich info database
result = search.by_coordinates(lat=40.614878, lng=-73.768253, radius=30, returns=1)

In [29]:
# Properties (neighbourhood and neighbourhood group) of the first feature.
data['features'][0]['properties']

{'neighbourhood': 'Bayswater', 'neighbourhood_group': 'Queens'}

In [30]:
# Print every vertex of the first ring of the first polygon of the first
# feature; each vertex is stored as [longitude, latitude].
first_ring = data['features'][0]['geometry']['coordinates'][0][0]
for vertex in first_ring:
    lng, lat = vertex[0], vertex[1]
    print(lng, lat)

-73.766708 40.614911
-73.768253 40.614878
-73.773977 40.616004
-73.773631 40.616327
-73.768735 40.620901
-73.767459 40.620511
-73.766978 40.616919
-73.766708 40.614911


In [31]:
# Look up one ZIP code for the first vertex of the Bayswater polygon.
# NOTE(review): the match shown in the output (11096, Inwood, Nassau County)
# is not in Queens, although the feature is labelled Queens; presumably the
# lookup returns the closest ZIP code centre, which can fall across a
# boundary. Confirm before using this to assign ZIP codes.
search.by_coordinates(lat = 40.614911, lng = -73.766708, returns=1)

[SimpleZipcode(zipcode='11096', zipcode_type='Standard', major_city='Inwood', post_office_city='Inwood, NY', common_city_list=['Inwood', 'Far Rockaway'], county='Nassau County', state='NY', lat=40.62, lng=-73.75, timezone='Eastern', radius_in_miles=1.0, area_code_list=['516', '718', '347', '929'], population=8344, population_density=6827.0, land_area_in_sqmi=1.22, water_area_in_sqmi=0.47, housing_units=2702, occupied_housing_units=2536, median_home_value=384800, median_household_income=48538, bounds_west=-73.767023, bounds_east=-73.737316, bounds_north=40.633387, bounds_south=40.609984)]

In [32]:
# Print the properties (neighbourhood and neighbourhood group) of every feature.
# The stray `data['features'][0]` expression that used to precede the loop was
# removed: it was not the last statement in the cell, so it displayed nothing.
for feature in data['features']:
    print(feature['properties'])

{'neighbourhood': 'Bayswater', 'neighbourhood_group': 'Queens'}
{'neighbourhood': 'Allerton', 'neighbourhood_group': 'Bronx'}
{'neighbourhood': 'City Island', 'neighbourhood_group': 'Bronx'}
{'neighbourhood': 'Ditmars Steinway', 'neighbourhood_group': 'Queens'}
{'neighbourhood': 'Ozone Park', 'neighbourhood_group': 'Queens'}
{'neighbourhood': 'Fordham', 'neighbourhood_group': 'Bronx'}
{'neighbourhood': 'Whitestone', 'neighbourhood_group': 'Queens'}
{'neighbourhood': 'Arden Heights', 'neighbourhood_group': 'Staten Island'}
{'neighbourhood': 'Arrochar', 'neighbourhood_group': 'Staten Island'}
{'neighbourhood': 'Arverne', 'neighbourhood_group': 'Queens'}
{'neighbourhood': 'Bay Ridge', 'neighbourhood_group': 'Brooklyn'}
{'neighbourhood': 'Belmont', 'neighbourhood_group': 'Bronx'}
{'neighbourhood': 'Gerritsen Beach', 'neighbourhood_group': 'Brooklyn'}
{'neighbourhood': 'Port Ivory', 'neighbourhood_group': 'Staten Island'}
{'neighbourhood': 'Soundview', 'neighbourhood_group': 'Bronx'}
{'neig

In [33]:
# Number of features, each of which carries one 'properties' entry; counting
# the features directly avoids building a temporary list just to measure it.
len(data['features'])

233