# Exploration of Data Sources
* listings.csv.gz - Detailed listings data for New York City
* listings.csv - Summary information and metrics for listings in New York City
* calendar.csv.gz - Detailed calendar data for listings in New York City
* reviews.csv.gz - Detailed review data for listings in New York City
* reviews.csv - Summary review data and listing ID
* neighbourhoods.csv - Neighbourhood list for geo filter. Sourced from city or open source GIS files
* neighbourhoods.geojson - GeoJSON file of neighbourhoods of the city

For the purposes of this notebook, each file will be explored separately.

In [0]:
import pandas as pd 
import json
import os

# function to summarize data
# param: df dataframe
# print number of variables, data type counts, and variable names
def summarizeData(df):
    print("Number of Variables: {:d}\n".format(df.shape[1]))
    print("Number of Observations: {:d}\n".format(df.shape[0]))
    print("Data types:")
    print(df.dtypes.value_counts())    
    print("\nColumn Names by Type:\n")
    g = df.columns.to_series().groupby(df.dtypes).groups
    for item in g.items():
        print(item)

### Load Data Sources

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd

file_path = '/content/drive/My Drive/MAR 653/Final Project/data/{}'

calendar_df = pd.read_csv(file_path.format('calendar.csv.gz'), compression='gzip')
listings_summary_df = pd.read_csv(file_path.format('listings.csv'))
listings_detail_df = pd.read_csv(file_path.format('listings.csv.gz'), compression='gzip')
neighbourhoods_df = pd.read_csv(file_path.format('neighbourhoods.csv'))
neighbourhoods_geo_df = pd.read_json(file_path.format('neighbourhoods.geojson'))
reviews_summary_df = pd.read_csv(file_path.format('reviews.csv'))
reviews_detail_df = pd.read_csv(file_path.format('reviews.csv.gz'), compression='gzip')
prizm_zipcode_df = pd.read_excel(file_path.format('prizm_attributes.xlsx'), sheet_name='zipcodes')
prizm_attributes_df = pd.read_excel(file_path.format('prizm_attributes.xlsx'), sheet_name='attributes')

In [4]:
os.listdir('drive/My Drive/MAR 653/Final Project/data')

['calendar.csv.gz',
 'listings.csv.gz',
 'reviews.csv.gz',
 'listings_detailed',
 'reviews_detailed',
 'calendar_detailed',
 'neighbourhoods.csv',
 'neighbourhoods.geojson',
 'listings.csv',
 'reviews.csv',
 'prizm_attributes.xlsx',
 'listings_cleaned.csv',
 'listings_201512.csv',
 'listings_201612.csv',
 'listings_201712.csv',
 'listings_201812.csv',
 'listings_201912.csv']

In [0]:
# read geojson into dictionary
import json
with open('drive/My Drive/MAR 653/Final Project/data/neighbourhoods.geojson', 'r') as fin:
  geo = json.loads(fin.read())

## File: listings.csv.gz (detailed)
* Detailed Listings data for New York City
* Contains 50599 observations of 43 numeric and 63 object variables

In [6]:
listings_detail_df.describe()

Unnamed: 0,id,scrape_id,thumbnail_url,medium_url,xl_picture_url,host_id,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,square_feet,guests_included,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,50599.0,50599.0,0.0,0.0,0.0,50599.0,0.0,50036.0,50036.0,50599.0,50599.0,50599.0,50548.0,50536.0,50467.0,386.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,39437.0,39401.0,39415.0,39385.0,39404.0,39381.0,39382.0,50599.0,50599.0,50599.0,50599.0,40379.0
mean,21373800.0,20191200000000.0,,,,78938540.0,,17.38872,17.38872,40.728743,-73.950762,2.86063,1.150332,1.181217,1.549626,699.533679,1.521196,7.531908,43836.4,7.140576,9.103955,680470.4,1020003.0,8.531299,688194.7,7.125378,19.373426,32.337121,112.785944,24.809225,9.294986,93.842305,9.612649,9.273272,9.739647,9.743579,9.580585,9.383272,7.330916,5.61517,1.480009,0.128995,1.405862
std,12428640.0,12.21496,,,,90959440.0,,111.558158,111.558158,0.055004,0.047606,1.900865,0.442102,0.760981,1.131406,519.171703,1.162624,22.454271,9547645.0,21.40683,34.37932,38181820.0,46759200.0,28.863199,38186670.0,9.509439,21.520797,34.449508,135.891028,47.317882,16.22853,8.721777,0.850623,1.079284,0.735555,0.751233,0.758823,0.928692,32.757485,32.293312,6.496156,1.027153,1.695551
min,2595.0,20191200000000.0,,,,2438.0,,0.0,0.0,40.49979,-74.24787,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.01
25%,10098910.0,20191200000000.0,,,,8925493.0,,1.0,1.0,40.68955,-73.98293,2.0,1.0,1.0,1.0,350.0,1.0,1.0,29.0,1.0,2.0,29.0,29.0,1.3,29.0,0.0,0.0,0.0,0.0,1.0,0.0,92.0,9.0,9.0,10.0,10.0,9.0,9.0,1.0,0.0,0.0,0.0,0.19
50%,21546420.0,20191200000000.0,,,,35726600.0,,1.0,1.0,40.72288,-73.95505,2.0,1.0,1.0,1.0,700.0,1.0,3.0,1120.0,2.0,3.0,1125.0,1125.0,3.0,1125.0,1.0,8.0,15.0,42.0,6.0,2.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,1.0,0.0,0.0,0.74
75%,32717800.0,20191200000000.0,,,,129333000.0,,2.0,2.0,40.76303,-73.9342,4.0,1.0,1.0,2.0,903.0,2.0,5.0,1125.0,5.0,5.0,1125.0,1125.0,5.0,1125.0,13.0,39.0,67.0,225.0,25.0,12.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,2.0,1.0,1.0,0.0,2.11
max,40584020.0,20191200000000.0,,,,314368200.0,,1767.0,1767.0,40.91686,-73.71299,25.0,15.5,21.0,40.0,3700.0,16.0,1250.0,2147484000.0,1250.0,3456.0,2147484000.0,2147484000.0,1486.2,2147484000.0,30.0,60.0,90.0,365.0,675.0,407.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,310.0,310.0,118.0,24.0,52.63


In [7]:
summarizeData(listings_detail_df)

Number of Variables: 106

Number of Observations: 50599

Data types:
object     63
float64    22
int64      21
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['id', 'scrape_id', 'host_id', 'accommodates', 'guests_included',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object'))
(dtype('float64'), Index(['thumbnail_url', 'medium_url', 'xl_picture_url', 'host_acceptance_rate',
       'host_listings_count', 'host_total_listings_count', 'latitude',
       'longitude', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
      

### Sample Observation

In [0]:
listings_detail_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,...,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2595,https://www.airbnb.com/rooms/2595,20191204162729,2019-12-07,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...","- Spacious (500+ft²), immaculate and nicely fu...","Beautiful, spacious skylit studio in the heart...",none,Centrally located in the heart of Manhattan ju...,,Apartment is located on 37th Street between 5t...,"Guests have full access to the kitchen, bathro...",I am a Sound Therapy Practitioner and Kundalin...,"Make yourself at home, respect the space and t...",,,https://a0.muscache.com/im/pictures/f0813a11-4...,,2845,https://www.airbnb.com/users/show/2845,Jennifer,2008-09-09,"New York, New York, United States",A New Yorker since 2000! My passion is creatin...,within a day,85%,,f,https://a0.muscache.com/im/users/2845/profile_...,https://a0.muscache.com/im/users/2845/profile_...,Midtown,6.0,6.0,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"New York, NY, United States",Midtown,Midtown,...,$0.00,10,1125,10,10,1125,1125,10.0,1125.0,2 weeks ago,t,1,1,1,1,2019-12-07,48,7,2009-11-21,2019-11-04,94.0,9.0,9.0,10.0,10.0,10.0,9.0,f,,,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.39
1,3831,https://www.airbnb.com/rooms/3831,20191204162729,2019-12-07,Cozy Entire Floor of Brownstone,Urban retreat: enjoy 500 s.f. floor in 1899 br...,Greetings! We own a double-duplex brownst...,Urban retreat: enjoy 500 s.f. floor in 1899 br...,none,Just the right mix of urban center and local n...,,B52 bus for a 10-minute ride to downtown Brook...,You will have exclusive use of and access to: ...,"We'll be around, but since you have the top fl...",Smoking - outside please; pets allowed but ple...,,,https://a0.muscache.com/im/pictures/e49999c2-9...,,4869,https://www.airbnb.com/users/show/4869,LisaRoxanne,2008-12-07,"New York, New York, United States",Laid-back bi-coastal actor/professor/attorney.,within an hour,100%,,f,https://a0.muscache.com/im/users/4869/profile_...,https://a0.muscache.com/im/users/4869/profile_...,Clinton Hill,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",t,t,"Brooklyn, NY, United States",Brooklyn,Clinton Hill,...,$0.00,1,730,1,1,730,730,1.0,730.0,2 weeks ago,t,1,1,1,1,2019-12-07,295,75,2014-09-30,2019-11-22,90.0,9.0,9.0,10.0,9.0,10.0,9.0,f,,,f,f,moderate,f,f,1,1,0,0,4.67
2,5099,https://www.airbnb.com/rooms/5099,20191204162729,2019-12-06,Large Cozy 1 BR Apartment In Midtown East,My large 1 bedroom apartment has a true New Yo...,I have a large 1 bedroom apartment centrally l...,My large 1 bedroom apartment has a true New Yo...,none,My neighborhood in Midtown East is called Murr...,Read My Full Listing For All Information. New ...,From the apartment is a 10 minute walk to Gran...,I will meet you upon arrival.,I usually check in with guests via text or ema...,• Check-in time is 2PM. • Check-out time is 12...,,,https://a0.muscache.com/im/pictures/24020910/1...,,7322,https://www.airbnb.com/users/show/7322,Chris,2009-02-02,"New York, New York, United States","I'm an artist, writer, traveler, and a native ...",,,,f,https://a0.muscache.com/im/pictures/user/26745...,https://a0.muscache.com/im/pictures/user/26745...,Flatiron District,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,f,"New York, NY, United States",Manhattan,Murray Hill,...,$100.00,3,21,3,3,21,21,3.0,21.0,2 weeks ago,t,19,19,19,19,2019-12-06,78,8,2009-04-20,2019-10-13,90.0,10.0,9.0,10.0,10.0,10.0,9.0,f,,,f,f,moderate,t,t,1,1,0,0,0.6
3,5121,https://www.airbnb.com/rooms/5121,20191204162729,2019-12-06,BlissArtsSpace!,,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,none,,,,,,,,,https://a0.muscache.com/im/pictures/2090980c-b...,,7356,https://www.airbnb.com/users/show/7356,Garon,2009-02-03,"New York, New York, United States","I am an artist(painter, filmmaker) and curato...",within a few hours,100%,,f,https://a0.muscache.com/im/pictures/user/72a61...,https://a0.muscache.com/im/pictures/user/72a61...,Bedford-Stuyvesant,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Bedford-Stuyvesant,...,$30.00,29,730,29,29,730,730,29.0,730.0,23 months ago,t,30,60,90,365,2019-12-06,49,0,2009-05-28,2017-10-05,90.0,8.0,8.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.38
4,5178,https://www.airbnb.com/rooms/5178,20191204162729,2019-12-05,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"You will use one large, furnished, private roo...",Please don’t expect the luxury here just a bas...,none,"Theater district, many restaurants around here.",Reservation should be made at least a few days...,,Bathroom is shared with the host but the kitch...,,"No smoking in the room. When you check out, pl...",,,https://a0.muscache.com/im/pictures/12065/f070...,,8967,https://www.airbnb.com/users/show/8967,Shunichi,2009-03-03,"New York, New York, United States",I used to work for a financial industry but no...,within a few hours,100%,,f,https://a0.muscache.com/im/users/8967/profile_...,https://a0.muscache.com/im/users/8967/profile_...,Hell's Kitchen,1.0,1.0,"['email', 'phone', 'facebook', 'reviews']",t,f,"New York, NY, United States",Manhattan,Hell's Kitchen,...,$12.00,2,14,1,2,14,14,1.8,14.0,2 months ago,t,3,12,40,242,2019-12-05,454,47,2009-05-06,2019-11-21,84.0,9.0,7.0,9.0,9.0,10.0,8.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,3.52


## File: listings.csv (summary)
* Summary information and metrics for listings in New York City (good for visualizations)
* Contains 50599 observations of 10 numeric and 6 object variables

In [45]:
listings_summary_df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,50599.0,40379.0,50599.0,50599.0
mean,21373800.0,78938540.0,40.728743,-73.950762,158.171782,7.531908,24.809225,1.405862,7.330916,112.785944
std,12428640.0,90959440.0,0.055004,0.047606,348.089562,22.454271,47.317882,1.695551,32.757485,135.891028
min,2595.0,2438.0,40.49979,-74.24787,0.0,1.0,0.0,0.01,1.0,0.0
25%,10098910.0,8925493.0,40.68955,-73.98293,69.0,1.0,1.0,0.19,1.0,0.0
50%,21546420.0,35726600.0,40.72288,-73.95505,105.0,3.0,6.0,0.74,1.0,42.0
75%,32717800.0,129333000.0,40.76303,-73.9342,175.0,5.0,25.0,2.11,2.0,225.0
max,40584020.0,314368200.0,40.91686,-73.71299,10000.0,1250.0,675.0,52.63,310.0,365.0


In [0]:
summarizeData(listings_summary_df)

Number of Variables: 16

Number of Observations: 50599

Data types:
int64      7
object     6
float64    3
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['id', 'host_id', 'price', 'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object'))
(dtype('float64'), Index(['latitude', 'longitude', 'reviews_per_month'], dtype='object'))
(dtype('O'), Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
       'room_type', 'last_review'],
      dtype='object'))


### Sample Observation

In [0]:
listings_summary_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,10,48,2019-11-04,0.39,1,1
1,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,295,2019-11-22,4.67,1,1
2,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,78,2019-10-13,0.6,1,19
3,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,29,49,2017-10-05,0.38,1,365
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,454,2019-11-21,3.52,1,242


## File: reviews.csv.gz (detailed)
* Detailed Review Data for listings in New York City
* Contains 1255322 observations of 3 numeric and 3 object variables

In [0]:
reviews_detail_df.describe()

Unnamed: 0,listing_id,id,reviewer_id
count,1255322.0,1255322.0,1255322.0
mean,13962110.0,291842200.0,83249340.0
std,10764780.0,173448600.0,78202630.0
min,2595.0,903.0,1.0
25%,4081800.0,141565700.0,18768310.0
50%,13156840.0,290004200.0,55921060.0
75%,21733540.0,452663700.0,132493400.0
max,40565660.0,573611200.0,314691700.0


In [0]:
summarizeData(reviews_detail_df)

Number of Variables: 6

Number of Observations: 1255322

Data types:
object    3
int64     3
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id', 'id', 'reviewer_id'], dtype='object'))
(dtype('O'), Index(['date', 'reviewer_name', 'comments'], dtype='object'))


### Sample Observation

In [8]:
reviews_detail_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...


## File: reviews.csv (summary)
* Summary Review data and Listing ID (to facilitate time based analytics and visualizations linked to a listing)
* Contains 1255322 observations of 1 numeric and 1 object variables
* These are observations of review dates by listing ID.

In [0]:
reviews_summary_df.describe()

Unnamed: 0,listing_id
count,1255322.0
mean,13962110.0
std,10764780.0
min,2595.0
25%,4081800.0
50%,13156840.0
75%,21733540.0
max,40565660.0


In [0]:
summarizeData(reviews_summary_df)

Number of Variables: 2

Number of Observations: 1255322

Data types:
object    1
int64     1
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id'], dtype='object'))
(dtype('O'), Index(['date'], dtype='object'))


### Sample Observation

In [0]:
reviews_summary_df.head()

Unnamed: 0,listing_id,date
0,2595,2009-11-21
1,2595,2009-12-05
2,2595,2009-12-10
3,2595,2010-04-09
4,2595,2010-05-25


## File: calendar.csv.gz (detailed)
> Detailed Calendar Data for listings in New York City

In [0]:
calendar_df.describe()

Unnamed: 0,listing_id,minimum_nights,maximum_nights
count,18470160.0,18469500.0,18469500.0
mean,21373650.0,8.528041,688694.8
std,12428610.0,30.29394,38411860.0
min,2595.0,1.0,1.0
25%,10098890.0,1.0,29.0
50%,21546420.0,3.0,1125.0
75%,32717890.0,5.0,1125.0
max,40584020.0,3456.0,2147484000.0


In [0]:
summarizeData(calendar_df)

Number of Variables: 7

Number of Observations: 18470156

Data types:
object     4
float64    2
int64      1
dtype: int64

Column Names by Type:

(dtype('int64'), Index(['listing_id'], dtype='object'))
(dtype('float64'), Index(['minimum_nights', 'maximum_nights'], dtype='object'))
(dtype('O'), Index(['date', 'available', 'price', 'adjusted_price'], dtype='object'))


### Sample Observation

In [0]:
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,19812,2019-12-05,f,$105.00,$105.00,3.0,90.0
1,2595,2019-12-06,f,$175.00,$175.00,10.0,1125.0
2,2595,2019-12-07,f,$175.00,$175.00,10.0,1125.0
3,2595,2019-12-08,f,$175.00,$175.00,10.0,1125.0
4,2595,2019-12-09,f,$175.00,$175.00,10.0,1125.0


## File: neighbourhoods.csv
> Neighbourhood list for geo filter. Sourced from city or open source GIS files.

In [9]:
neighbourhoods_df.describe()

Unnamed: 0,neighbourhood_group,neighbourhood
count,230,230
unique,5,230
top,Queens,Pelham Gardens
freq,53,1


In [0]:
summarizeData(neighbourhoods_df)

Number of Variables: 2

Number of Observations: 230

Data types:
object    2
dtype: int64

Column Names by Type:

(dtype('O'), Index(['neighbourhood_group', 'neighbourhood'], dtype='object'))


### Sample Observation

In [0]:
neighbourhoods_df.head()

Unnamed: 0,neighbourhood_group,neighbourhood
0,Bronx,Allerton
1,Bronx,Baychester
2,Bronx,Belmont
3,Bronx,Bronxdale
4,Bronx,Castle Hill


## File: neighbourhoods.geojson
> GeoJSON file of neighbourhoods of the city

In [0]:
geo.keys()

dict_keys(['type', 'features'])

In [0]:
geo['type']

'FeatureCollection'

In [0]:
print("There are {:d} feature sets in the collection".format(len(geo['features'])))

There are 233 feature sets in the collection


### Sample Observation

In [0]:
# inspect first feature set
geo['features'][0]

{'geometry': {'coordinates': [[[[-73.766708, 40.614911],
     [-73.768253, 40.614878],
     [-73.773977, 40.616004],
     [-73.773631, 40.616327],
     [-73.768735, 40.620901],
     [-73.767459, 40.620511],
     [-73.766978, 40.616919],
     [-73.766708, 40.614911]]]],
  'type': 'MultiPolygon'},
 'properties': {'neighbourhood': 'Bayswater', 'neighbourhood_group': 'Queens'},
 'type': 'Feature'}

Each feature set contains latitude and longitude coordinates specific to neighbourhood and neighbourhood group.

In [0]:
# all neighbourhood groups
neighbourhoodgroups = set([f['properties']['neighbourhood_group'] for f in geo['features']])
print(neighbourhoodgroups)

{'Manhattan', 'Brooklyn', 'Bronx', 'Staten Island', 'Queens'}


In [0]:
# all neighbourhoods by neighbourhood group
for group in neighbourhoodgroups:
    # neighbourhoods respective to neighbourhood group
    nnames = [f['properties']['neighbourhood'] for f in geo['features'] if f['properties']['neighbourhood_group'] == group]
    print('\n',"group:",group)
    print(" neighbourhoods:{:d}".format(len(nnames)),'\n')
    print(nnames)


 group: Manhattan
 neighbourhoods:32 

['Battery Park City', 'Chinatown', 'Chelsea', 'Civic Center', 'East Harlem', 'East Village', 'Financial District', 'Flatiron District', 'Gramercy', 'Greenwich Village', 'Harlem', "Hell's Kitchen", 'Inwood', 'Kips Bay', 'Little Italy', 'Murray Hill', 'Lower East Side', 'Marble Hill', 'Midtown', 'Morningside Heights', 'NoHo', 'Nolita', 'SoHo', 'Roosevelt Island', 'Stuyvesant Town', 'Theater District', 'Tribeca', 'Two Bridges', 'Upper East Side', 'Upper West Side', 'Washington Heights', 'West Village']

 group: Brooklyn
 neighbourhoods:48 

['Bay Ridge', 'Gerritsen Beach', 'Bath Beach', 'Bedford-Stuyvesant', 'Bensonhurst', 'Bergen Beach', 'Boerum Hill', 'Borough Park', 'Brighton Beach', 'Prospect-Lefferts Gardens', 'Brooklyn Heights', 'Brownsville', 'Bushwick', 'Cypress Hills', 'Canarsie', 'Carroll Gardens', 'Clinton Hill', 'Cobble Hill', 'Crown Heights', 'Columbia St', 'Coney Island', 'DUMBO', 'Downtown Brooklyn', 'Dyker Heights', 'East Flatbush', 

# How neighborhood affects rental price?



In [4]:
listings_summary_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,10,48,2019-11-04,0.39,1,1
1,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,295,2019-11-22,4.67,1,1
2,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,78,2019-10-13,0.6,1,19
3,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,29,49,2017-10-05,0.38,1,365
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,454,2019-11-21,3.52,1,242


In [53]:
listings_neighbourhood_df = listings_summary_df[['price','neighbourhood','neighbourhood_group']]
listings_neighbourhood_df['zipcode'] = listings_detail_df['zipcode']
listings_neighbourhood_df['neighbourhood_normalized'] = listings_neighbourhood_df['neighbourhood'].factorize()[0]
listings_neighbourhood_df['neighbourhood_group_normalized'] = listings_neighbourhood_df['neighbourhood_group'].factorize()[0]
listings_neighbourhood_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,price,neighbourhood,neighbourhood_group,zipcode,neighbourhood_normalized,neighbourhood_group_normalized
0,225,Midtown,Manhattan,10018,0,0
1,89,Clinton Hill,Brooklyn,11238,1,1
2,200,Murray Hill,Manhattan,10016,2,0
3,60,Bedford-Stuyvesant,Brooklyn,11216,3,1
4,79,Hell's Kitchen,Manhattan,10019,4,0


In [0]:
prizm_zipcode_df = prizm_zipcode_df[prizm_zipcode_df.isna().any(axis=1) == False]

In [93]:
prizm_attributes_df.loc[prizm_attributes_df['id']==1]['score'][0]

5

In [96]:
type(prizm_zipcode_df['attribute_id'][0])

str

In [157]:
# Calculate the averaged prizm score by zipcode
prizm_zipcode_df['average_score'] = -1
for i, row in prizm_zipcode_df.iterrows():
  attribute_id_arr = row['attribute_id'].split(',')
#  print(attribute_id_arr.split(','))
  sum = 0
  count = len(attribute_id_arr)
  for id in attribute_id_arr:
#    print('id: ', type(int(id)))
    match_row = prizm_attributes_df.loc[prizm_attributes_df['id']==int(id)]
    row_i = match_row.index[0]
#    print(row_i)
#   print(match_row['score'][row_i])
#    print(prizm_attributes_df.loc[prizm_attributes_df['id']==int(id)].index)
#    print(type(prizm_attributes_df.loc[prizm_attributes_df['id']==int(id)]['score']))
    sum += match_row['score'][row_i]
#  print('sum: ', sum, ' count: ', count)
  prizm_zipcode_df.loc[i, 'average_score'] = sum / count


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [191]:
listings_neighbourhood_df[listings_neighbourhood_df.isna().any(axis=1)]
listings_neighbourhood_df[listings_neighbourhood_df.isna().any(axis=1) == False]

Unnamed: 0,price,neighbourhood,neighbourhood_group,zipcode,neighbourhood_normalized,neighbourhood_group_normalized,upper,upper_middle,lower_middle,lower
0,225,Midtown,Manhattan,10018,0,0,0,0,0,0
1,89,Clinton Hill,Brooklyn,11238,1,1,0,0,0,0
2,200,Murray Hill,Manhattan,10016,2,0,0,0,0,0
3,60,Bedford-Stuyvesant,Brooklyn,11216,3,1,0,0,0,0
4,79,Hell's Kitchen,Manhattan,10019,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
50594,200,Fordham,Bronx,10458,137,4,0,0,0,0
50595,150,Ditmars Steinway,Queens,11105,50,2,0,0,0,0
50596,225,Fordham,Bronx,10458,137,4,0,0,0,0
50597,70,Bedford-Stuyvesant,Brooklyn,11206,3,1,0,0,0,0


In [190]:
zipcode

[SimpleZipcode(zipcode='11201', zipcode_type='Standard', major_city='Brooklyn', post_office_city='Brooklyn, NY', common_city_list=['Brooklyn'], county='Kings County', state='NY', lat=40.7, lng=-73.99, timezone='Eastern', radius_in_miles=1.0, area_code_list=['347', '718', '929'], population=51128, population_density=35981.0, land_area_in_sqmi=1.42, water_area_in_sqmi=0.01, housing_units=26390, occupied_housing_units=23325, median_home_value=669000, median_household_income=95369, bounds_west=-74.008117, bounds_east=-73.973126, bounds_north=40.705769, bounds_south=40.683319),
 SimpleZipcode(zipcode='11203', zipcode_type='Standard', major_city='Brooklyn', post_office_city='Brooklyn, NY', common_city_list=['Brooklyn'], county='Kings County', state='NY', lat=40.65, lng=-73.93, timezone='Eastern', radius_in_miles=1.0, area_code_list=['718'], population=76174, population_density=35502.0, land_area_in_sqmi=2.15, water_area_in_sqmi=0.0, housing_units=28087, occupied_housing_units=26420, median_h

In [187]:
prizm_zipcode_df.loc[prizm_zipcode_df['zipcode'] == 14075]

Unnamed: 0,zipcode,attribute_id,average_score


In [164]:
listings_neighbourhood_df['upper'] = 0
listings_neighbourhood_df['upper_middle'] = 0
listings_neighbourhood_df['lower_middle'] = 0
listings_neighbourhood_df['lower'] = 0

for i, row in listings_neighbourhood_df.iterrows():
  match_row = prizm_zipcode_df.loc[prizm_zipcode_df['zipcode']==int(row['zipcode'])]
  print(match_row)

    zipcode  attribute_id  average_score
46    10018  31,17,4,40,7            3.6
   zipcode   attribute_id  average_score
8    11238  31,17,40,4,21            3.6
    zipcode   attribute_id  average_score
13    10016  31,17,4,40,21            3.6
   zipcode    attribute_id  average_score
3    11216  40,17,31,21,63            3.0
   zipcode  attribute_id  average_score
4    10019  17,31,4,40,7            3.6
    zipcode  attribute_id  average_score
11    10025  17,7,40,31,4            3.6
   zipcode   attribute_id  average_score
5    10002  63,17,31,43,7            3.6
    zipcode   attribute_id  average_score
12    10036  17,31,40,4,21            3.6
    zipcode  attribute_id  average_score
15    11215  4,17,31,7,21            4.2
    zipcode  attribute_id  average_score
21    10014  17,31,4,7,21            4.2
   zipcode   attribute_id  average_score
0    11211  31,17,40,56,4            3.2
    zipcode   attribute_id  average_score
17    11205  31,17,63,4,40            3.0
    zipcod

ValueError: ignored

In [14]:
#from pandas import factorize
listings_neighbourhood_df['neighbourhood_normalized'] = listings_neighbourhood_df['neighbourhood'].factorize()[0]
#listings_neighbourhood_df['neighbourhood'].factorize()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
listings_neighbourhood_df['neighbourhood_group_normalized'] = listings_neighbourhood_df['neighbourhood_group'].factorize()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
def set_price_level(x):
    if 69 >= x:
      return 'low'
    elif 105 >= x:
      return 'low medium' 
    elif 175 >= x:
      return 'high medium'
    else:
      return 'high'

listings_neighbourhood_df['price_normalized'] = listings_neighbourhood_df['price'].apply(set_price_level)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [40]:
import numpy as np
listings_neighbourhood_df.describe(include=[np.object])

Unnamed: 0,neighbourhood,neighbourhood_group,price_normalized
count,50599,50599,50599
unique,223,5,4
top,Bedford-Stuyvesant,Manhattan,low
freq,3974,22070,12911


In [42]:
listings_neighbourhood_df.neighbourhood.unique()

array(['Midtown', 'Clinton Hill', 'Murray Hill', 'Bedford-Stuyvesant',
       "Hell's Kitchen", 'Upper West Side', 'Chinatown', 'South Slope',
       'West Village', 'Williamsburg', 'Fort Greene', 'Chelsea',
       'Crown Heights', 'East Harlem', 'Park Slope', 'Windsor Terrace',
       'Inwood', 'East Village', 'Harlem', 'Greenpoint', 'Bushwick',
       'Flatbush', 'Lower East Side', 'Prospect-Lefferts Gardens',
       'Long Island City', 'Kips Bay', 'SoHo', 'Upper East Side',
       'Red Hook', 'Prospect Heights', 'Washington Heights', 'Woodside',
       'Brooklyn Heights', 'Carroll Gardens', 'Gowanus', 'Flatlands',
       'Cobble Hill', 'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO',
       'St. George', 'Tribeca', 'Highbridge', 'NoHo', 'Ridgewood',
       'Port Morris', 'Morningside Heights', 'Jamaica', 'Middle Village',
       'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
       'Greenwich Village', 'Little Italy', 'Tompkinsville', 'Astoria',
       'Kensington', 'C

In [0]:
from sklearn import linear_model

x = listings_neighbourhood_df[['price']]
y = listings_neighbourhood_df[['neighbourhood_normalized', 'neighbourhood_group_normalized']]

lm = linear_model.LinearRegression()
model = lm.fit(x,y)

#chi square test
#anova analysis variance test

In [32]:
model.coef_

array([[-0.00421108],
       [-0.00029717]])