## Part 1: Data Cleaning

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [4]:
df = pd.read_csv('analysisData.csv', low_memory = False)

In [8]:
df.shape

(29142, 96)

In [6]:
df.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,20091785,https://www.airbnb.com/rooms/20091785,20180303203649,2018-03-05,"Cozy Stay in Queens, Easy Access to Manhattan","A perfect spot for an intern, student or out o...",,"A perfect spot for an intern, student or out o...",none,We're right next to the best Indian food in Qu...,...,f,,,t,f,flexible,f,f,1,0.42
1,3710661,https://www.airbnb.com/rooms/3710661,20180303203649,2018-03-04,Spacious room in comfortable apt.,"Nice size apt 15 minutes from central park, a...",This is a private room in a two bedroom apartm...,"Nice size apt 15 minutes from central park, a...",none,The apartment is conveniently located on the b...,...,f,,,t,f,moderate,f,f,3,2.33
2,15055244,https://www.airbnb.com/rooms/15055244,20180303203649,2018-03-04,Fresh Clean & Modern: Williamsburg at its Best,**This listing would normally be $160 per nigh...,Construction on this mid-sized building was co...,**This listing would normally be $160 per nigh...,none,Williamsburg is bursting with culture. Come se...,...,f,,,f,f,moderate,f,f,1,1.1


In [7]:
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

Preselect 54 columns.
`id` and `host_id` are kept for reference reasons.

In [434]:
preselect = df.drop(labels = ['listing_url', 'scrape_id', 'last_scraped','thumbnail_url', 'medium_url', 
                  'picture_url', 'xl_picture_url',
                  'host_url', 'host_name', 'host_neighbourhood', 'host_location','host_since',
                  'host_listings_count','host_total_listings_count','host_thumbnail_url','host_picture_url',
                  'neighbourhood','neighbourhood_group_cleansed','street', 'country','country_code',
                  'market','has_availability',
                  'smart_location','is_location_exact', 'calendar_updated','calendar_last_scraped',
                  'availability_60','availability_365','availability_90','maximum_nights',
                  'weekly_price','monthly_price','guests_included','extra_people',
                  'requires_license', 'first_review','last_review',
                  'license','requires_license','calculated_host_listings_count',
                  'jurisdiction_names','review_scores_value'], axis = 1)

preselect.head(2)


Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month
0,20091785,"Cozy Stay in Queens, Easy Access to Manhattan","A perfect spot for an intern, student or out o...",,"A perfect spot for an intern, student or out o...",none,We're right next to the best Indian food in Qu...,,"Parking is limited, but we're near the E/F/M/R...","You're welcome to chill in your room, cozy up ...",...,8,10,10,10,t,f,flexible,f,f,0.42
1,3710661,Spacious room in comfortable apt.,"Nice size apt 15 minutes from central park, a...",This is a private room in a two bedroom apartm...,"Nice size apt 15 minutes from central park, a...",none,The apartment is conveniently located on the b...,Apartment located on border UES and East H.N...,You can take the 6 train downtown and get to T...,Guests should take there shoes off at the fron...,...,9,10,10,8,t,f,moderate,f,f,2.33


In [435]:
preselect.columns

Index(['id', 'name', 'summary', 'space', 'description', 'experiences_offered',
       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
       'house_rules', 'host_id', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'zipcode', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price',
       'security_deposit', 'cleaning_fee', 'minimum_nights', 'availability_30',
       'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_pic

Continue to drop columns has all or most NaN values: `host_acceptance_rate` and `square_feet`

In [436]:
preselect.isnull().sum()/preselect.shape[0]

id                                  0.000000
name                                0.000172
summary                             0.029888
space                               0.250086
description                         0.000412
experiences_offered                 0.000000
neighborhood_overview               0.344520
notes                               0.567875
transit                             0.308387
access                              0.351074
interaction                         0.370531
house_rules                         0.340231
host_id                             0.000000
host_about                          0.341946
host_response_time                  0.274518
host_response_rate                  0.274518
host_acceptance_rate                1.000000
host_is_superhost                   0.000000
host_verifications                  0.000000
host_has_profile_pic                0.000000
host_identity_verified              0.000000
neighbourhood_cleansed              0.000000
city      

In [437]:
# drop columns with most missing values
preselect.drop(labels = ['host_acceptance_rate','square_feet'], axis = 1, inplace = True)

### Data Cleaning

#### Drop rows with inconsistent or missing values

In [438]:
original = len(preselect)

# drop any inconsistent values for other numeric variables
print ('Number of Accommodates 0:', len(preselect[preselect['accommodates'] == 0]))
print ('Number of Bedrooms 0:', len(preselect[preselect['bedrooms'] == 0]))
print ('Number of Beds 0:', len(preselect[preselect['beds'] == 0]))
print ('Number of Listings with Price $0.00:', len(preselect[preselect['price'] == 0.00]))
preselect = preselect[preselect.accommodates != 0]
preselect = preselect[preselect.bedrooms != 0]
preselect = preselect[preselect.beds!= 0]
preselect = preselect[preselect.price != 0.00]

# drop rows with NaN values from zipcode, state, city, beds
preselect = preselect.dropna(how='any', subset=['zipcode', 'state','city', 'beds'])


# clean zipcode, drop inconsistent values
original = len(preselect)
length = []
for value in preselect.zipcode.values:
    length.append(len(value))
    
from collections import Counter 
print ( 'Zipcode Length common value and ferquency: ', Counter(length))

preselect = preselect[preselect.zipcode.astype(str).apply(len) == 5]

print  ('Number of NaN values removed:', original - len(preselect))

Number of Accommodates 0: 0
Number of Bedrooms 0: 2756
Number of Beds 0: 0
Number of Listings with Price $0.00: 25
Zipcode Length common value and ferquency:  Counter({5: 25997, 10: 2, 12: 1, 2: 1})
Number of NaN values removed: 4


#### Fill in other missing values

In [439]:
#fill in NaN with columns mean for security_deposit and cleaning_fee
def fill_in_na(data, columns):
    for col in columns:
        data[col] = data[col].fillna(data[col].mean())

fill_in_na(data = preselect, columns = ['security_deposit','cleaning_fee'])

#fill in NaN with column mode for host_response_time
preselect.host_response_time.fillna(preselect.host_response_time.mode()[0], inplace = True)

#### Convert  format

In [440]:
#convert response rate 
preselect.host_response_rate = preselect.host_response_rate.fillna('0%').str.replace('%','').astype(int)
preselect.host_response_rate = preselect.host_response_rate/100

#convert zipcode 
preselect.zipcode = preselect.zipcode.astype(object)

In [441]:
# convert property_type to fewer nlevels
print ("Property_type has {} unique values".format(preselect['property_type'].nunique()))

preselect.property_type = preselect.property_type.str.lower()

preselect.property_type = preselect.property_type.str.replace("aparthotel|bed and breakfast|boutique hotel|hostel|hotel|resort|timeshare", "hotel")
preselect.property_type = preselect.property_type.str.replace("apartment|vacation home|bungalow|cabin|casa particular \(cuba\)|chalet|condominium|other|castle|villa|guest suite|guesthouse|earth house|house|town house |loft|serviced apartment|townhouse","house")
preselect.property_type = preselect.property_type.str.replace("boat|camper/rv|cave|island|tent|train|treehouse|yurt","outdoor")
preselect.property_type = preselect.property_type.str.replace("in-law|dorm|tiny house","small_house")

print ("After formating, Property_type now has {} unique values".format(preselect['property_type'].nunique()))

Property_type has 33 unique values
After formating, Property_type now has 4 unique values
