## Part 1: Data Cleaning

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:
df = pd.read_csv('analysisData.csv', low_memory = False)

In [3]:
df.shape

(29142, 96)

In [4]:
df.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,20091785,https://www.airbnb.com/rooms/20091785,20180303203649,2018-03-05,"Cozy Stay in Queens, Easy Access to Manhattan","A perfect spot for an intern, student or out o...",,"A perfect spot for an intern, student or out o...",none,We're right next to the best Indian food in Qu...,...,f,,,t,f,flexible,f,f,1,0.42
1,3710661,https://www.airbnb.com/rooms/3710661,20180303203649,2018-03-04,Spacious room in comfortable apt.,"Nice size apt 15 minutes from central park, a...",This is a private room in a two bedroom apartm...,"Nice size apt 15 minutes from central park, a...",none,The apartment is conveniently located on the b...,...,f,,,t,f,moderate,f,f,3,2.33
2,15055244,https://www.airbnb.com/rooms/15055244,20180303203649,2018-03-04,Fresh Clean & Modern: Williamsburg at its Best,**This listing would normally be $160 per nigh...,Construction on this mid-sized building was co...,**This listing would normally be $160 per nigh...,none,Williamsburg is bursting with culture. Come se...,...,f,,,f,f,moderate,f,f,1,1.1


In [5]:
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

Preselect 54 columns.
`id` and `host_id` are kept for reference reasons.

In [6]:
preselect = df.drop(labels = ['listing_url', 'scrape_id', 'last_scraped','thumbnail_url', 'medium_url', 
                  'picture_url', 'xl_picture_url',
                  'host_url', 'host_name', 'host_neighbourhood', 'host_location','host_since',
                  'host_listings_count','host_total_listings_count','host_thumbnail_url','host_picture_url',
                  'neighbourhood','neighbourhood_group_cleansed','street', 'country','country_code',
                  'market','has_availability',
                  'smart_location','is_location_exact', 'calendar_updated','calendar_last_scraped',
                  'availability_60','availability_365','availability_90','maximum_nights',
                  'weekly_price','monthly_price','guests_included','extra_people',
                  'requires_license', 'first_review','last_review',
                  'license','requires_license','calculated_host_listings_count',
                  'jurisdiction_names','review_scores_value'], axis = 1)

preselect.head(2)


Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month
0,20091785,"Cozy Stay in Queens, Easy Access to Manhattan","A perfect spot for an intern, student or out o...",,"A perfect spot for an intern, student or out o...",none,We're right next to the best Indian food in Qu...,,"Parking is limited, but we're near the E/F/M/R...","You're welcome to chill in your room, cozy up ...",...,8,10,10,10,t,f,flexible,f,f,0.42
1,3710661,Spacious room in comfortable apt.,"Nice size apt 15 minutes from central park, a...",This is a private room in a two bedroom apartm...,"Nice size apt 15 minutes from central park, a...",none,The apartment is conveniently located on the b...,Apartment located on border UES and East H.N...,You can take the 6 train downtown and get to T...,Guests should take there shoes off at the fron...,...,9,10,10,8,t,f,moderate,f,f,2.33


- Continue to drop columns with all NaN values: `host_acceptance_rate` and `square_feet`
- Continue to drop columns with many inconsistent values: `city` and `state`

In [7]:
# drop columns 
preselect.drop(labels = ['host_acceptance_rate','square_feet','city','state'], axis = 1, inplace = True)

**Accommodates, Bedrooms, Beds, Price, Zipcode**

In [8]:
original = len(preselect)

# drop any inconsistent values for other numeric variables
print ('Number of Accommodates 0:', len(preselect[preselect['accommodates'] == 0]))
print ('Number of Bedrooms 0:', len(preselect[preselect['bedrooms'] == 0]))
print ('Number of Beds 0:', len(preselect[preselect['beds'] == 0]))
print ('Number of Listings with Price $0.00:', len(preselect[preselect['price'] == 0.00]))
preselect = preselect[preselect.accommodates != 0]
preselect = preselect[preselect.bedrooms != 0]
preselect = preselect[preselect.beds!= 0]
preselect = preselect[preselect.price != 0.00]

# drop rows with NaN values from zipcode, state, city, beds
preselect = preselect.dropna(how='any', subset=['zipcode', 'beds']).reset_index(drop=True)


# clean zipcode, drop inconsistent values
original = len(preselect)
length = []
for value in preselect.zipcode.values:
    length.append(len(value))
    
from collections import Counter 
print ( 'Zipcode Length common value and ferquency: ', Counter(length))

preselect = preselect[preselect.zipcode.astype(str).apply(len) == 5]

print  ('Number of NaN values removed:', original - len(preselect))

Number of Accommodates 0: 0
Number of Bedrooms 0: 2756
Number of Beds 0: 0
Number of Listings with Price $0.00: 25
Zipcode Length common value and ferquency:  Counter({5: 26032, 10: 2, 12: 1, 2: 1})
Number of NaN values removed: 4


**security_deposit, cleaning_fee** and **host_response_time**

In [9]:
#fill in NaN for security_deposit, cleaning_fee and host_response_time
preselect.fillna({'security_deposit':0 , 'cleaning_fee': 0, 'host_response_time':preselect.host_response_time.mode()[0]}, inplace=True)     

#### Convert  format

**host_response_rate** and **zipcode**

In [10]:
#convert response rate 
preselect.host_response_rate = preselect.host_response_rate.fillna('0%').str.replace('%','').astype(int)
preselect.host_response_rate = preselect.host_response_rate/100

#convert zipcode 
preselect.zipcode = preselect.zipcode.astype(object)

#### property_type

In [11]:
# convert property_type to fewer nlevels
print ("Property_type has {} unique values".format(preselect['property_type'].nunique()))

preselect.property_type = preselect.property_type.str.lower()

preselect.property_type = preselect.property_type.str.replace("aparthotel|bed and breakfast|boutique hotel|hostel|hotel|resort|timeshare", "hotel")
preselect.property_type = preselect.property_type.str.replace("apartment|vacation home|bungalow|cabin|casa particular \(cuba\)|chalet|condominium|other|castle|villa|guest suite|guesthouse|earth house|house|town house |loft|serviced apartment|townhouse","house")
preselect.property_type = preselect.property_type.str.replace("boat|camper/rv|cave|island|tent|train|treehouse|yurt","outdoor")
preselect.property_type = preselect.property_type.str.replace("in-law|dorm|tiny house","small_house")

print ("After formating, Property_type now has {} unique values".format(preselect['property_type'].nunique()))

Property_type has 33 unique values
After formating, Property_type now has 4 unique values


#### cancellation_policy

In [12]:
# convert cancel policy
print ("Cancellation_policy has {} unique values".format(preselect['cancellation_policy'].nunique()))

preselect.cancellation_policy = preselect.cancellation_policy.str.replace("super_strict_30|super_strict_60", "strict")

print ("After formating, Cancellation_policy now has {} unique values".format(preselect['cancellation_policy'].nunique()))

Cancellation_policy has 5 unique values
After formating, Cancellation_policy now has 3 unique values


**host_verifications**

In [13]:
# make a list of words extracted from host_verifications

#for loop method
'''
def clean_host_verifications():
    input: nested list
    output: word frequency table
    
    flattened_list = []
    for ele_list in preselect.host_verifications.str.replace("[","").str.replace("]",""):
        for ele in ele_list.split(','):
            ele = ele.replace("'","").strip()
            flattened_list.append(ele)
    
    c = Counter(flattened_list)
    #word_freq = c.most_common()
    word = list(c)
    #return pd.DataFrame(word_freq, columns = ['word','freq'])
    return word
hver_word_freq = clean_host_verifications()
hver_word_freq
'''

# using list comprehension to improve speed
hver_list = preselect.host_verifications.str.replace("[","").str.replace("]","").str.split(',')
hver_word = [ele.replace("'","").strip() for ele_list in hver_list for ele in ele_list]
hver_word_freq = list(Counter(hver_word))
hver_word_freq

['email',
 'phone',
 'facebook',
 'google',
 'reviews',
 'kba',
 'jumio',
 'work_email',
 'offline_government_id',
 'selfie',
 'government_id',
 'identity_manual',
 'linkedin',
 'manual_offline',
 'amex',
 'manual_online',
 'sent_id',
 'weibo',
 'sesame',
 'sesame_offline',
 'zhima_selfie',
 'photographer']

In [14]:
#clean host_verifications column
#create a dataframe to contain seperated features in host_verifications
def create_hver_df():
    hver_df = pd.DataFrame(index = range(len(preselect)), columns = ['hver_{}'.format(value) for value in hver_word_freq])

    for word in hver_word_freq:
        hver_df['hver_{}'.format(word)] = preselect.host_verifications.str.contains(word)

    return hver_df

hver_df = create_hver_df()
#concat new dataframe to preselect
#drop the orginal host_verifications clumn and concat new columns
preselect_no_hver = preselect.drop(['host_verifications'], axis = 1)
preselect = pd.concat([preselect_no_hver, hver_df], axis = 1)

In [15]:
preselect.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26036 entries, 0 to 26035
Data columns (total 71 columns):
id                                  26032 non-null float64
name                                26028 non-null object
summary                             25212 non-null object
space                               19777 non-null object
description                         26020 non-null object
experiences_offered                 26032 non-null object
neighborhood_overview               17292 non-null object
notes                               11482 non-null object
transit                             18204 non-null object
access                              17191 non-null object
interaction                         16602 non-null object
house_rules                         17396 non-null object
host_id                             26032 non-null float64
host_about                          17354 non-null object
host_response_time                  26032 non-null object
host_response_rate 

**amenities**

In [16]:
def extract_amenities_word():
    '''
    input: nested list
    output: word frequency table
    '''
    preselect.amenities.fillna("none", inplace = True)
    
    am_list = preselect.amenities.str.replace('"','').str.replace("{","").str.replace("}","").str.split(',')
    am_word = [ele.strip() for ele_list in am_list for ele in ele_list]
    am_word_freq = list(Counter(am_word))

    #remove unwanted values
    unwanted_words = {'', 'translation missing: en.hosting_amenity_49','translation missing: en.hosting_amenity_50',} 
    word = [element for element in word if element not in unwanted_words]
    #return pd.DataFrame(word_freq, columns = ['word','freq'])
    return word

am_word_freq = extract_amenities_word()


In [41]:
#clean amenities column
#create a dataframe to contain seperated features in amenities
def create_am_df():
    am_df = pd.DataFrame(index = range(len(preselect)), columns = ['am_{}'.format(value) for value in am_word_freq])

    for word in am_word_freq:
        am_df['am_{}'.format(word)] = preselect.amenities.astype(str).str.contains(word)

    return am_df

am_df = create_am_df()

#drop orginal clumn and concat new columns
preselect_no_am = preselect.drop(['amenities'], axis = 1)
preselect = pd.concat([preselect_no_am, am_df], axis = 1)

  import sys
  import sys
  import sys


In [48]:
preselect.shape

(26036, 188)