# Santa Cruz: Data Cleaning 

In [1]:
# ! pip install langdetect

In [2]:
import pandas as pd
import ast
from langdetect import detect

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [3]:
listing = pd.read_csv('./data/Santa_Cruz_listings.csv')
review = pd.read_csv('./data/Santa_Cruz_reviews.csv')

In [4]:
print(f'Shape for Santa Cruz Listings CSV: {listing.shape}')
print(f'Shape for Santa Cruz Reviews CSV: {review.shape}')

Shape for Santa Cruz Listings CSV: (1255, 74)
Shape for Santa Cruz Reviews CSV: (95238, 6)


## Cleaning Review csv file

In [5]:
review.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,11879,77106,2010-08-11,174171,Kelly,"Very kind soul, accommodating, welcoming... ma..."
1,11879,4453240,2013-05-06,5769319,Kristen,Tired of slick/sterile hotel/motel rooms? Thi...
2,11879,14718383,2014-06-24,778621,Mona,The stay at Steve's house was awesome. Steve i...
3,11879,33286462,2015-05-26,5831887,Karen,Steve is a very nice guy and the place was ver...
4,11879,35873459,2015-06-22,26993254,Alice,"enjoyed my stay in aptos, loved the beach and..."


Dropping unnecessary columns.

In [6]:
review.drop(columns = ['id',
                       'reviewer_id',
                       'reviewer_name'],
            inplace = True)

Dropping all rows with null values since they contain no content.

In [7]:
review.isnull().sum()

listing_id     0
date           0
comments      37
dtype: int64

In [8]:
review = review.dropna()

#### Minimizing the dataframe

Removing all reviews that are not from 2019 to minimize the data.

In [9]:
review = review[(review['date'] >= '2018-01-01') & (review['date'] <= '2019-12-31')]

Remove all reviews that are less than 5 words.

In [10]:
review = review[review['comments'].str.count(' ') > 4]

Remove all reviews that are not in English.

In [11]:
review.drop([i for i in review[review['comments'].apply(detect) != 'en'].index], inplace=True)

Remove '\n' since this is just an indicator for a line break.

In [12]:
review['comments'] = review['comments'].str.replace('\n', '')

Removing any numbers from the comments.

In [13]:
review['comments'] = review['comments'].replace('\d+', '', regex=True)

Removing reviews written in Asian languages.

In [14]:
review.drop([i for i in review[review['comments'].str.contains(r'[^\x00-\x7F]+') == True].index], inplace = True)

Removing rows where the comments were generated by AirBnb due to the host cancelling a booked reservation.

In [15]:
review.drop([i for i in review[review['comments'].str.contains('This is an automated posting') == True].index], inplace = True)

## Cleaning Listing csv file

In [16]:
listing.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,11879,https://www.airbnb.com/rooms/11879,20210227173810,2021-03-01,Sunny room close to beach and parks,Sunny room in a cozy home with a 40 acre park ...,,https://a0.muscache.com/pictures/5acb8501-4855...,44764,https://www.airbnb.com/users/show/44764,Steven,2009-10-09,"Aptos, California, United States","Easygoing, environmentalist, musician/educator...",within a few hours,92%,69%,t,https://z0.muscache.cn/im/pictures/user/f62740...,https://z0.muscache.cn/im/pictures/user/f62740...,Santa Cruz,4,4,"['email', 'phone', 'facebook', 'reviews']",t,f,,Unincorporated Areas,,36.98186,-121.88114,Private room in house,Private room,2,,1 shared bath,1.0,1.0,"[""Extra pillows and blankets"", ""Free street pa...",$75.00,2,30,2,2,30,30,2.0,30.0,,t,20,50,80,169,2021-03-01,60,14,3,2010-08-11,2021-02-24,97.0,10.0,9.0,10.0,10.0,10.0,10.0,,f,3,2,1,0,0.47
1,24548,https://www.airbnb.com/rooms/24548,20210227173810,2021-02-28,Room with Private Entrance.,Bedroom and bath with private entrance in home...,Great location close to Santa Cruz Harbor and ...,https://a0.muscache.com/pictures/cf8d43fd-90c1...,99532,https://www.airbnb.com/users/show/99532,Kerstin,2010-03-26,"Santa Cruz, California, United States",Easy going and friendly I like to think.\r\n\r...,within an hour,100%,93%,t,https://a0.muscache.com/im/pictures/user/ff3b6...,https://a0.muscache.com/im/pictures/user/ff3b6...,Santa Cruz,1,1,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,f,"Santa Cruz, California, United States",City of Santa Cruz,,36.97167,-121.99774,Private room in house,Private room,2,,1 private bath,1.0,1.0,"[""Dedicated workspace"", ""Carbon monoxide alarm...",$100.00,2,5,2,2,5,5,2.0,5.0,,t,0,0,0,0,2021-02-28,479,20,0,2010-04-24,2020-11-23,99.0,10.0,10.0,10.0,10.0,10.0,10.0,,f,1,0,1,0,3.63
2,31721,https://www.airbnb.com/rooms/31721,20210227173810,2021-03-01,Dog Friendly Private Pleasure Point Beach Cottage,*LOCATION LOCATION LOCATION* <br />Welcome to ...,I love everything about this neighborhood. I ...,https://a0.muscache.com/pictures/73aa203a-cb53...,136376,https://www.airbnb.com/users/show/136376,Annie,2010-06-01,"Santa Cruz, California, United States",Hey Airbnb folks thanks for taking the time to...,within a few hours,97%,75%,f,https://a0.muscache.com/im/users/136376/profil...,https://a0.muscache.com/im/users/136376/profil...,Santa Cruz,2,2,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,f,"Santa Cruz, California, United States",City of Capitola,,36.95849,-121.97207,Entire cottage,Entire home/apt,3,,1 bath,1.0,1.0,"[""Dedicated workspace"", ""Carbon monoxide alarm...",$179.00,4,90,2,4,90,90,4.0,90.0,,t,1,1,1,1,2021-03-01,222,40,1,2012-11-26,2021-01-31,95.0,10.0,9.0,10.0,10.0,10.0,9.0,,f,2,2,0,0,2.21
3,43785,https://www.airbnb.com/rooms/43785,20210227173810,2021-03-01,Guest bedroom in Victorian home,"<b>The space</b><br />A redwood-floored, sun-f...","The West side of Santa Cruz is an extension, u...",https://a0.muscache.com/pictures/305117/1a131d...,191477,https://www.airbnb.com/users/show/191477,Caroline,2010-08-04,"Santa Cruz, California, United States","I am a phonetician, speech technologist, and l...",within an hour,100%,53%,t,https://a0.muscache.com/im/users/191477/profil...,https://a0.muscache.com/im/users/191477/profil...,Santa Cruz,1,1,"['email', 'phone', 'reviews', 'offline_governm...",t,t,"Santa Cruz, California, United States",City of Santa Cruz,,36.97694,-122.036,Private room in house,Private room,2,,1 private bath,1.0,2.0,"[""Extra pillows and blankets"", ""Free street pa...",$85.00,2,30,2,2,30,30,2.0,30.0,,t,2,32,62,332,2021-03-01,446,14,1,2010-08-30,2021-02-07,97.0,10.0,10.0,10.0,10.0,10.0,10.0,,f,1,0,1,0,3.49
4,49520,https://www.airbnb.com/rooms/49520,20210227173810,2021-03-01,Guest Cottage with shared bath,30 day minimum<br />Tiny garden cottage with a...,,https://a0.muscache.com/pictures/892035/5dffd6...,225721,https://www.airbnb.com/users/show/225721,Christine,2010-09-06,"Santa Cruz, California, United States",I am an over employed PA in love with the coun...,within a few hours,100%,14%,f,https://a0.muscache.com/im/users/225721/profil...,https://a0.muscache.com/im/users/225721/profil...,,2,2,"['email', 'phone', 'facebook', 'reviews']",t,f,,Unincorporated Areas,,36.96325,-121.82091,Private room in apartment,Private room,2,,,1.0,1.0,"[""Washer"", ""Heating"", ""Microwave"", ""Dishes and...",$95.00,30,30,30,30,30,30,30.0,30.0,,t,0,0,0,0,2021-03-01,145,0,0,2011-06-28,2018-09-15,94.0,10.0,9.0,10.0,10.0,9.0,9.0,,f,2,0,2,0,1.23


In [17]:
listing.isnull().sum().sum()

8239

In [18]:
listing.isnull().sum()

id                                                 0
listing_url                                        0
scrape_id                                          0
last_scraped                                       0
name                                               0
description                                       16
neighborhood_overview                            308
picture_url                                        0
host_id                                            0
host_url                                           0
host_name                                          0
host_since                                         0
host_location                                      1
host_about                                       364
host_response_time                               111
host_response_rate                               111
host_acceptance_rate                              69
host_is_superhost                                  0
host_thumbnail_url                            

### Dropping Columns

Columns with no relevant information

In [19]:
listing.drop(columns = ['last_scraped',                         
                        'license',
                        'host_id',
                        'scrape_id',                            
                        'listing_url',                          
                        'picture_url',                          
                        'host_url',                             
                        'host_thumbnail_url',                   
                        'host_picture_url',                     
                        'host_name',                            
                        'host_verifications',
                        'calendar_last_scraped',
                        'host_neighbourhood',
                        'host_location',
                        'host_response_rate',
                        'availability_30',
                        'availability_60',
                        'availability_90',
                        'availability_365',
                        'number_of_reviews_ltm',
                        'number_of_reviews_l30d',
                        'calculated_host_listings_count',
                        'calculated_host_listings_count_entire_homes',
                        'calculated_host_listings_count_private_rooms',
                        'calculated_host_listings_count_shared_rooms'],
            inplace = True)

All values are nulls

In [20]:
listing.drop(columns = ['calendar_updated',                     
                        'neighbourhood_group_cleansed',         
                        'bathrooms'],
            inplace = True)

Repeat values from other columns

In [21]:
listing.drop(columns = ['minimum_minimum_nights',               
                        'maximum_minimum_nights',               
                        'minimum_maximum_nights',              
                        'maximum_maximum_nights',               
                        'minimum_nights_avg_ntm',               
                        'maximum_nights_avg_ntm',               
                        'neighbourhood',
                        'host_total_listings_count',
                        'beds',
                        'room_type'],                       
            inplace = True)

Almost all values are the same

In [22]:
listing.drop(columns = ['host_has_profile_pic',
                        'has_availability'], 
            inplace = True)

Dropping column due to multicollinearity

In [23]:
listing.drop(columns = ['host_identity_verified',
                        'host_is_superhost'],
             inplace = True)

### Dropping Nulls

The rows with no description are also missing data for many other rows.

In [24]:
listing.dropna(subset=['description'], inplace = True)

Since these rows have no values for these columns, it is implied that they had no reviews.

In [25]:
listing.dropna(subset=['first_review',
                       'last_review',
                       'review_scores_rating',
                       'review_scores_accuracy',
                       'review_scores_cleanliness',
                       'review_scores_checkin',
                       'review_scores_communication',
                       'review_scores_location',
                       'review_scores_value'],
               inplace = True)

### Imputing Nulls

Filling the nulls with 'No Content' because there are 308 nulls for 'neighborhood_overview' and 364 nulls for 'host_about'.

In [26]:
listing['neighborhood_overview'].fillna('No Content', inplace = True)
listing['host_about'].fillna('No Content', inplace = True)

Filling the null values for 'host_response_time' with 'within a few hours'.

In [27]:
listing['host_response_time'].value_counts()

within an hour        661
within a few hours    208
within a day          145
a few days or more     18
Name: host_response_time, dtype: int64

In [28]:
listing['host_response_time'].fillna('within a few hours', inplace = True)

Filling in the 2 nulls for 'bathroom_text' with '1 bath' since it is for a listing of a private room only, it most likely has only 1 bathroom available.

In [29]:
listing[listing['bathrooms_text'].isnull()]

Unnamed: 0,id,name,description,neighborhood_overview,host_since,host_about,host_response_time,host_acceptance_rate,host_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,accommodates,bathrooms_text,bedrooms,amenities,price,minimum_nights,maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month
4,49520,Guest Cottage with shared bath,30 day minimum<br />Tiny garden cottage with a...,No Content,2010-09-06,I am an over employed PA in love with the coun...,within a few hours,14%,2,Unincorporated Areas,36.96325,-121.82091,Private room in apartment,2,,1.0,"[""Washer"", ""Heating"", ""Microwave"", ""Dishes and...",$95.00,30,30,145,2011-06-28,2018-09-15,94.0,10.0,9.0,10.0,10.0,9.0,9.0,f,1.23
5,49523,Strict 30 night minimum! Small room w shared b...,THIRTY DAY MINIMUM!!!!! The small room is perf...,The property is near the top of a low ridge ne...,2010-09-06,I am an over employed PA in love with the coun...,within a few hours,14%,2,Unincorporated Areas,36.96319,-121.82195,Private room in house,2,,1.0,"[""Washer"", ""Heating"", ""Hair dryer"", ""Dedicated...",$75.00,30,30,209,2011-02-21,2019-10-26,97.0,10.0,9.0,10.0,10.0,10.0,10.0,f,1.71


In [30]:
listing['bathrooms_text'].fillna('1 bath', inplace = True)

Filling the null values for 'host_acceptance_rate' with the mean value. First, converting the values from percentages into floats.

In [31]:
listing['host_acceptance_rate'] = listing['host_acceptance_rate'].str.replace('%', '').astype('float')/100.0

In [32]:
avg_acceptance_rate = listing['host_acceptance_rate'].value_counts().mean()

In [33]:
listing['host_acceptance_rate'].fillna(avg_acceptance_rate, inplace = True)

Filling the null values for 'bedrooms' with '1.0' since that is the most frequent value.

In [34]:
listing['bedrooms'].value_counts()

1.0     429
2.0     257
3.0     212
4.0      92
5.0      16
6.0       4
7.0       2
9.0       1
10.0      1
Name: bedrooms, dtype: int64

In [35]:
listing['bedrooms'].fillna('1.0', inplace = True)

## Save clean dataframes

Verify all nulls are dealt with.

In [36]:
listing.isnull().sum().sum()

0

In [37]:
review.isnull().sum().sum()

0

Checking the final amount of columns and rows for the final dataframe.

In [38]:
print(f'Shape for Santa Cruz Listings CSV: {listing.shape}')
print(f'Shape for Santa Cruz Reviews CSV: {review.shape}')

Shape for Santa Cruz Listings CSV: (1109, 32)
Shape for Santa Cruz Reviews CSV: (31372, 3)


Saving finalized dataframes as new CSV files.

In [39]:
listing.to_csv('./data/Santa_Cruz_Listings_Clean', index = False)
review.to_csv('./data/Santa_Cruz_Reviews_Clean', index = False)