In [1]:
import numpy as np
import pandas as pd
from datetime import date, datetime
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('listings.csv', index_col='id')

In [3]:
df.drop(columns=['listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
                 'description', 'neighborhood_overview', 'picture_url', 'host_id',
                 'host_url', 'host_name', 'host_location', 'host_about',
                 'host_response_time', 'host_response_rate', 'host_acceptance_rate',
                 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 
                 'host_listings_count', 'host_total_listings_count', 'host_verifications',
                 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
                 'calendar_updated','first_review','last_review'], inplace=True)

Data Cleaning and Processing

In [4]:
df['price'] = df['price'].str.replace(r'\$|,', '', regex=True).astype(float)

In [5]:
df.loc[df['price'] == 0 ,'price'] = np.nan
df['price'] = df.groupby(['neighbourhood_group_cleansed','room_type'])['price'].transform(lambda x: x.fillna(x.median()))

In [6]:
df['bedrooms'] = df.groupby(['room_type'])['beds'].transform(lambda x: x.fillna(x.median()))

In [7]:
# get the bathroom values
df['bathrooms_cleaned'] = df['bathrooms_text'].str.replace(r'\s.*', '', regex=True)
df['bathrooms_cleaned'] = df['bathrooms_cleaned'].apply(lambda x: 0.5 if x in ('Half-bath', 'Shared','Private') else float(x)).astype(float)

In [9]:
values = {'host_is_superhost': 'f', 'has_availability':'f','instant_bookable':'f', 'reviews_per_month': 0, 'number_of_reviews':0,
          'number_of_reviews_ltm':0, 'number_of_reviews_l30d':0, 'bedrooms':1, 'bathrooms_cleaned': df['bathrooms_cleaned'].median()}
df.fillna(value=values, inplace=True)

df['host_is_superhost'] = df['host_is_superhost'].map({'t': 1, 'f': 0}).astype(int)
df['has_availability'] = df['has_availability'].map({'t': 1, 'f': 0}).astype(int)
df['instant_bookable'] = df['instant_bookable'].map({'t': 1, 'f': 0}).astype(int)

df = df.astype(
    {
        'neighbourhood_group_cleansed': 'category',
        'room_type' : 'category'
    }
)

In [10]:
# address skew in price feature with log transformation
df['price_log'] = np.log(df['price'])

# address skew in minimum nights with sqrt transformation
df['minimum_nights_sqrt'] = np.sqrt(df['minimum_nights'])

# address skew in with maximum nights log transformation
df['maximum_nights_long'] = np.log(df['maximum_nights'])

# address skew in minimum_minimum_nights with sqrt transformation
df['minimum_minimum_nights_sqrt'] = np.sqrt(df['minimum_minimum_nights'])

# address skew in minimum_maximum_nights with log transformation
df['minimum_maximum_nights_log'] = np.log(df['minimum_maximum_nights'])

# address skew in maximum_maximum_nights with log transformation
df['maximum_maximum_nights_log'] = np.log(df['maximum_maximum_nights'])

# address skew in maximum_nights_avg_ntm with log transformaiton
df['maximum_nights_avg_ntm_log'] = np.log(df['maximum_nights_avg_ntm'])

# address skew in number_of_reviews_ltm with sqrt transformation
df['number_of_reviews_ltm_sqrt'] = np.sqrt(df['number_of_reviews_ltm'])

# address skew in number_of_reviews_l30d with sqrt transformation
df['number_of_reviews_l30d_sqrt'] = np.sqrt(df['number_of_reviews_l30d'])

Feature Engineering

In [15]:
# find length of time for host
df['host_since'] = pd.to_datetime(df['host_since'])
df['today'] = pd.to_datetime(date.today())
df['host_length_years'] = (df['today'] - df['host_since']) / np.timedelta64(1, 'D')
df['host_length_years'] = round((df['host_length_years'] *  0.0027379),2).astype(float)

df.fillna(value={'host_length_years': df['host_length_years'].median()}, inplace=True)

df.drop(columns=['bathrooms','bathrooms_text','today','host_since'], inplace = True)

In [16]:
df_encoded = pd.get_dummies(df,columns=['neighbourhood_group_cleansed'], dtype=int)

In [17]:
df_encoded['amenities'] = df_encoded['amenities'].str.replace('"', '')
df_encoded['amenities'] = df_encoded['amenities'].str.replace('[', '')
df_encoded['amenities'] = df_encoded['amenities'].str.replace(']', '')
df_encoded['amenities'] = df_encoded['amenities'].str.replace(r'\\u.*','', regex = True)
df_encoded['amenities'] = df_encoded['amenities'].str.strip()
df_encoded['free_parking_on_premises'] = df_encoded['amenities'].str.contains(r'Free parking on premises|Free driveway parking.*', case=False, regex=True)
df_encoded['paid_parking_on_premises'] = df_encoded['amenities'].str.contains(r'PAID.*PARKING.*ON PREMISES|PAID PARKING GARAGE ON PREMISES|Paid parking lot on premises|Paid valet parking on premises', case=False, regex=True)
df_encoded['parking_off_premises'] = df_encoded['amenities'].str.contains(r'.*STREET PARKING|.*PARKING OFF PREMISES|PAID PARKING LOT OFF PREMISES|PAID PARKING GARAGE OFF PREMISES', case=False, regex=True)
df_encoded['washer'] = df_encoded['amenities'].str.contains(r'WASHER|FREE WASHER|PAID WASHER', case=False, regex=True)
df_encoded['dryer'] = df_encoded['amenities'].str.contains(r'DRYER|FREE DRYER|PAID DRYER', case=False, regex=True)
df_encoded['AC'] = df_encoded['amenities'].str.contains(r'Central air conditioning|AIR CONDITIONING|Window AC unit|AC - split type ductless system', case=False, regex=True)
df_encoded['heating'] = df_encoded['amenities'].str.contains(r'HEATING|CENTRAL HEATING|Radiant heating|Heating - split type ductless system', case=False, regex=True)
df_encoded['wifi'] = df_encoded['amenities'].str.contains(r'WIFI|.*WIFI', case=False, regex=True)
df_encoded['TV'] = df_encoded['amenities'].str.contains(r'TV|TV\s.*|.*HDTV.*', case=False, regex=True)
df_encoded['self_check_in'] = df_encoded['amenities'].str.contains('Self check-in')
df_encoded['gym'] = df_encoded['amenities'].str.contains(r'GYM|Shared gym in building|PRIVATE GYM IN BUILDING', case=False, regex=True)
df_encoded['pets_allowed'] = df_encoded['amenities'].str.contains('Pets allowed')
df_encoded['kitchen'] = df_encoded['amenities'].str.contains(r'KITCHEN|KITCHENETTE', case=False, regex=True)
df_encoded['patio_balcony'] = df_encoded['amenities'].str.contains(r'.*PATIO OR BALCONY', case=False, regex=True)
df_encoded['backyard'] = df_encoded['amenities'].str.contains(r'.*BACKYARD.*', case=False, regex=True)
df_encoded['pool'] = df_encoded['amenities'].str.contains('Pool')
df_encoded['luggage_dropoff_allowed'] = df_encoded['amenities'].str.contains('Luggage dropoff allowed')

Correlation Analysis

In [20]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

corrs = df_encoded.corr(numeric_only=True)
corrs.drop(columns=['latitude','longitude'],inplace=True)

targetCor = corrs.drop('price_log')['price_log']

targetCor.loc[targetCor.abs().sort_values(ascending= False).index]

price                                           0.558868
accommodates                                    0.546141
beds                                            0.425464
bedrooms                                        0.425166
neighbourhood_group_cleansed_Manhattan          0.349101
longitude                                      -0.282884
minimum_nights_sqrt                            -0.272586
bathrooms_cleaned                               0.252338
minimum_minimum_nights_sqrt                    -0.246253
kitchen                                        -0.245276
gym                                             0.222957
self_check_in                                   0.212926
calculated_host_listings_count_private_rooms   -0.197845
neighbourhood_group_cleansed_Queens            -0.190435
instant_bookable                                0.186727
availability_90                                 0.182397
availability_365                                0.179805
calculated_host_listings_count_

In [None]:
df.to_csv('Airbnb_listings_cleaned.csv',header=True)