In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import statistics
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pylab
from statsmodels.compat import lzip
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Airbnb_Data.csv')

# data overview 


In [4]:
def data_overview(df):
    print('overview of data')
    print('------------------')
    display(df.head())
    print("the shape of the dataset")
    print('--------------------')
    display(df.head())
    print("statistical summary")
    print("----------------------------------------------------------------------------")
    display(df.describe().T)
    print("\n")

    print("information of features")
    print("----------------------------------------------------------------------------")
    display(df.info())
    print("\n")

    print("missing values")
    print("----------------------------------------------------------------------------")
    display(df.isnull().sum())
    print("\n")

    print("checking for duplicates")
    print("----------------------------------------------------------------------------")
    print(f"number of dupliacte rows: {df.duplicated().sum()}")
    print("\n")

    object_columns = df.select_dtypes(include='object').columns #identify columns with object data type
    for column in object_columns:
        print("value counts for " + column)
        print("----------------------------------------------------------------------------")
        print(df[column].value_counts())
        print("\n")

In [5]:
data_overview(df)

overview of data
------------------


Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


the shape of the dataset
--------------------


Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


statistical summary
----------------------------------------------------------------------------


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,74111.0,11266620.0,6081735.0,344.0,6261964.0,12254150.0,16402260.0,21230900.0
log_price,74111.0,4.782069,0.7173938,0.0,4.317488,4.70953,5.220356,7.600402
accommodates,74111.0,3.155146,2.153589,1.0,2.0,2.0,4.0,16.0
bathrooms,73911.0,1.235263,0.5820441,0.0,1.0,1.0,1.0,8.0
latitude,74111.0,38.44596,3.080167,33.338905,34.12791,40.66214,40.7461,42.39044
longitude,74111.0,-92.39753,21.70532,-122.5115,-118.3424,-76.99696,-73.95466,-70.98505
number_of_reviews,74111.0,20.90057,37.82864,0.0,1.0,6.0,23.0,605.0
review_scores_rating,57389.0,94.06736,7.836556,20.0,92.0,96.0,100.0,100.0
bedrooms,74020.0,1.265793,0.8521435,0.0,1.0,1.0,1.0,10.0
beds,73980.0,1.710868,1.254142,0.0,1.0,1.0,2.0,18.0




information of features
----------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  o

None



missing values
----------------------------------------------------------------------------


id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   200
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
description                   0
first_review              15864
host_has_profile_pic        188
host_identity_verified      188
host_response_rate        18299
host_since                  188
instant_bookable              0
last_review               15827
latitude                      0
longitude                     0
name                          0
neighbourhood              6872
number_of_reviews             0
review_scores_rating      16722
thumbnail_url              8216
zipcode                     966
bedrooms                     91
beds                        131
dtype: int64



checking for duplicates
----------------------------------------------------------------------------
number of dupliacte rows: 0


value counts for property_type
----------------------------------------------------------------------------
property_type
Apartment             49003
House                 16511
Condominium            2658
Townhouse              1692
Loft                   1244
Other                   607
Guesthouse              498
Bed & Breakfast         462
Bungalow                366
Villa                   179
Dorm                    142
Guest suite             123
Camper/RV                94
Timeshare                77
Cabin                    72
In-law                   71
Hostel                   70
Boutique hotel           69
Boat                     65
Serviced apartment       21
Tent                     18
Castle                   13
Vacation home            11
Yurt                      9
Hut                       8
Treehouse                 7
Chalet           

Key Observations

1. There are 74111 rows and 29 columns in the dataset of which some columns are not relevant for our model, we will remove them during pre-processing.
3. 'aminities' columns has dictionaries passed in it for amminities in each column, we will replace this dictionary with a count of amminities in each property.
3. data type for 10 columns is numeric, 18 columns object type and 1 column is boolean type, column 'host_response_rate' has data in percentage terms. However, due to '%' sign the datatype is object while the data is continuous so we will remove the '%' and convert it into numeric data as it makes more sense that ways.
4. There are significant number of missing values in the data frame which we will have to impute.
5. There are no duplicates in the data.
6. Columns 'property_type' and 'cancellation_propert are relavant for us, however, value count for these columns shows that 6.there are groups within these features whose value count is very low. Due to these groups our model might get unstable or there might be an issue of overfitting, to overcome this issue we will merge some groups for these features

# Data Pre-processing

In [6]:
airbnb_v1 = df.drop(['id', 'description','first_review', 'host_has_profile_pic','host_since', 'last_review', 'latitude', 'longitude', 'name', 'neighbourhood',
                         'thumbnail_url', 'zipcode'], axis = 1)
airbnb_v1.head()

Unnamed: 0,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,host_response_rate,instant_bookable,number_of_reviews,review_scores_rating,bedrooms,beds
0,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,NYC,t,,f,2,100.0,1.0,1.0
1,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,NYC,f,100%,t,6,93.0,3.0,3.0
2,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,NYC,t,100%,t,10,92.0,1.0,3.0
3,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,SF,t,,f,0,,2.0,2.0
4,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,DC,t,100%,t,4,40.0,0.0,1.0


Converting data type for 'host_response_rate' to numeric

In [7]:
airbnb_v1['host_response_rate']= airbnb_v1['host_response_rate'].str.rstrip('%').astype('float')/100
airbnb_v1['host_response_rate'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 74111 entries, 0 to 74110
Series name: host_response_rate
Non-Null Count  Dtype  
--------------  -----  
55812 non-null  float64
dtypes: float64(1)
memory usage: 579.1 KB


# Imputing missing values

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
SI = SimpleImputer(strategy = 'median')
SI.fit(airbnb_v1.select_dtypes(include = 'number'))
imputer = SI.transform(airbnb_v1.select_dtypes(include = 'number'))
airbnb_num = pd.DataFrame(imputer, columns = airbnb_v1.select_dtypes(include = 'number').columns)
airbnb_num.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,number_of_reviews,review_scores_rating,bedrooms,beds
0,5.010635,3.0,1.0,1.0,2.0,100.0,1.0,1.0
1,5.129899,7.0,1.0,1.0,6.0,93.0,3.0,3.0
2,4.976734,5.0,1.0,1.0,10.0,92.0,1.0,3.0
3,6.620073,4.0,1.0,1.0,0.0,96.0,2.0,2.0
4,4.744932,2.0,1.0,1.0,4.0,40.0,0.0,1.0


In [10]:
airbnb_num.isnull().sum()

log_price               0
accommodates            0
bathrooms               0
host_response_rate      0
number_of_reviews       0
review_scores_rating    0
bedrooms                0
beds                    0
dtype: int64

In [11]:
SI = SimpleImputer(strategy = 'most_frequent')
SI.fit(airbnb_v1.select_dtypes(include = 'object'))
imputer = SI.transform(airbnb_v1.select_dtypes(include = 'object'))
airbnb_obj = pd.DataFrame(imputer, columns = airbnb_v1.select_dtypes(include = 'object').columns)
airbnb_obj.head()

Unnamed: 0,property_type,room_type,amenities,bed_type,cancellation_policy,city,host_identity_verified,instant_bookable
0,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,t,f
1,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,f,t
2,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",Real Bed,moderate,NYC,t,t
3,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",Real Bed,flexible,SF,t,f
4,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",Real Bed,moderate,DC,t,t


In [12]:
airbnb_obj.isnull().sum()

property_type             0
room_type                 0
amenities                 0
bed_type                  0
cancellation_policy       0
city                      0
host_identity_verified    0
instant_bookable          0
dtype: int64

In [13]:
# merging 'airbnb_obj' with 'airbnb_v2' and adding 'cleaning_fee' column
airbnb_v2 = pd.concat([airbnb_num, airbnb_obj], axis = 1)
airbnb_v2['cleaning_fee'] = airbnb_v1['cleaning_fee']
airbnb_v2.isnull().sum()

log_price                 0
accommodates              0
bathrooms                 0
host_response_rate        0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
property_type             0
room_type                 0
amenities                 0
bed_type                  0
cancellation_policy       0
city                      0
host_identity_verified    0
instant_bookable          0
cleaning_fee              0
dtype: int64

In [14]:
airbnb_v2.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,number_of_reviews,review_scores_rating,bedrooms,beds,property_type,room_type,amenities,bed_type,cancellation_policy,city,host_identity_verified,instant_bookable,cleaning_fee
0,5.010635,3.0,1.0,1.0,2.0,100.0,1.0,1.0,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,t,f,True
1,5.129899,7.0,1.0,1.0,6.0,93.0,3.0,3.0,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,f,t,True
2,4.976734,5.0,1.0,1.0,10.0,92.0,1.0,3.0,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",Real Bed,moderate,NYC,t,t,True
3,6.620073,4.0,1.0,1.0,0.0,96.0,2.0,2.0,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",Real Bed,flexible,SF,t,f,True
4,4.744932,2.0,1.0,1.0,4.0,40.0,0.0,1.0,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",Real Bed,moderate,DC,t,t,True


# count of aminities

In [16]:
aminities_count = []
for i in airbnb_v2['amenities']:
  aminities_count.append(len(i))

airbnb_v2['amenities'] = aminities_count
airbnb_v2.info()
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   log_price               74111 non-null  float64
 1   accommodates            74111 non-null  float64
 2   bathrooms               74111 non-null  float64
 3   host_response_rate      74111 non-null  float64
 4   number_of_reviews       74111 non-null  float64
 5   review_scores_rating    74111 non-null  float64
 6   bedrooms                74111 non-null  float64
 7   beds                    74111 non-null  float64
 8   property_type           74111 non-null  object 
 9   room_type               74111 non-null  object 
 10  amenities               74111 non-null  int64  
 11  bed_type                74111 non-null  object 
 12  cancellation_policy     74111 non-null  object 
 13  city                    74111 non-null  object 
 14  host_identity_verified  74111 non-null

# Merging groups

In [18]:

value_count = airbnb_v2['property_type'].value_counts()
other_values = value_count[value_count < 50].index
airbnb_v2['property_type'] = airbnb_v2['property_type'].replace(other_values, 'Other')
airbnb_v2['property_type'].value_counts()

property_type
Apartment          49003
House              16511
Condominium         2658
Townhouse           1692
Loft                1244
Other                715
Guesthouse           498
Bed & Breakfast      462
Bungalow             366
Villa                179
Dorm                 142
Guest suite          123
Camper/RV             94
Timeshare             77
Cabin                 72
In-law                71
Hostel                70
Boutique hotel        69
Boat                  65
Name: count, dtype: int64

In [19]:
airbnb_v2['cancellation_policy'] = airbnb_v2['cancellation_policy'].replace(['super_strict_30', 'super_strict_60'], 'super_strict')
airbnb_v2['cancellation_policy'].value_counts()

cancellation_policy
strict          32374
flexible        22545
moderate        19063
super_strict      129
Name: count, dtype: int64