## <span style="font-family: Arial; font-weight:bold;font-size:1.9em;color:#0e92ea">Assignment 2: AirBnB</span>

&nbsp;

<p align="center" style="font-family: Arial;color:#0e92ea;font-size:1em;">
Exploring pricing models that can effectively predict the rent for an accommodation and help hosts, travelers, and the business devise profitable strategies.
    
</p>

##  <span style="font-family: Arial; font-weight:bold;font-size:1.9em;color:#0e92ea"> Contents:</span>

<ol style="font-family: Arial;color:#0e92ea;font-size:1em;">
    <li>Overview</li>
    <li>Univariate Data Analysis</li>
    <li>Multivariate Data Analysis</li>
    <li>Feature Engineering and Outlier Treatment</li>
    <li>Model building and Evaluation</li>
    <li>Recommendations</li>
</ol>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder


# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used here; consider narrowing it (e.g. with the `category=` argument).
warnings.filterwarnings('ignore')                          # Ignore/Do not display warnings

# Load the dataset; the path is relative to the notebook's working directory.
main_data = pd.read_csv('AirBnB.csv')

# Show every column, and up to 500 rows, when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

# Apply seaborn's default theme. color_codes=False leaves matplotlib's shorthand
# colour codes ('b', 'g', 'r', ...) unmapped instead of remapping them to the
# seaborn palette.
sns.set(color_codes=False)
%matplotlib inline

# Hex colours shared across the notebook.
# NOTE(review): presumably consumed by plotting cells later in the notebook — confirm.
default_color_palette = ["#03A9F4", "#FF6F00", "#0288D1", "#D50000", "#7C4DFF"]
# Maps a summary-statistic name to a hex colour.
stats_colors = {'Mean':'#D50000', 'Mode':'#FF3D00', 'Median':'#2962FF'}

In [6]:
# Report the dataset dimensions; `shape` is (rows, columns).
print("Rows : {} x Columns : {}".format(*main_data.shape))

Rows : 74111 x Columns : 11


In [7]:
def info(dataFrame):
    """Display a per-column summary of missing values and data types.

    For every column of ``dataFrame`` the summary lists the number of null
    values, the number of non-null values, the percentage of nulls and the
    column dtype. The summary has one row per input column, sorted by column
    name, and the ``Nulls %`` cells are shaded on a light-to-red gradient.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The styled table is rendered with ``display``; nothing is returned.
    """
    missing_counts = dataFrame.isna().sum()
    present_counts = dataFrame.notnull().sum()
    # Nulls + non-nulls is the total number of rows, so this is the share of
    # missing entries per column, expressed in percent.
    missing_percent = (missing_counts * 100 / (missing_counts + present_counts)).astype(float)

    # (column label, per-column series) pairs, in the order they appear in the table.
    summary_parts = [
        ("Nulls", missing_counts),
        ("Non-Nulls", present_counts),
        ("Nulls %", missing_percent),
        ("Type", dataFrame.dtypes),
    ]
    summary = pd.concat(
        [series for _, series in summary_parts],
        axis=1,
        keys=[label for label, _ in summary_parts],
        sort=True,
    )

    red_gradient = sns.light_palette("red", as_cmap=True)
    styled_summary = (
        summary.style
        .background_gradient(cmap=red_gradient, subset=pd.IndexSlice[:, ["Nulls %"]])
        .format(formatter={"Nulls %": "{:.2f}%"})
    )
    display(styled_summary)

In [8]:
# Show null counts, null percentages and dtypes for every column of the loaded data.
info(main_data)

Unnamed: 0,Nulls,Non-Nulls,Nulls %,Type
accommodates,3,74108,0.00%,float64
bathrooms,203,73908,0.27%,float64
bedrooms,92,74019,0.12%,float64
beds,131,73980,0.18%,float64
cancellation_policy,8,74103,0.01%,object
cleaning_fee,4,74107,0.01%,object
id,0,74111,0.00%,int64
instant_bookable,0,74111,0.00%,object
log_price,0,74111,0.00%,float64
review_scores_rating,16722,57389,22.56%,float64


In [9]:
# Seed NumPy's global random generator so the sample below is the same on every run.
# This also fixes the random stream for any later code that draws from np.random.
np.random.seed(1)
# Peek at 10 randomly chosen rows.
main_data.sample(n=10)

Unnamed: 0,id,room_type,accommodates,bathrooms,cancellation_policy,cleaning_fee,instant_bookable,review_scores_rating,bedrooms,beds,log_price
56850,13586614,Entire home/apt,5.0,2.0,strict,False,f,100.0,2.0,3.0,7.408531
68140,18222981,Private room,3.0,1.0,strict,True,f,95.0,1.0,2.0,4.60517
23189,13915528,Entire home/apt,2.0,1.0,strict,True,f,95.0,1.0,2.0,4.094345
47255,7122067,Private room,1.0,1.0,flexible,False,f,100.0,1.0,1.0,4.317488
45933,12384924,Entire home/apt,5.0,2.0,strict,True,f,95.0,2.0,3.0,5.433722
66955,3999628,Private room,2.0,1.0,moderate,True,f,98.0,1.0,1.0,5.010635
18553,4948442,Entire home/apt,1.0,1.0,strict,False,f,,0.0,1.0,5.010635
26117,666837,Shared room,16.0,2.0,flexible,False,f,76.0,1.0,1.0,2.70805
65737,15929932,Entire home/apt,10.0,1.0,strict,False,f,93.0,2.0,2.0,4.976734
8915,21173263,Entire home/apt,3.0,1.0,strict,True,f,95.0,0.0,2.0,4.70048


In [10]:
# Print the distinct values of each categorical column between two banner lines.
# A single print() with sep="\n" emits exactly what one print() call per item would:
# every item followed by its own trailing newline plus the line terminator.
print(
    "=======================Distinct Categorial Values========================\n",
    f"Instant Bookable\t: {main_data.instant_bookable.unique()}\n",
    f"Cancellation Policy\t: {main_data.cancellation_policy.unique()}\n",
    f"Room Types\t: {main_data.room_type.unique()}\n",
    "=========================================================================\n",
    sep="\n",
)


Instant Bookable	: ['f' 't']

Cancellation Policy	: ['strict' 'moderate' 'flexible' nan]

Room Types	: ['Entire home/apt' 'Private room' 'Shared room' nan]




In [15]:
# Count rows that repeat an earlier row on every feature column.
# The `id` column is excluded explicitly (errors="ignore" makes this a no-op when it
# has already been dropped), so the count is the same on a fresh Restart & Run All
# as in the recorded output. With `id` included, rows carrying distinct ids
# (presumably all of them — confirm) could never be flagged as duplicates.
main_data.drop(columns="id", errors="ignore").duplicated().sum()

19994

In [22]:
# Number of distinct values per feature among rows that have at least one exact twin.
# `id` is excluded first (no-op if it is already gone) so the duplicate test compares
# feature values only, regardless of the order in which the cells were executed.
(
    main_data
    .drop(columns="id", errors="ignore")
    .loc[lambda features: features.duplicated(keep=False)]  # keep=False flags every member of a duplicate group
    .nunique(axis=0)
)

room_type                 3
accommodates             13
bathrooms                11
cancellation_policy       3
cleaning_fee              2
instant_bookable          2
review_scores_rating     30
bedrooms                  6
beds                      9
log_price               232
dtype: int64

In [18]:
# List every row that shares all of its feature values with at least one other row.
# `id` is excluded first (no-op if it is already gone) so the duplicate test compares
# feature values only, regardless of the order in which the cells were executed.
(
    main_data
    .drop(columns="id", errors="ignore")
    .loc[lambda features: features.duplicated(keep=False)]  # keep=False flags every member of a duplicate group
)

Unnamed: 0,room_type,accommodates,bathrooms,cancellation_policy,cleaning_fee,instant_bookable,review_scores_rating,bedrooms,beds,log_price
0,Entire home/apt,3.0,1.0,strict,True,f,100.0,1.0,1.0,5.010635
5,Private room,2.0,1.0,strict,True,t,100.0,1.0,1.0,4.442651
7,Entire home/apt,2.0,1.0,moderate,True,f,93.0,1.0,1.0,4.787492
8,Private room,2.0,1.0,moderate,True,f,99.0,1.0,1.0,4.787492
9,Private room,2.0,1.0,moderate,True,t,90.0,1.0,1.0,3.583519
...,...,...,...,...,...,...,...,...,...,...
74096,Private room,2.0,1.0,flexible,False,f,60.0,1.0,1.0,3.912023
74100,Private room,2.0,1.0,strict,True,f,93.0,1.0,1.0,4.605170
74103,Entire home/apt,2.0,1.0,moderate,True,f,98.0,1.0,1.0,5.135798
74105,Private room,2.0,1.0,moderate,True,f,91.0,1.0,1.0,4.248495


<p>Variation in the log price could be because of a number of factors/variables not captured in the data, such as location, proximity to attractions like a beach, special events, special amenities, high season, etc. Since we do not have that data in hand, it is safe to build a model that predicts the price from the given columns only, so dropping the duplicated rows makes sense.</p>

In [None]:
# Remove the `id` column from `main_data` in place.
# errors="ignore" makes the cell idempotent: re-running it after `id` is already gone
# is a no-op instead of raising KeyError.
# NOTE(review): the recorded outputs of the duplicate checks earlier in the notebook
# contain no `id` column, so this drop was executed before them; consider moving this
# cell above those checks so that Restart & Run All follows the same order.
main_data.drop(columns="id", inplace=True, errors="ignore")