In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load dataset
airbnb = pd.read_csv('AB_NYC_2019.csv')

In [3]:
# display top 5 rows
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,21-05-2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05-07-2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19-11-2018,0.1,1,0


In [4]:
# shape
airbnb.shape

(48895, 16)

In [5]:
# number of unique values
airbnb.nunique()

id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [6]:
# information
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [7]:
# categorising different variables

# categorical variables
cat_var = ['neighbourhood_group','neighbourhood','room_type']

# continuous variables
cont_var = ['price','minimum_nights', 'number_of_reviews','reviews_per_month', 
            'calculated_host_listings_count','availability_365']

# location variables
loc_var = ['latitude', 'longitude']

# time variable
time_var = ['last_review']

# id variables
id_var = ['id','host_id']

In [8]:
# value counts for categorical variables
for i in cat_var:
    print(airbnb[i].value_counts())
    print('='*50)

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64
Williamsburg          3920
Bedford-Stuyvesant    3714
Harlem                2658
Bushwick              2465
Upper West Side       1971
                      ... 
Fort Wadsworth           1
Richmondtown             1
New Dorp                 1
Rossville                1
Willowbrook              1
Name: neighbourhood, Length: 221, dtype: int64
Entire home/apt    25409
Private room       22326
Shared room         1160
Name: room_type, dtype: int64


In [9]:
# statitical analysis of continuous variables
airbnb[cont_var].describe()

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,0.0,1.0,0.0,0.01,1.0,0.0
25%,69.0,1.0,1.0,0.19,1.0,0.0
50%,106.0,3.0,5.0,0.72,1.0,45.0
75%,175.0,5.0,24.0,2.02,2.0,227.0
max,10000.0,1250.0,629.0,58.5,327.0,365.0


In [10]:
# percentage of NULL values
airbnb.isnull().sum()*100/len(airbnb)

id                                 0.000000
name                               0.032723
host_id                            0.000000
host_name                          0.042949
neighbourhood_group                0.000000
neighbourhood                      0.000000
latitude                           0.000000
longitude                          0.000000
room_type                          0.000000
price                              0.000000
minimum_nights                     0.000000
number_of_reviews                  0.000000
last_review                       20.558339
reviews_per_month                 20.558339
calculated_host_listings_count     0.000000
availability_365                   0.000000
dtype: float64

In [11]:
# number of NULL values
airbnb.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [12]:
# checking rows where name is missing
airbnb[airbnb.name.isnull()]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2854,1615764,,6676776,Peter,Manhattan,Battery Park City,40.71239,-74.0162,Entire home/apt,400,1000,0,,,1,362
3703,2232600,,11395220,Anna,Manhattan,East Village,40.73215,-73.98821,Entire home/apt,200,1,28,08-06-2015,0.45,1,341
5775,4209595,,20700823,Jesse,Manhattan,Greenwich Village,40.73473,-73.99244,Entire home/apt,225,1,1,01-01-2015,0.02,1,0
5975,4370230,,22686810,Michaël,Manhattan,Nolita,40.72046,-73.9955,Entire home/apt,215,7,5,02-01-2016,0.09,1,0
6269,4581788,,21600904,Lucie,Brooklyn,Williamsburg,40.7137,-73.94378,Private room,150,1,0,,,1,0
6567,4756856,,1832442,Carolina,Brooklyn,Bushwick,40.70046,-73.92825,Private room,70,1,0,,,1,0
6605,4774658,,24625694,Josh,Manhattan,Washington Heights,40.85198,-73.93108,Private room,40,1,0,,,1,0
8841,6782407,,31147528,Huei-Yin,Brooklyn,Williamsburg,40.71354,-73.93882,Private room,45,1,0,,,1,0
11963,9325951,,33377685,Jonathan,Manhattan,Hell's Kitchen,40.76436,-73.98573,Entire home/apt,190,4,1,05-01-2016,0.02,1,0
12824,9787590,,50448556,Miguel,Manhattan,Harlem,40.80316,-73.95189,Entire home/apt,300,5,0,,,5,0


In [13]:
# rows where host_name is missing
airbnb[airbnb.host_name.isnull()]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
360,100184,Bienvenue,526653,,Queens,Queens Village,40.72413,-73.76133,Private room,50,1,43,08-07-2019,0.45,1,88
2700,1449546,Cozy Studio in Flatbush,7779204,,Brooklyn,Flatbush,40.64965,-73.96154,Entire home/apt,100,30,49,02-01-2017,0.69,1,342
5745,4183989,SPRING in the City!! Zen-Style Tranquil Bedroom,919218,,Manhattan,Harlem,40.80606,-73.95061,Private room,86,3,34,23-05-2019,1.0,1,359
6075,4446862,Charming Room in Prospect Heights!,23077718,,Brooklyn,Crown Heights,40.67512,-73.96146,Private room,50,1,0,,,1,0
6582,4763327,"Luxurious, best location, spa inc'l",24576978,,Brooklyn,Greenpoint,40.72035,-73.95355,Entire home/apt,195,1,1,20-10-2015,0.02,1,0
8163,6292866,Modern Quiet Gem Near All,32722063,,Brooklyn,East Flatbush,40.65263,-73.93215,Entire home/apt,85,2,182,19-06-2019,3.59,2,318
8257,6360224,"Sunny, Private room in Bushwick",33134899,,Brooklyn,Bushwick,40.70146,-73.92792,Private room,37,1,1,01-07-2015,0.02,1,0
8852,6786181,R&S Modern Spacious Hideaway,32722063,,Brooklyn,East Flatbush,40.64345,-73.93643,Entire home/apt,100,2,157,19-06-2019,3.18,2,342
9138,6992973,1 Bedroom in Prime Williamsburg,5162530,,Brooklyn,Williamsburg,40.71838,-73.9563,Entire home/apt,145,1,0,,,1,0
9817,7556587,Sunny Room in Harlem,39608626,,Manhattan,Harlem,40.82929,-73.94182,Private room,28,1,1,01-08-2015,0.02,1,0


In [14]:
# host_id where name or host_name is missing
host = airbnb[airbnb.host_name.isnull()]['host_id']
host

360         526653
2700       7779204
5745        919218
6075      23077718
6582      24576978
8163      32722063
8257      33134899
8852      32722063
9138       5162530
9817      39608626
14040      7822683
14631     26138712
15174      5300585
19565    100971588
27777       415290
27962    159156636
28042    159156636
28274    159156636
30570    177146433
32193    119609345
38992    228750026
Name: host_id, dtype: int64

In [15]:
for i in host.values:
    print(airbnb[airbnb.host_id==i]['host_name'])
    print('')

360    NaN
Name: host_name, dtype: object

2700    NaN
Name: host_name, dtype: object

5745    NaN
Name: host_name, dtype: object

6075    NaN
Name: host_name, dtype: object

6582    NaN
Name: host_name, dtype: object

8163    NaN
8852    NaN
Name: host_name, dtype: object

8257    NaN
Name: host_name, dtype: object

8163    NaN
8852    NaN
Name: host_name, dtype: object

9138    NaN
Name: host_name, dtype: object

9817    NaN
Name: host_name, dtype: object

14040    NaN
Name: host_name, dtype: object

14631    NaN
Name: host_name, dtype: object

15174    NaN
Name: host_name, dtype: object

19565    NaN
Name: host_name, dtype: object

27777    NaN
Name: host_name, dtype: object

27962    NaN
28042    NaN
28274    NaN
Name: host_name, dtype: object

27962    NaN
28042    NaN
28274    NaN
Name: host_name, dtype: object

27962    NaN
28042    NaN
28274    NaN
Name: host_name, dtype: object

30570    NaN
Name: host_name, dtype: object

32193    NaN
Name: host_name, dtype: object

38992    

- Deleting all rows where either `name` or `host_name` is NULL

In [16]:
# Deleting all rows where either `name` or `host_name` is NULL
airbnb.dropna(subset=['name','host_name'], inplace=True)

In [17]:
# checking NULL values again
airbnb.isnull().sum()*100/len(airbnb)

id                                 0.000000
name                               0.000000
host_id                            0.000000
host_name                          0.000000
neighbourhood_group                0.000000
neighbourhood                      0.000000
latitude                           0.000000
longitude                          0.000000
room_type                          0.000000
price                              0.000000
minimum_nights                     0.000000
number_of_reviews                  0.000000
last_review                       20.543207
reviews_per_month                 20.543207
calculated_host_listings_count     0.000000
availability_365                   0.000000
dtype: float64

- Filling NULL values with 0 for `reviews_per_month`

In [18]:
# Filling NULL values with 0 for `reviews_per_month`
airbnb.reviews_per_month.fillna(0, inplace=True)

In [19]:
airbnb.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                             0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10037
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
dtype: int64

- Converting `last_review` to datetype

In [20]:
# Converting `last_review` to datetype
airbnb.last_review = pd.to_datetime(airbnb.last_review)

In [21]:
airbnb.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                             0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10037
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [22]:
# creating a new file to be used for visualisation in TABLEAU
airbnb.to_csv('airbnb_treated.csv', index=False)