# Start Here

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [12]:
csv = pd.read_csv('AB_NYC_2019.csv')
csv.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [13]:
# Checking the length of the dataframe

len(csv)

48895

In [14]:
# Checking the available features and their summary of statistics
# As we can see, most are non-categorical values
# And the categorical variables are 'name,' 'host_name,' 'neighbourhood_group,' 'neighbourhood,' 'room_type,' and 'last_review.'
# The distinct values of the categorical variables are also shown below

csv.describe(include='all')

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48879,48895.0,48874,48895,48895,48895.0,48895.0,48895,48895.0,48895.0,48895.0,38843,38843.0,48895.0,48895.0
unique,,47905,,11452,5,221,,,3,,,,1764,,,
top,,Hillside Hotel,,Michael,Manhattan,Williamsburg,,,Entire home/apt,,,,2019-06-23,,,
freq,,18,,417,21661,3920,,,25409,,,,1413,,,
mean,19017140.0,,67620010.0,,,,40.728949,-73.95217,,152.720687,7.029962,23.274466,,1.373221,7.143982,112.781327
std,10983110.0,,78610970.0,,,,0.05453,0.046157,,240.15417,20.51055,44.550582,,1.680442,32.952519,131.622289
min,2539.0,,2438.0,,,,40.49979,-74.24442,,0.0,1.0,0.0,,0.01,1.0,0.0
25%,9471945.0,,7822033.0,,,,40.6901,-73.98307,,69.0,1.0,1.0,,0.19,1.0,0.0
50%,19677280.0,,30793820.0,,,,40.72307,-73.95568,,106.0,3.0,5.0,,0.72,1.0,45.0
75%,29152180.0,,107434400.0,,,,40.763115,-73.936275,,175.0,5.0,24.0,,2.02,2.0,227.0


In [26]:
# Dropping features that I think are irrelevant
# The variable 'id' and 'host_id' serve no purpose other than identifying the host and property
# The 'last_review' is just the last date the property was given a review and is NaN if no renter had left a review

raw_data = csv.drop(['id', 'host_id', 'last_review'], axis=1)

raw_data.head()

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Clean & quiet apt home by the park,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Skylit Midtown Castle,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,THE VILLAGE OF HARLEM....NEW YORK !,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Cozy Entire Floor of Brownstone,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Entire Apt: Spacious Studio/Loft by central park,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [27]:
raw_data.describe(include='all')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48879,48874,48895,48895,48895.0,48895.0,48895,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
unique,47905,11452,5,221,,,3,,,,,,
top,Hillside Hotel,Michael,Manhattan,Williamsburg,,,Entire home/apt,,,,,,
freq,18,417,21661,3920,,,25409,,,,,,
mean,,,,,40.728949,-73.95217,,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,,,,,0.05453,0.046157,,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,,,,,40.49979,-74.24442,,0.0,1.0,0.0,0.01,1.0,0.0
25%,,,,,40.6901,-73.98307,,69.0,1.0,1.0,0.19,1.0,0.0
50%,,,,,40.72307,-73.95568,,106.0,3.0,5.0,0.72,1.0,45.0
75%,,,,,40.763115,-73.936275,,175.0,5.0,24.0,2.02,2.0,227.0


# Missing Variables

In [28]:
# Checking how many NaN we have in my dataset

raw_data.isnull().sum()

name                                 16
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [29]:
raw_data.describe(include='all')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48879,48874,48895,48895,48895.0,48895.0,48895,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
unique,47905,11452,5,221,,,3,,,,,,
top,Hillside Hotel,Michael,Manhattan,Williamsburg,,,Entire home/apt,,,,,,
freq,18,417,21661,3920,,,25409,,,,,,
mean,,,,,40.728949,-73.95217,,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,,,,,0.05453,0.046157,,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,,,,,40.49979,-74.24442,,0.0,1.0,0.0,0.01,1.0,0.0
25%,,,,,40.6901,-73.98307,,69.0,1.0,1.0,0.19,1.0,0.0
50%,,,,,40.72307,-73.95568,,106.0,3.0,5.0,0.72,1.0,45.0
75%,,,,,40.763115,-73.936275,,175.0,5.0,24.0,2.02,2.0,227.0


In [30]:
# Before we fill the Null with the mean of that feature, it came to my mind to check on the nature of the Nulls
# Trying to see what kind of number_of_reviews are corresponding to the reviews_per_month Nulls

raw_data[['number_of_reviews', 'reviews_per_month']][raw_data['reviews_per_month'].isnull()]

Unnamed: 0,number_of_reviews,reviews_per_month
2,0,
19,0,
26,0,
36,0,
38,0,
193,0,
204,0,
260,0,
265,0,
267,0,


In [31]:
# Although there are only roughly 10,000 Nulls to check, I realized it isn't always realistic with even larger dataset
# So I thought we can pull the summary statistics that meets the criteria

raw_data[['number_of_reviews']][raw_data['reviews_per_month'].isnull()].describe()

Unnamed: 0,number_of_reviews
count,10052.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [32]:
# Just as I suspected, the reviews_per_month NaN is due to having no reviews.
# One option is to fill the Nulls with the mean of the feature by using the following code
# raw_data.reviews_per_month = raw_data.reviews_per_month.fillna(raw_data.reviews_per_month.mean())

In [41]:
# The option that makes more sense is simply fill them with 0's

data = raw_data
data.reviews_per_month = data.reviews_per_month.fillna(0)

In [42]:
# Check again to see if there are anymore Nulls

data.isnull().sum()

name                              16
host_name                         21
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64

In [48]:
print((16+21)/len(data))

0.0007567235913692607


In [46]:
len(data)

48895

In [49]:
# Since 'name' and 'host_name' only has 16 and 21, respectively, missing variables
# That is at most 0.076% of the data, so I will just drop them

data = data.dropna()

In [50]:
data.isnull().sum()

name                              0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [51]:
data.describe(include='all')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48858,48858,48858,48858,48858.0,48858.0,48858,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0
unique,47884,11450,5,221,,,3,,,,,,
top,Hillside Hotel,Michael,Manhattan,Williamsburg,,,Entire home/apt,,,,,,
freq,18,417,21643,3917,,,25393,,,,,,
mean,,,,,40.728941,-73.95217,,152.740309,7.012444,23.273098,1.091124,7.148369,112.801425
std,,,,,0.054528,0.046159,,240.232386,20.019757,44.549898,1.59727,32.9646,131.610962
min,,,,,40.49979,-74.24442,,0.0,1.0,0.0,0.0,1.0,0.0
25%,,,,,40.69009,-73.98307,,69.0,1.0,1.0,0.04,1.0,0.0
50%,,,,,40.72307,-73.95568,,106.0,3.0,5.0,0.37,1.0,45.0
75%,,,,,40.763107,-73.93628,,175.0,5.0,24.0,1.58,2.0,227.0


In [52]:
# Saving the dataframe to CSV file

data.to_csv

<bound method DataFrame.to_csv of                                                     name         host_name  \
0                     Clean & quiet apt home by the park              John   
1                                  Skylit Midtown Castle          Jennifer   
2                    THE VILLAGE OF HARLEM....NEW YORK !         Elisabeth   
3                        Cozy Entire Floor of Brownstone       LisaRoxanne   
4       Entire Apt: Spacious Studio/Loft by central park             Laura   
5              Large Cozy 1 BR Apartment In Midtown East             Chris   
6                                        BlissArtsSpace!             Garon   
7                       Large Furnished Room Near B'way           Shunichi   
8                     Cozy Clean Guest Room - Family Apt         MaryEllen   
9                     Cute & Cozy Lower East Side 1 bdrm               Ben   
10                      Beautiful 1br on Upper West Side              Lena   
11                       Centr