In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): this binds the seaborn color_palette function itself (it is
# not called), so `color` is a callable, not a list of colors. It is not
# referenced by the visible cells below.
color = sns.color_palette
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly import tools
%matplotlib inline

# Importing data set

In [64]:
# Read the listings CSV into the working frame, then print its schema
# summary (column names, non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64

In [65]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


# missing values

In [66]:
# Report how many null entries each column holds, one line per column.
null_counts = df.isnull().sum()
for column, n_missing in null_counts.items():
    print("Missing values in {} are {}".format(column, n_missing))

Missing values in id are 0
Missing values in name are 16
Missing values in host_id are 0
Missing values in host_name are 21
Missing values in neighbourhood_group are 0
Missing values in neighbourhood are 0
Missing values in latitude are 0
Missing values in longitude are 0
Missing values in room_type are 0
Missing values in price are 0
Missing values in minimum_nights are 0
Missing values in number_of_reviews are 0
Missing values in last_review are 10052
Missing values in reviews_per_month are 10052
Missing values in calculated_host_listings_count are 0
Missing values in availability_365 are 0


# Replacing null values in reviews_per_month

In [67]:
from sklearn.impute import SimpleImputer

# Fill the nulls in reviews_per_month with the column median.
# The column is reshaped to a single-column 2-D array before being passed to
# the imputer; the filled values are stored in a separate one-column frame
# (reviews_df) rather than written back to df here.
imputer = SimpleImputer(strategy="median")
reviews_column = df["reviews_per_month"].values.reshape(-1, 1)
reviews = imputer.fit_transform(reviews_column)
reviews_df = pd.DataFrame(data=reviews, columns=["reviews_per_month"])

In [68]:
# Remove the reviews_per_month column from df in place, then show the
# column labels that remain.
df.drop(columns=["reviews_per_month"], inplace=True)
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [69]:
# Attach the imputed reviews_per_month values held in reviews_df to df.
# Assigning by column name keeps this cell safe to re-run: a second run
# overwrites the column instead of appending a second "reviews_per_month",
# which would make df.reviews_per_month return a DataFrame and break the
# null check and any later use of the column.
# Values are aligned with df on the row index.
df = df.assign(reviews_per_month=reviews_df["reviews_per_month"])
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,calculated_host_listings_count,availability_365,reviews_per_month
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,6,365,0.21
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,2,355,0.38
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365,0.72
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,1,194,4.64
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,1,0,0.1


In [70]:
# Count the nulls left in reviews_per_month.
df["reviews_per_month"].isna().sum()

0

In [71]:
# Drop the id and host_id columns in place; none of the visible cells below
# reads them.
# NOTE(review): the previous comment also listed host_name as unneeded, but
# host_name is not dropped by this cell — confirm whether it should be.
df.drop(columns=["id", "host_id"], inplace=True)
df.columns

Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'calculated_host_listings_count',
       'availability_365', 'reviews_per_month'],
      dtype='object')

# some insights

In [72]:
# Preview the first five rows of the frame as it stands after the drops.
df.head(n=5)

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,calculated_host_listings_count,availability_365,reviews_per_month
0,Clean & quiet apt home by the park,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,6,365,0.21
1,Skylit Midtown Castle,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,2,355,0.38
2,THE VILLAGE OF HARLEM....NEW YORK !,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365,0.72
3,Cozy Entire Floor of Brownstone,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,1,194,4.64
4,Entire Apt: Spacious Studio/Loft by central park,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,1,0,0.1


In [73]:
# Distribution of listings by neighbourhood_group, drawn as a pie chart.
# The counts are computed once and converted to percentages of all listings.
group_counts = df.neighbourhood_group.value_counts()
name = list(group_counts.index)
number = list(group_counts.values * 100 / group_counts.values.sum())

trace = go.Pie(labels=name, values=number)
data = [trace]
layout = go.Layout(title="distribution of name by neighbourhood_group")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="distribution by neighbourhood_groud")


In [74]:
# Room types present in the data, in value_counts order (most frequent first).
room_type_counts = df["room_type"].value_counts()
total_room_types = room_type_counts.index
total_room_types

Index(['Entire home/apt', 'Private room', 'Shared room'], dtype='object')

In [75]:
# Distribution of room types by neighbourhood group: number of listings for
# every (neighbourhood_group, room_type) pair.
# Rows follow total_neighobourhood_group (value_counts order, most listings
# first). The room-type columns are labelled from total_room_types itself, so
# every count sits under its own room type instead of relying on the
# frequency order happening to match a hard-coded header list.
# The count columns are integer-typed; the frame is built in one step rather
# than grown row by row.
total_neighobourhood_group = df.neighbourhood_group.value_counts().index

room_counts = pd.crosstab(df.neighbourhood_group, df.room_type).reindex(
    index=list(total_neighobourhood_group),
    columns=list(total_room_types),
    fill_value=0,  # a pair with no listings counts as zero
)
room_counts.columns.name = None  # drop the "room_type" axis label left by crosstab
new_df = room_counts.rename_axis("neighbourhood_group").reset_index()
new_df

Unnamed: 0,neighbourhood_group,Entire home/apt,Private room,Shared room
0,Manhattan,13199,7982,480
1,Brooklyn,9559,10132,413
2,Queens,2096,3372,198
3,Bronx,379,652,60
4,Staten Island,176,188,9


In [76]:
# Bar chart of listing counts per neighbourhood group, one trace per
# room-type column of new_df.
trace1, trace2, trace3 = [
    go.Bar(x=list(new_df.neighbourhood_group), y=list(new_df[column]), name=label)
    for column, label in [
        ("Entire home/apt", "Entire home"),
        ("Private room", "Private room"),
        ("Shared room", "Shared room"),
    ]
]
data = [trace1, trace2, trace3]
layout = go.Layout(title="distribution of room_type by neighbourhood")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [77]:
# Mean price for every (neighbourhood_group, room_type) pair.
# Rows follow total_neighobourhood_group and the room-type columns are
# labelled from total_room_types, so each mean sits under its own room type.
# A pair with no listings yields NaN instead of raising ZeroDivisionError,
# and each group is scanned once rather than building the same mask twice.
mean_price = (
    df.groupby(["neighbourhood_group", "room_type"])["price"]
    .mean()
    .unstack("room_type")
    .reindex(index=list(total_neighobourhood_group), columns=list(total_room_types))
)
mean_price.columns.name = None  # drop the "room_type" axis label left by unstack
price_df = mean_price.rename_axis("neighbourhood_group").reset_index()
price_df

Unnamed: 0,neighbourhood_group,Entire home/apt,Private room,Shared room
0,Manhattan,249.239109,116.776622,88.977083
1,Brooklyn,178.327545,76.500099,50.527845
2,Queens,147.050573,71.762456,69.020202
3,Bronx,127.506596,66.788344,59.8
4,Staten Island,173.846591,62.292553,57.444444


In [78]:
# Distribution of the average price of each room type by neighbourhood_group:
# one bar trace per room-type column of price_df.
trace1, trace2, trace3 = [
    go.Bar(x=list(price_df.neighbourhood_group), y=list(price_df[column]), name=label)
    for column, label in [
        ("Entire home/apt", "avg price of enitre home/apt"),
        ("Private room", "avg price of private room"),
        ("Shared room", "avg price of shared room"),
    ]
]
data = [trace1, trace2, trace3]
layout = go.Layout(title="Distribution of avg price of different room types by neighbourhood group")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [79]:
# The 20 listings with the highest number_of_reviews, plotted by listing name.
# NOTE(review): this rebinds reviews_df, the name an earlier cell uses for the
# imputed reviews_per_month frame.
by_review_count = df.sort_values(by="number_of_reviews", ascending=False)
reviews_df = by_review_count.head(20)

trace = go.Bar(x=list(reviews_df["name"]), y=list(reviews_df["number_of_reviews"]))
data = [trace]
layout = go.Layout(title="distribution of name by number of reviews")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [90]:
# Distribution of the average number of reviews per listing by
# neighbourhood_group.
# Bar labels and bar heights are both read from the same Series, so a bar can
# never be paired with another group's label. Groups are ordered by listing
# count (value_counts order), computed in this cell.
group_order = list(df.neighbourhood_group.value_counts().index)
avg_reviews = (
    df.groupby("neighbourhood_group")["number_of_reviews"]
    .mean()
    .reindex(group_order)
)

trace = go.Bar(x = list(avg_reviews.index), y = list(avg_reviews.values))
data = [trace]
layout = go.Layout(title = "distribution of avg number of reviews based on neighbourhood_group")
fig = go.Figure(data = data,layout=layout)
py.iplot(fig)

In [81]:
# Distribution of listings that are available all 365 days, by
# neighbourhood_group, drawn as a pie chart.
availability_df = df.loc[df.availability_365 == 365]
group_counts = availability_df.neighbourhood_group.value_counts()
x = group_counts.index
y = group_counts.values
# Percent of all fully-available listings: divide by the total count
# (y.sum()), not by the number of groups (len(y)).
trace = go.Pie(labels = list(x),values = list(y*100/y.sum()))
layout = go.Layout(title = "distribution of 365 days available rooms by neighbourhood_group")
data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)