<a href="https://colab.research.google.com/github/lamphgg/Airbnb_filter/blob/main/pandas_project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading the Dataset


In [None]:
%%capture
!pip install numpy pandas streamlit gdown pyarrow

We will download the datasets from Google Drive just like we did last week, but this time the datasets are in [Pickle](https://pythonnumericalmethods.berkeley.edu/notebooks/chapter11.03-Pickle-Files.html) and [Parquet](https://arrow.apache.org/docs/python/parquet.html) format.

In [None]:
import os
import shutil

import gdown
import numpy as np
import pandas as pd

# Download files from Google Drive
# Based on data from: http://insideairbnb.com/get-the-data/
file_id_1 = "1m185vTdh-u7_A2ZElBvUD4SCO6oETll2"
file_id_2 = "1w41V1oWHJrBdaNJJQ4oxVBuml5CO7MQX"
downloaded_file_1 = "listings_project.pkl"
downloaded_file_2 = "calendar_project.parquet"

# Fetch each file only when it is not already on disk, so re-running the
# notebook does not download the same data again.
for file_id, output_path in (
    (file_id_1, downloaded_file_1),
    (file_id_2, downloaded_file_2),
):
    if os.path.exists(output_path):
        continue
    # Fail loudly here instead of letting a later read_* call hit a missing file.
    if gdown.download(id=file_id, output=output_path) is None:
        raise RuntimeError(
            f"Could not download {output_path} (Google Drive id: {file_id})"
        )

Downloading...
From: https://drive.google.com/uc?id=1m185vTdh-u7_A2ZElBvUD4SCO6oETll2
To: /content/listings_project.pkl
100%|██████████| 1.42M/1.42M [00:00<00:00, 103MB/s]
Downloading...
From: https://drive.google.com/uc?id=1w41V1oWHJrBdaNJJQ4oxVBuml5CO7MQX
To: /content/calendar_project.parquet
100%|██████████| 1.23M/1.23M [00:00<00:00, 89.2MB/s]


'calendar_project.parquet'

In [None]:
# Notebook-wide display settings: show every column and print floats with two decimals.
display_settings = {
    "display.max_columns": None,
    "display.float_format": "{:.2f}".format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

## Preprocessing the Dataset


In [None]:
# Listings table, stored as a pickle file.
# NOTE(review): unpickling can execute arbitrary code; this file was downloaded
# from Google Drive, so only load it when the source is trusted.
df_list = pd.read_pickle("listings_project.pkl")
# Calendar table, stored as a Parquet file.
df_cal = pd.read_parquet("calendar_project.parquet")

In [None]:
df_list.head()

Unnamed: 0,id,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,instant_bookable,reviews_per_month,price_in_euros,price_per_person,minimum_price,discount_per_5_days_booked,discount_per_10_days_booked,discount_per_30_and_more_days_booked,service_cost
0,23726706,0.95,f,1,1,IJburg - Zeeburgereiland,52.35,4.98,Private room,2,1.0,1.0,6,$88.00,2,14,t,0,6,20,66,78,11,3,4.99,t,1.53,,$44,$176,5%,11%,16%,$4.99
1,35815036,1.0,t,1,1,Noord-Oost,52.42,4.96,Entire home/apt,2,,1.0,5,$105.00,3,100,t,4,6,12,243,95,36,6,4.96,f,2.65,,$52.5,$315,5%,12%,16%,$4.99
2,31553121,1.0,f,1,1,Noord-West,52.43,4.92,Entire home/apt,4,1.0,3.0,3,$152.00,2,60,t,0,3,3,3,82,26,1,4.74,f,2.02,,$38,$304,7%,11%,22%,$4.99
3,34745823,0.94,f,3,3,Gaasperdam - Driemond,52.3,5.01,Entire home/apt,2,1.0,2.0,8,$87.00,2,1125,t,5,20,26,290,39,4,0,4.87,f,1.08,,$43.5,$174,6%,10%,15%,$4.99
4,44586947,0.88,t,0,0,Gaasperdam - Driemond,52.31,5.03,Private room,4,2.0,3.0,4,$160.00,2,31,t,9,32,62,152,15,12,3,5.0,f,0.68,,$40,$320,9%,22%,20%,$4.99


In [None]:
df_cal.head()

Unnamed: 0,listing_id,date,available,price_in_dollar,minimum_nights,maximum_nights
0,23726706,2022-06-05,False,90.0,2,1125
1,23726706,2022-06-06,False,90.0,2,1125
2,23726706,2022-06-07,False,90.0,2,1125
3,23726706,2022-06-08,False,90.0,2,1125
4,23726706,2022-06-09,False,85.0,2,1125


In [None]:
df_list.describe()

Unnamed: 0,id,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,amenities,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,reviews_per_month
count,6165.0,5365.0,6165.0,6165.0,6165.0,6165.0,6165.0,5859.0,6082.0,6165.0,6165.0,6165.0,6165.0,6165.0,6165.0,6165.0,6165.0,6165.0,6165.0,5581.0,5581.0
mean,4.431834092624375e+16,0.71,2.85,2.85,52.37,4.89,3.0,1.6,1.98,8.1,4.5,493.12,3.16,8.34,14.98,88.11,50.82,10.27,1.28,4.76,1.21
std,1.575603836907899e+17,0.31,29.94,29.94,0.02,0.04,1.52,0.95,1.71,5.76,31.24,515.26,5.85,13.3,22.01,112.6,94.41,29.66,3.03,0.35,2.25
min,2818.0,0.0,0.0,0.0,52.27,4.74,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
25%,12885866.0,0.51,1.0,1.0,52.36,4.87,2.0,1.0,1.0,4.0,2.0,22.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.67,0.26
50%,27348973.0,0.79,1.0,1.0,52.37,4.89,2.0,1.0,1.0,7.0,2.0,180.0,0.0,2.0,4.0,23.0,17.0,3.0,0.0,4.86,0.57
75%,44297369.0,1.0,2.0,2.0,52.38,4.91,4.0,2.0,2.0,11.0,3.0,1125.0,4.0,11.0,22.0,158.0,49.0,10.0,2.0,4.98,1.43
max,6.411317621180361e+17,1.0,1926.0,1926.0,52.44,5.07,16.0,15.0,34.0,92.0,1001.0,1825.0,30.0,60.0,90.0,365.0,1110.0,1088.0,128.0,5.0,94.88


In [None]:
df_cal.describe()

Unnamed: 0,listing_id,price_in_dollar,minimum_nights,maximum_nights
count,2252415.0,2252415.0,2252415.0,2252415.0
mean,4.427525065800575e+16,218.29,4.61,696643.83
std,1.5747708753384077e+17,464.77,31.26,38640953.67
min,2818.0,0.0,1.0,1.0
25%,12898673.0,115.0,2.0,30.0
50%,27362708.0,160.0,3.0,1125.0
75%,44297369.0,247.0,4.0,1125.0
max,6.411317621180361e+17,18400.0,1001.0,2147483647.0


In [None]:
df_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6165 entries, 0 to 6172
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    6165 non-null   int64  
 1   host_acceptance_rate                  5365 non-null   float64
 2   host_is_superhost                     6165 non-null   object 
 3   host_listings_count                   6165 non-null   int64  
 4   host_total_listings_count             6165 non-null   int64  
 5   neighbourhood_cleansed                6165 non-null   object 
 6   latitude                              6165 non-null   float64
 7   longitude                             6165 non-null   float64
 8   room_type                             6165 non-null   object 
 9   accommodates                          6165 non-null   int64  
 10  bedrooms                              5859 non-null   float64
 11  beds             

In [None]:
df_list.discount_per_5_days_booked.head(5)

0    5%
1    5%
2    7%
3    6%
4    9%
Name: discount_per_5_days_booked, dtype: object

#### Clean


In [None]:
# Convert percentage strings such as "5%" into fractions such as 0.05.
discount_columns = [
    "discount_per_5_days_booked",
    "discount_per_10_days_booked",
    "discount_per_30_and_more_days_booked",
]
for discount_column in discount_columns:
    # Remove the literal "%" (regex=False: plain text, not a pattern) instead of
    # swapping it for a space, and divide by 100 rather than multiplying by 0.01
    # to avoid extra floating-point error.
    df_list[discount_column] = (
        df_list[discount_column].str.replace("%", "", regex=False).astype(float) / 100
    )

In [None]:
df_list.discount_per_5_days_booked.head(5)

0   0.05
1   0.05
2   0.07
3   0.06
4   0.09
Name: discount_per_5_days_booked, dtype: float64

In [None]:
df_list[["host_is_superhost", "instant_bookable", "has_availability"]].head(5)

Unnamed: 0,host_is_superhost,instant_bookable,has_availability
0,f,t,t
1,t,f,t
2,f,f,t
3,f,f,t
4,t,f,t


In [None]:
# The source data encodes booleans as the strings 't' / 'f'; turn them into real bools.
flag_mapping = {'f': False, 't': True}
for flag_column in ("host_is_superhost", "instant_bookable", "has_availability"):
    df_list[flag_column] = df_list[flag_column].replace(flag_mapping).astype(bool)

In [None]:
df_list[["host_is_superhost", "instant_bookable", "has_availability"]].head(5)

Unnamed: 0,host_is_superhost,instant_bookable,has_availability
0,False,True,True
1,True,False,True
2,False,False,True
3,False,False,True
4,True,False,True


In [None]:
df_list[["price", "price_per_person", "minimum_price", 'service_cost']].head(5)

Unnamed: 0,price,price_per_person,minimum_price,service_cost
0,$88.00,$44,$176,$4.99
1,$105.00,$52.5,$315,$4.99
2,$152.00,$38,$304,$4.99
3,$87.00,$43.5,$174,$4.99
4,$160.00,$40,$320,$4.99


1. Remove dollar signs and commas
1. Convert to `float`



In [None]:
# Strip dollar signs and commas from the money columns, then store them as floats.
for money_column in ("price", "price_per_person", "minimum_price", "service_cost"):
    stripped_text = df_list[money_column].str.replace("[$,]", "", regex=True)
    df_list[money_column] = stripped_text.astype(float)

In [None]:
df_list[["price", "price_per_person", "minimum_price", 'service_cost']].head(5)

Unnamed: 0,price,price_per_person,minimum_price,service_cost
0,88.0,44.0,176.0,4.99
1,105.0,52.5,315.0,4.99
2,152.0,38.0,304.0,4.99
3,87.0,43.5,174.0,4.99
4,160.0,40.0,320.0,4.99


#### Change column names

- `price` into `price_in_dollar`
- `neighbourhood_cleansed` into `neighbourhood`  


In [None]:
# Old column name -> new column name
column_renames = {
    'price': 'price_in_dollar',
    'neighbourhood_cleansed': 'neighbourhood',
}
df_list = df_list.rename(columns=column_renames)

In [None]:
df_list[['price_in_dollar','neighbourhood']].head()

Unnamed: 0,price_in_dollar,neighbourhood
0,88.0,IJburg - Zeeburgereiland
1,105.0,Noord-Oost
2,152.0,Noord-West
3,87.0,Gaasperdam - Driemond
4,160.0,Gaasperdam - Driemond


In [None]:
# Store the two text label columns with the pandas "category" dtype.
for label_column in ('neighbourhood', 'room_type'):
    df_list[label_column] = df_list[label_column].astype('category')

#### Delete irrelevant columns
* `host_listings_count`
* `host_total_listings_count`
* `availability_60`
* `availability_90`
* `availability_365`
* `number_of_reviews`
* `number_of_reviews_ltm`
* `reviews_per_month`

In [None]:
# Columns that are not used in the rest of this notebook.
irrelevant_columns = [
    "host_listings_count",
    "host_total_listings_count",
    "availability_60",
    "availability_90",
    "availability_365",
    "number_of_reviews",
    "number_of_reviews_ltm",
    "reviews_per_month",
]
df_list = df_list.drop(columns=irrelevant_columns)

In [None]:
df_list.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6165 entries, 0 to 6172
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   id                                    6165 non-null   int64   
 1   host_acceptance_rate                  5365 non-null   float64 
 2   host_is_superhost                     6165 non-null   bool    
 3   neighbourhood                         6165 non-null   category
 4   latitude                              6165 non-null   float64 
 5   longitude                             6165 non-null   float64 
 6   room_type                             6165 non-null   category
 7   accommodates                          6165 non-null   int64   
 8   bedrooms                              5859 non-null   float64 
 9   beds                                  6082 non-null   float64 
 10  amenities                             6165 non-null   int64   
 11  pric

In [None]:
df_list.price_in_euros

0       None
1       None
2       None
3       None
4       None
        ... 
6168    None
6169    None
6170    None
6171    None
6172    None
Name: price_in_euros, Length: 6165, dtype: object

In [None]:
df_list['price_in_euros'].unique()

array([None], dtype=object)

In [None]:
# price_in_euros holds only None (see the unique() output above), so remove it.
df_list = df_list.drop("price_in_euros", axis=1)

In [None]:
df_list.head()

Unnamed: 0,id,host_acceptance_rate,host_is_superhost,neighbourhood,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price_in_dollar,minimum_nights,maximum_nights,has_availability,availability_30,number_of_reviews_l30d,review_scores_rating,instant_bookable,price_per_person,minimum_price,discount_per_5_days_booked,discount_per_10_days_booked,discount_per_30_and_more_days_booked,service_cost
0,23726706,0.95,False,IJburg - Zeeburgereiland,52.35,4.98,Private room,2,1.0,1.0,6,88.0,2,14,True,0,3,4.99,True,44.0,176.0,0.05,0.11,0.16,4.99
1,35815036,1.0,True,Noord-Oost,52.42,4.96,Entire home/apt,2,,1.0,5,105.0,3,100,True,4,6,4.96,False,52.5,315.0,0.05,0.12,0.16,4.99
2,31553121,1.0,False,Noord-West,52.43,4.92,Entire home/apt,4,1.0,3.0,3,152.0,2,60,True,0,1,4.74,False,38.0,304.0,0.07,0.11,0.22,4.99
3,34745823,0.94,False,Gaasperdam - Driemond,52.3,5.01,Entire home/apt,2,1.0,2.0,8,87.0,2,1125,True,5,0,4.87,False,43.5,174.0,0.06,0.1,0.15,4.99
4,44586947,0.88,True,Gaasperdam - Driemond,52.31,5.03,Private room,4,2.0,3.0,4,160.0,2,31,True,9,3,5.0,False,40.0,320.0,0.09,0.22,0.2,4.99


In [None]:
df_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6165 entries, 0 to 6172
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   id                                    6165 non-null   int64   
 1   host_acceptance_rate                  5365 non-null   float64 
 2   host_is_superhost                     6165 non-null   bool    
 3   neighbourhood                         6165 non-null   category
 4   latitude                              6165 non-null   float64 
 5   longitude                             6165 non-null   float64 
 6   room_type                             6165 non-null   category
 7   accommodates                          6165 non-null   int64   
 8   bedrooms                              5859 non-null   float64 
 9   beds                                  6082 non-null   float64 
 10  amenities                             6165 non-null   int64   
 11  pric

In [None]:
# Keep only listings that have both an acceptance rate and a review score.
required_columns = ["host_acceptance_rate", "review_scores_rating"]
df_list = df_list.dropna(subset=required_columns)

In [None]:
df_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4886 entries, 0 to 6172
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   id                                    4886 non-null   int64   
 1   host_acceptance_rate                  4886 non-null   float64 
 2   host_is_superhost                     4886 non-null   bool    
 3   neighbourhood                         4886 non-null   category
 4   latitude                              4886 non-null   float64 
 5   longitude                             4886 non-null   float64 
 6   room_type                             4886 non-null   category
 7   accommodates                          4886 non-null   int64   
 8   bedrooms                              4622 non-null   float64 
 9   beds                                  4817 non-null   float64 
 10  amenities                             4886 non-null   int64   
 11  pric

In [None]:
df_list["room_type"].unique()

['Private room', 'Entire home/apt', 'Hotel room', 'Shared room']
Categories (4, object): ['Entire home/apt', 'Hotel room', 'Private room', 'Shared room']


- **"Private room"** or **"Shared room"** as `room_type`, then the listing only has one bedroom.
- **"Hotel room"** or **"Entire home/apt"** as `room_type`, then the number of guests the listing accomodates by 2 and round up.
- If any of these numbers are missing, then we can leave it empty.

In [None]:
def fill_empty_bedrooms(accommodates: int, bedrooms: int, room_type: str) -> int:
    """Derive a bedroom count for one listing from its room type.

    Args:
        accommodates: Number of guests the listing accommodates.
        bedrooms: Bedroom count currently recorded for the listing.
        room_type: Room type label of the listing.

    Returns:
        1 for "Private room" and "Shared room"; ``np.ceil(accommodates / 2)``
        for "Hotel room" and "Entire home/apt"; otherwise ``bedrooms`` unchanged.

    NOTE(review): for the four known room types the incoming ``bedrooms`` value
    is ignored even when it is not missing -- confirm this is intended.
    """
    if room_type in ("Private room", "Shared room"):
        return 1
    if room_type in ("Hotel room", "Entire home/apt"):
        return np.ceil(accommodates / 2)
    # Any other room type: keep whatever was recorded.
    return bedrooms

In [None]:
%%timeit -r 4 -n 100

# Timed: the row-wise approach, where apply() calls fill_empty_bedrooms once per row.
temp_df = df_list.copy()  # Deep copy of the df, not a "view"
# The result goes into a "rooms" column on the copy, so df_list itself is not modified here.
temp_df["rooms"] = df_list[["accommodates", "bedrooms", "room_type"]].apply(
    lambda x: fill_empty_bedrooms(x["accommodates"], x["bedrooms"], x["room_type"]),
    axis=1,
)

117 ms ± 35.4 ms per loop (mean ± std. dev. of 4 runs, 100 loops each)


In [None]:
def _bedrooms_for_row(row):
    """Call fill_empty_bedrooms with the three fields of one listing row."""
    return fill_empty_bedrooms(row["accommodates"], row["bedrooms"], row["room_type"])


# Only these three columns are needed to derive the bedroom count.
bedroom_inputs = df_list[["accommodates", "bedrooms", "room_type"]]
df_list["bedrooms"] = bedroom_inputs.apply(_bedrooms_for_row, axis=1)

In [None]:
df_list[["accommodates", "bedrooms", "room_type"]].head()

Unnamed: 0,accommodates,bedrooms,room_type
0,2,1.0,Private room
1,2,1.0,Entire home/apt
2,4,2.0,Entire home/apt
3,2,1.0,Entire home/apt
4,4,1.0,Private room


In [None]:
%%timeit -r 4 -n 100

temp_df = df_list.copy()

# Timed: the vectorised approach. Start from the bedroom count, then
# overwrite it per room type using boolean masks instead of a per-row apply().
temp_df["beds"] = temp_df.bedrooms

# Room-type labels are case-sensitive: the category is "Private room",
# so a lowercase "private room" would never match any row.
priv_shared_mask = (
    (temp_df.room_type == "Private room") | (temp_df.room_type == "Shared room")
)
temp_df.loc[priv_shared_mask, "beds"] = 1


hotel_apt_mask = (
    (temp_df.room_type == "Hotel room") | (temp_df.room_type == "Entire home/apt")
)
# Compute the right-hand side only for the masked rows, so the assignment
# does not depend on index alignment of a full-length Series.
temp_df.loc[hotel_apt_mask, "beds"] = np.ceil(
    temp_df.loc[hotel_apt_mask, "accommodates"] / 2
)

The slowest run took 5.91 times longer than the fastest. This could mean that an intermediate result is being cached.
8.32 ms ± 4.41 ms per loop (mean ± std. dev. of 4 runs, 100 loops each)


In [None]:
# Drop listings without a bedroom or bed count. The explicit .copy() makes the
# result an independent frame, so later column assignments do not trigger
# SettingWithCopyWarning.
df_list = df_list.dropna(subset=["bedrooms", "beds"]).copy()

In [None]:
# Cast both count columns to integers in one frame-level astype(); this returns
# a new DataFrame instead of assigning columns on a frame pandas has flagged as
# a slice, which is what raised SettingWithCopyWarning.
df_list = df_list.astype({"beds": "int", "bedrooms": "int"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list["beds"] = df_list["beds"].astype("int")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list["bedrooms"] = df_list["bedrooms"].astype("int")


In [None]:
#Check the current memory usage of the bedrooms column at current int64
df_list["bedrooms"].memory_usage(index=False, deep=True)

38536

In [None]:
# Downcast bedrooms to int8 via a frame-level astype() (returns a new frame, so
# no SettingWithCopyWarning), then show the column's new memory footprint.
df_list = df_list.astype({"bedrooms": "int8"})
df_list["bedrooms"].memory_usage(index=False, deep=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list["bedrooms"] = df_list["bedrooms"].astype("int8")


4817

In [None]:
df_list.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4817 entries, 0 to 6172
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   id                                    4817 non-null   int64   
 1   host_acceptance_rate                  4817 non-null   float64 
 2   host_is_superhost                     4817 non-null   bool    
 3   neighbourhood                         4817 non-null   category
 4   latitude                              4817 non-null   float64 
 5   longitude                             4817 non-null   float64 
 6   room_type                             4817 non-null   category
 7   accommodates                          4817 non-null   int64   
 8   bedrooms                              4817 non-null   int8    
 9   beds                                  4817 non-null   int64   
 10  amenities                             4817 non-null   int64   
 11  pric

In [None]:
df_list.head(3)

Unnamed: 0,id,host_acceptance_rate,host_is_superhost,neighbourhood,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price_in_dollar,minimum_nights,maximum_nights,has_availability,availability_30,number_of_reviews_l30d,review_scores_rating,instant_bookable,price_per_person,minimum_price,discount_per_5_days_booked,discount_per_10_days_booked,discount_per_30_and_more_days_booked,service_cost
0,23726706,0.95,False,IJburg - Zeeburgereiland,52.35,4.98,Private room,2,1,1,6,88.0,2,14,True,0,3,4.99,True,44.0,176.0,0.05,0.11,0.16,4.99
1,35815036,1.0,True,Noord-Oost,52.42,4.96,Entire home/apt,2,1,1,5,105.0,3,100,True,4,6,4.96,False,52.5,315.0,0.05,0.12,0.16,4.99
2,31553121,1.0,False,Noord-West,52.43,4.92,Entire home/apt,4,2,3,3,152.0,2,60,True,0,1,4.74,False,38.0,304.0,0.07,0.11,0.22,4.99


In [None]:
# The Calendar DataFrame!
df_cal.head(3)

Unnamed: 0,listing_id,date,available,price_in_dollar,minimum_nights,maximum_nights
0,23726706,2022-06-05,False,90.0,2,1125
1,23726706,2022-06-06,False,90.0,2,1125
2,23726706,2022-06-07,False,90.0,2,1125


---

#### Minimum stay

- Create a conditional index (a boolean mask) for the entries whose `minimum_nights` meets the 3-day threshold (3 nights or more)
- Calculate the price of booking a listing for 5 days by multiplying the current day's price by 5, and assign this to a column called `five_day_dollar_price`

In [None]:
# Work on a copy so that df_cal stays untouched, which helps when debugging
calendar_newdf = df_cal.copy()

# Boolean mask: True for rows whose minimum stay is 3 nights or more
include_list = calendar_newdf["minimum_nights"].ge(3)

In [None]:
# Get all the listings with a minimum nights of 3+
# Use the include_list
# .copy() turns the selection into an independent frame, so adding columns to
# it afterwards does not raise SettingWithCopyWarning.
calendar_newdf = calendar_newdf.loc[include_list].copy()

| Related functions |
| ---- |
| [isin()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isin.html): Filter the DataFrame on provided values |
| [eq()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eq.html#pandas.DataFrame.eq): Filter the DataFrame for all values equal to the provided input |
| [ne()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ne.html#pandas.DataFrame.ne): Filter the DataFrame for all values not equal to the provided input |

In [None]:
# Price of a 5-day booking at the current day's price. assign() returns a new
# DataFrame, which avoids the SettingWithCopyWarning raised when a column is
# set directly on a filtered frame.
calendar_newdf = calendar_newdf.assign(
    five_day_dollar_price=calendar_newdf["price_in_dollar"] * 5
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calendar_newdf["five_day_dollar_price"] = calendar_newdf["price_in_dollar"] * 5


Transform our newly created DataFrame into a **pivot table**, where we aggregate our rows using the `listing_id` as the index, and the columns `available` and `five_day_dollar_price` as values.

In [None]:
# One row per listing: mean availability and mean 5-day price over its calendar rows.
calendar_summarizeddf = pd.pivot_table(
    data=calendar_newdf,
    index=["listing_id"],
    values=["available", "five_day_dollar_price"],
    # "mean" is pivot_table's default aggregation; the string name is used
    # instead of np.mean because newer pandas versions warn when a NumPy
    # callable is passed as the aggregation function.
    aggfunc="mean",
)

calendar_summarizeddf.head(3)

Unnamed: 0_level_0,available,five_day_dollar_price
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2818,0.21,346.9
44391,0.0,1200.0
49552,0.46,1162.5


#### Maximum price and date

Make a **pivot table** that states the **maximum `price_in_dollar`** for every Airbnb listing

In [None]:
# One row per listing: the highest daily price among its (filtered) calendar rows.
temp_sum_df = pd.pivot_table(
    data=calendar_newdf,
    index=["listing_id"],
    values=["price_in_dollar"],
    # String name instead of np.max: newer pandas versions warn when a NumPy
    # callable is passed as the aggregation function.
    aggfunc="max",
)

temp_sum_df.head(3)

Unnamed: 0_level_0,price_in_dollar
listing_id,Unnamed: 1_level_1
2818,80.0
44391,240.0
49552,300.0


#### Task 17: Merging

In [None]:
# Inner join: keep only listings present in both tables. "listing_id" is the
# index level of the pivot table; "id" is the matching key column in df_list.
final_df = df_list.merge(
    calendar_summarizeddf,
    how="inner",
    left_on=["id"],
    right_on=["listing_id"],
)

final_df.head(3)

Unnamed: 0,id,host_acceptance_rate,host_is_superhost,neighbourhood,latitude,longitude,room_type,accommodates,bedrooms,beds,amenities,price_in_dollar,minimum_nights,maximum_nights,has_availability,availability_30,number_of_reviews_l30d,review_scores_rating,instant_bookable,price_per_person,minimum_price,discount_per_5_days_booked,discount_per_10_days_booked,discount_per_30_and_more_days_booked,service_cost,available,five_day_dollar_price
0,35815036,1.0,True,Noord-Oost,52.42,4.96,Entire home/apt,2,1,1,5,105.0,3,100,True,4,6,4.96,False,52.5,315.0,0.05,0.12,0.16,4.99,0.66,528.89
1,19572024,1.0,False,Watergraafsmeer,52.31,4.91,Entire home/apt,6,3,6,14,279.0,3,300,True,6,3,4.69,False,46.5,837.0,0.09,0.16,0.14,4.99,0.82,1496.55
2,2973384,0.38,False,Watergraafsmeer,52.31,4.91,Entire home/apt,5,3,3,7,185.0,6,21,True,0,0,4.83,False,37.0,1110.0,0.06,0.12,0.18,4.99,0.04,941.3


In [None]:
# Median review score and median 5-day price per room type.
# room_type is categorical: observed=False states the grouping behaviour
# explicitly (every category gets a row), so the result does not depend on the
# pandas version's default for categorical group keys.
final_df.groupby(by=["room_type"], observed=False)[
    ["review_scores_rating", "five_day_dollar_price"]
].median()

Unnamed: 0_level_0,review_scores_rating,five_day_dollar_price
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Entire home/apt,4.88,975.0
Hotel room,4.56,1110.16
Private room,4.79,710.91
Shared room,4.6,724.11


### Download the Dataset to Your Local Machine

Let's first export our final DataFrame.

In [None]:
# Write the merged table to a CSV file; the row index is included as the first column.
output_csv_name = "WK2_Airbnb_Amsterdam_listings_proj_solution.csv"
final_df.to_csv(output_csv_name, index=True)

In [None]:
# google.colab is only importable inside a Colab runtime. Guard the import so
# that running the whole notebook elsewhere does not stop with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Download the file locally
    files.download('WK2_Airbnb_Amsterdam_listings_proj_solution.csv')
else:
    print("Not running in Google Colab; the CSV file is in the current working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>