In [39]:
# Load every sheet of the Google Sheets workbook (exported as .xlsx) and stack
# them into a single DataFrame.
import pandas as pd

url = "https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx"

# Open the workbook in a `with` block so it is closed as soon as all sheets
# have been parsed, instead of staying open for the rest of the session.
with pd.ExcelFile(url, engine='openpyxl') as dataset:
    sheets = []
    for sheet in dataset.sheet_names:
        df = dataset.parse(sheet)
        df["sheet"] = sheet  # remember which sheet each row came from
        sheets.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df_airbnb = pd.concat(sheets, ignore_index=True)

In [40]:
# Check the shape (rows, columns) of the combined DataFrame built from all sheets
df_airbnb.shape

(51707, 21)

In [41]:
# Exploratory Data Analysis (EDA)
# Derive the city and the weekday/weekend label from the sheet name by splitting
# it on "_": the first piece becomes 'city', the second 'weekday/weekend'.
sheet_name_parts = df_airbnb['sheet'].str.split('_')
df_airbnb['city'] = sheet_name_parts.str.get(0)
df_airbnb['weekday/weekend'] = sheet_name_parts.str.get(1)


In [42]:
# List the distinct city names that were derived from the sheet names
df_airbnb['city'].unique()

array(['amsterdam', 'athens', 'berlin', 'barcelona', 'budapest', 'lisbon',
       'london', 'paris', 'rome', 'vienna'], dtype=object)

In [43]:
# Add a 'country' column by translating each city name to its country.
# Cities that are not keys of country_map are passed through unchanged by replace().
country_map = dict(
    amsterdam='netherlands',
    athens='greece',
    berlin='germany',
    barcelona='spain',
    budapest='hungary',
    lisbon='portugal',
    london='uk',
    paris='france',
    rome='italy',
    vienna='austria',
)

df_airbnb['country'] = df_airbnb['city'].replace(to_replace=country_map)

In [44]:
# Rename the realSum column to price.
# Reassign the result rather than using inplace=True, matching the
# `df_airbnb = df_airbnb.drop(...)` style used by the other cells.
df_airbnb = df_airbnb.rename(columns={'realSum': 'price'})

In [45]:
# Collapse the room_shared / room_private flags into one 'room_type' column with
# the categories 'shared room', 'private room' and 'entire home/apt', then drop
# the two flag columns.
# NOTE(review): a 'room_type' column already present in the sheets would be
# overwritten here; the unchanged column count suggests one exists. Confirm.
#
# Built with vectorized masks instead of a row-wise apply(axis=1), which runs a
# Python lambda once per row.
room_type = pd.Series('entire home/apt', index=df_airbnb.index, dtype=object)
room_type = room_type.mask(df_airbnb['room_private'].eq(True), 'private room')
# Applied last so a shared room takes precedence over a private room, the same
# priority as checking room_shared first.
room_type = room_type.mask(df_airbnb['room_shared'].eq(True), 'shared room')
df_airbnb['room_type'] = room_type

df_airbnb = df_airbnb.drop(columns=['room_shared', 'room_private'])

In [46]:
# Create 'host_listing_count' from the biz and multi columns, telling whether the
# host of the listing has one, two to four, or more than four listings:
#   biz == 1            -> 'more than four'
#   otherwise multi == 0 -> 'one'
#   otherwise            -> 'two to four'
# The two source columns are dropped afterwards.
#
# Built with vectorized masks instead of a row-wise apply(axis=1), which runs a
# Python lambda once per row.
host_listing_count = pd.Series('two to four', index=df_airbnb.index, dtype=object)
host_listing_count = host_listing_count.mask(df_airbnb['multi'].eq(0), 'one')
# Applied last so biz == 1 wins regardless of multi, the same priority as
# checking biz first.
host_listing_count = host_listing_count.mask(df_airbnb['biz'].eq(1), 'more than four')
df_airbnb['host_listing_count'] = host_listing_count

df_airbnb = df_airbnb.drop(columns=['biz', 'multi'])

In [47]:
# Re-check the shape (rows, columns) after the column additions and drops above
df_airbnb.shape

(51707, 21)

In [48]:
# Display the first row of the combined DataFrame
df_airbnb.head(1)

Unnamed: 0.1,Unnamed: 0,price,room_type,person_capacity,host_is_superhost,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,...,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet,city,weekday/weekend,country,host_listing_count
0,0,194.033698,private room,2,False,10,93,1,5.022964,2.53938,...,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays,amsterdam,weekdays,netherlands,two to four


In [49]:
# Display the last row of the combined DataFrame
df_airbnb.tail(1)

Unnamed: 0.1,Unnamed: 0,price,room_type,person_capacity,host_is_superhost,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,...,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet,city,weekday/weekend,country,host_listing_count
51706,1798,133.230489,private room,4,True,10,93,1,1.263932,0.480903,...,10.774264,225.247293,5.44414,16.39066,48.20811,vienna_weekends,vienna,weekends,austria,two to four


In [50]:
# Display a random sample of 7 entries.
# random_state is fixed so the same rows are drawn on every run and the cell
# output is reproducible after Restart & Run All.
df_airbnb.sample(7, random_state=42)

Unnamed: 0.1,Unnamed: 0,price,room_type,person_capacity,host_is_superhost,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,...,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet,city,weekday/weekend,country,host_listing_count
2429,349,86.94022,entire home/apt,4,False,9,95,1,3.694986,0.389358,...,1.715476,62.911018,4.723742,23.77107,37.99317,athens_weekdays,athens,weekdays,greece,two to four
2434,354,90.455323,entire home/apt,3,True,10,98,1,2.780092,0.596741,...,2.541818,101.986078,7.657735,23.71418,37.99414,athens_weekdays,athens,weekdays,greece,two to four
21007,1451,216.228893,entire home/apt,5,False,9,92,2,2.439231,0.672802,...,3.572866,266.557596,14.981242,-9.127,38.732,lisbon_weekends,lisbon,weekends,portugal,more than four
12449,1050,120.324901,private room,2,True,9,90,2,4.828414,0.102734,...,7.59467,289.41238,6.357418,2.14047,41.42451,barcelona_weekends,barcelona,weekends,spain,two to four
8848,288,347.761055,entire home/apt,2,False,9,87,0,8.754093,0.910233,...,10.003326,115.390668,23.792766,13.29593,52.49559,berlin_weekdays,berlin,weekdays,germany,one
24155,1693,109.833251,private room,2,False,10,100,1,7.516524,0.557976,...,11.431346,344.166752,6.162593,-0.02619,51.53215,london_weekdays,london,weekdays,uk,one
13301,624,133.242468,entire home/apt,4,True,9,95,1,1.30425,0.191635,...,14.724962,399.266272,30.23367,19.06409,47.5082,budapest_weekdays,budapest,weekdays,hungary,one


In [51]:
# Check the data type pandas assigned to each column
df_airbnb.dtypes

Unnamed: 0                      int64
price                         float64
room_type                      object
person_capacity                 int64
host_is_superhost                bool
cleanliness_rating              int64
guest_satisfaction_overall      int64
bedrooms                        int64
dist                          float64
metro_dist                    float64
attr_index                    float64
attr_index_norm               float64
rest_index                    float64
rest_index_norm               float64
lng                           float64
lat                           float64
sheet                          object
city                           object
weekday/weekend                object
country                        object
host_listing_count             object
dtype: object

In [52]:
# Basic summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_airbnb.describe()

Unnamed: 0.1,Unnamed: 0,price,person_capacity,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,1620.502388,279.879591,3.161661,9.390624,92.628232,1.15876,3.191285,0.68154,294.204105,13.423792,626.856696,22.786177,7.426068,45.671128
std,1217.380366,327.948386,1.298545,0.954868,8.945531,0.62741,2.393803,0.858023,224.754123,9.807985,497.920226,17.804096,9.799725,5.249263
min,0.0,34.779339,2.0,2.0,20.0,0.0,0.015045,0.002301,15.152201,0.926301,19.576924,0.592757,-9.22634,37.953
25%,646.0,148.752174,2.0,9.0,90.0,1.0,1.453142,0.24848,136.797385,6.380926,250.854114,8.75148,-0.0725,41.39951
50%,1334.0,211.343089,3.0,10.0,95.0,1.0,2.613538,0.413269,234.331748,11.468305,522.052783,17.542238,4.873,47.50669
75%,2382.0,319.694287,4.0,10.0,99.0,1.0,4.263077,0.73784,385.756381,17.415082,832.628988,32.964603,13.518825,51.471885
max,5378.0,18545.450285,6.0,10.0,100.0,10.0,25.284557,14.273577,4513.563486,100.0,6696.156772,100.0,23.78602,52.64141


In [None]:
# Basic summary statistics for the categorical (object-dtype) columns.
# NOTE(review): include="object" selects object-dtype columns only, so the bool
# column host_is_superhost is not part of this summary. Confirm that is intended.
df_airbnb.describe(include="object")