In [None]:
# Loading data from Google Sheets
import pandas as pd

# The workbook is fetched through the Google Sheets xlsx export endpoint and read with openpyxl.
# NOTE(review): the ExcelFile handle is never closed explicitly; if `dataset` is not
# needed further down, consider wrapping it in a `with` block.
url = "https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx"
dataset = pd.ExcelFile(url, engine='openpyxl')

# Parse every worksheet and tag its rows with the worksheet name, so each row's
# origin is still known after the frames are stacked together.
sheets = []
for sheet in dataset.sheet_names:
    df = dataset.parse(sheet_name=sheet).assign(sheet=sheet)
    sheets += [df]

# Stack all worksheets into a single frame with a fresh 0..n-1 index.
df_airbnb = pd.concat(sheets, ignore_index=True)

In [None]:
# Check the shape of the combined DataFrame as a (rows, columns) tuple
df_airbnb.shape

(51707, 21)

In [None]:
# Exploratory Data Analysis (EDA)
# Derive 'city' and 'weekday/weekend' from the 'sheet' column: after splitting the
# sheet name on '_', the first piece becomes the city and the second piece becomes
# the weekday/weekend label.
sheet_parts = df_airbnb['sheet'].str.split('_')
df_airbnb['city'] = sheet_parts.str[0]
df_airbnb['weekday/weekend'] = sheet_parts.str[1]


In [None]:
# List the distinct city values, in order of first appearance
pd.unique(df_airbnb['city'])

array(['amsterdam', 'athens', 'berlin', 'barcelona', 'budapest', 'lisbon',
       'london', 'paris', 'rome', 'vienna'], dtype=object)

In [None]:
# Add a 'country' column by translating each city name to its country.
# Series.replace leaves any value that is not a key of the mapping unchanged,
# so an unexpected city name would be carried over into 'country' as-is.
country_map = dict(
    amsterdam='netherlands',
    athens='greece',
    berlin='germany',
    barcelona='spain',
    budapest='hungary',
    lisbon='portugal',
    london='uk',
    paris='france',
    rome='italy',
    vienna='austria',
)

df_airbnb['country'] = df_airbnb['city'].replace(to_replace=country_map)

In [None]:
# Rename the 'realSum' column to 'price' (modifies df_airbnb in place)
df_airbnb.rename({'realSum': 'price'}, axis='columns', inplace=True)

In [None]:
# Collapse the two flag columns 'room_shared' and 'room_private' into a single
# categorical column 'room_type_clean' with the values
# 'shared room' / 'private room' / 'entire home/apt'.
#
# Implemented with vectorized boolean masks instead of a row-wise
# DataFrame.apply: same labels and same precedence, without calling a Python
# lambda once per row.
is_shared = df_airbnb['room_shared'].eq(True)
is_private = df_airbnb['room_private'].eq(True)

# Start from the fallback label, then overwrite in increasing order of priority:
# 'shared room' is applied last so it wins when both flags are set, exactly as
# the first branch of the original if/else chain did.
room_type = pd.Series('entire home/apt', index=df_airbnb.index)
room_type = room_type.mask(is_private, 'private room')
room_type = room_type.mask(is_shared, 'shared room')

df_airbnb['room_type_clean'] = room_type

In [None]:
# Create 'host_listing_count', a label telling whether the host of a listing has
# 'one', 'two to four' or 'more than four' listings, based on columns 'mulit' and 'biz':
#   - 'one'            when both columns equal 0
#   - 'two to four'    when either column lies in the inclusive range 1..3
#   - 'more than four' in every other case (including missing values)
#
# Implemented with vectorized comparisons instead of a row-wise DataFrame.apply:
# same labels and same precedence, without calling a Python lambda once per row.
#
# NOTE(review): this logic treats 'mulit' and 'biz' as counts. If they are 0/1
# indicator flags instead, biz == 1 falls into 'two to four' and
# 'more than four' is only reached for values outside 0..3 or missing ones --
# confirm against the data dictionary.
multi_flag = df_airbnb['mulit']
biz_flag = df_airbnb['biz']

is_single = multi_flag.eq(0) & biz_flag.eq(0)
is_two_to_four = multi_flag.between(1, 3) | biz_flag.between(1, 3)

# Start from the fallback label, then overwrite in increasing order of priority:
# 'one' is applied last because it was the first branch of the original chain.
listing_bucket = pd.Series('more than four', index=df_airbnb.index)
listing_bucket = listing_bucket.mask(is_two_to_four, 'two to four')
listing_bucket = listing_bucket.mask(is_single, 'one')

df_airbnb['host_listing_count'] = listing_bucket


In [None]:
# Show the first row of the frame (positional slice, still returned as a DataFrame)
df_airbnb.iloc[:1]

In [None]:
# Show the last row of the frame (positional slice, still returned as a DataFrame)
df_airbnb.iloc[-1:]

In [None]:
# Display a random sample of 7 entries.
# random_state pins the draw so the same rows are shown on every
# Restart & Run All; the value itself is arbitrary.
df_airbnb.sample(7, random_state=42)

In [None]:
# Check the data type pandas assigned to each column
df_airbnb.dtypes

In [None]:
# Basic summary statistics for numeric data
# (with no arguments, describe() summarizes the numeric columns when any exist)
df_airbnb.describe()

In [None]:
# Basic summary statistics for categorical data
# (include="object" restricts the summary to the object-dtype columns)
df_airbnb.describe(include="object")