In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None)

In [2]:
# We are going to use a web page that i saved and select the desired table
url = 'D:\Downloads\Cuba Hotel Search Results.html'

with open(url, encoding='utf-8') as file:
    reader = file.read()
    soup = BeautifulSoup(reader, 'html.parser')

tables = soup.find_all(class_='uitk-spacing uitk-spacing-margin-blockstart-three') # This is the table i put in my git hub repo

In [3]:
# After a manual search we find the labels we need to create a table with key info about the houses
rows = []
for table in tables:
    name = table.find('h3', class_="uitk-heading uitk-heading-5 overflow-wrap uitk-layout-grid-item uitk-layout-grid-item-has-row-start")
    if name is None:
        continue
    location = table.find('div', class_="uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme")
    breakfast = table.find('div', class_="uitk-text truncate-lines-2 uitk-type-200 uitk-text-default-theme")
    if breakfast is not None:
        breakfast = 'yes'
    else: breakfast = 'no'
    rating = table.find('span', class_="uitk-badge-base-text")
    if rating is None:
      continue
    reviews = table.find('span', class_="uitk-text uitk-type-200 uitk-type-regular uitk-text-default-theme")
    if reviews is None:
        reviews = 0
    else: reviews = reviews.text
    current_price = table.find('div', class_="uitk-text uitk-type-500 uitk-type-medium uitk-text-emphasis-theme")
    old_price = table.find('div', class_="uitk-text uitk-type-300 uitk-text-default-theme")
    if old_price is None:
        old_price = current_price

    row = [name.text, location.text, breakfast, rating.text, reviews, old_price.text, current_price.text]
    rows.append(row)

columns = ['Name', 'Location', 'Breakfast', 'Rating', 'Reviews', 'Old_Price(USD)', 'Current_Price(USD)']
airbnb_table = pd.DataFrame(rows, columns=columns)
airbnb_table['Current_Price(USD)'] = airbnb_table['Current_Price(USD)'].apply(lambda x: x[1:]).astype('int64')
airbnb_table['Old_Price(USD)'] = airbnb_table['Old_Price(USD)'].apply(lambda x: x[1:]).astype('int64')
airbnb_table['Reviews'] = airbnb_table['Reviews'].apply(lambda x: x[:-8]).astype('int64')
airbnb_table

Unnamed: 0,Name,Location,Breakfast,Rating,Reviews,Old_Price(USD),Current_Price(USD)
0,Art Studio Habana Vieja 55,Havana,no,8.4,387,80,44
1,Alhabana,Havana,yes,9.6,548,68,61
2,Vapor 156 Boutique Hotel,Havana,no,9.4,299,120,90
3,CasavanaCuba Boutique Hotel,Havana,no,9.4,736,79,79
4,Boutique Casa Italia,Havana,yes,9.8,24,217,195
5,Napper by Rottenberg,Havana,yes,9.6,113,162,105
6,Alahabana Boutique Hotel,Havana,no,9.6,271,133,47
7,La Alameda Boutique Hotel,Havana,yes,10.0,55,131,117
8,Casa 2 Sonrisas,Viñales,no,10.0,134,38,30
9,Casa Mi Tierra,Havana,yes,9.6,224,39,35


In [4]:
airbnb_table.info() # there is no missing value

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Name                99 non-null     object
 1   Location            99 non-null     object
 2   Breakfast           99 non-null     object
 3   Rating              99 non-null     object
 4   Reviews             99 non-null     int64 
 5   Old_Price(USD)      99 non-null     int64 
 6   Current_Price(USD)  99 non-null     int64 
dtypes: int64(3), object(4)
memory usage: 5.5+ KB


In [5]:
# At first glance we notice some mistakes (name in row 62 and location value in rows 39 and 56) and manually fix them)
airbnb_table.loc[airbnb_table['Location']=='Entire apartment', 'Location'] = 'Viñales'
airbnb_table.loc[airbnb_table['Location']=='Sleeps\xa03, 1\xa0bathroom', 'Location'] = 'Cárdenas'
airbnb_table.loc[airbnb_table['Name']=='Wilber House 4 Hours Free Horse Tour Included', 'Name'] = 'Wilber House'
airbnb_table

Unnamed: 0,Name,Location,Breakfast,Rating,Reviews,Old_Price(USD),Current_Price(USD)
0,Art Studio Habana Vieja 55,Havana,no,8.4,387,80,44
1,Alhabana,Havana,yes,9.6,548,68,61
2,Vapor 156 Boutique Hotel,Havana,no,9.4,299,120,90
3,CasavanaCuba Boutique Hotel,Havana,no,9.4,736,79,79
4,Boutique Casa Italia,Havana,yes,9.8,24,217,195
5,Napper by Rottenberg,Havana,yes,9.6,113,162,105
6,Alahabana Boutique Hotel,Havana,no,9.6,271,133,47
7,La Alameda Boutique Hotel,Havana,yes,10.0,55,131,117
8,Casa 2 Sonrisas,Viñales,no,10.0,134,38,30
9,Casa Mi Tierra,Havana,yes,9.6,224,39,35


In [6]:
# Now we add a province column for future analysis
province = {'Havana': 'Havana', 'Viñales': 'Pinar del Rio', 'Trinidad': 'Sancti Spiritus', 'Camaguey': 'Camaguey',
            'Cárdenas': 'Matanzas', 'Cienfuegos': 'Cienfuegos', 'Holguín': 'Holguín', 'Baracoa': 'Guantanamo'}
airbnb_table['Province'] = airbnb_table['Location'].map(province)
airbnb_table

Unnamed: 0,Name,Location,Breakfast,Rating,Reviews,Old_Price(USD),Current_Price(USD),Province
0,Art Studio Habana Vieja 55,Havana,no,8.4,387,80,44,Havana
1,Alhabana,Havana,yes,9.6,548,68,61,Havana
2,Vapor 156 Boutique Hotel,Havana,no,9.4,299,120,90,Havana
3,CasavanaCuba Boutique Hotel,Havana,no,9.4,736,79,79,Havana
4,Boutique Casa Italia,Havana,yes,9.8,24,217,195,Havana
5,Napper by Rottenberg,Havana,yes,9.6,113,162,105,Havana
6,Alahabana Boutique Hotel,Havana,no,9.6,271,133,47,Havana
7,La Alameda Boutique Hotel,Havana,yes,10.0,55,131,117,Havana
8,Casa 2 Sonrisas,Viñales,no,10.0,134,38,30,Pinar del Rio
9,Casa Mi Tierra,Havana,yes,9.6,224,39,35,Havana


In [7]:
# Now lets see if there are some correlation between the current price and the rest of the columns
# (rating, review, breakfast, location)
from statsmodels.formula.api import ols
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
import statsmodels.api as sm

rating_corr = airbnb_table['Current_Price(USD)'].corr(airbnb_table['Rating'], method='spearman')
reviews_corr = airbnb_table['Current_Price(USD)'].corr(airbnb_table['Reviews'], method='spearman')

# ANOVA for Location
airbnb_table.rename(columns={'Current_Price(USD)': 'Current_Price'}, inplace=True)

model = ols('Current_Price ~ Location', data=airbnb_table).fit()
anova_table = sm.stats.anova_lm(model, typ=1)

# Point Biserial Correlation for breakfast
airbnb_table['Breakfast_encoded'] = airbnb_table['Breakfast'].map({'yes': 1, 'no': 0}) 
correlation, p_value = pearsonr(airbnb_table['Breakfast_encoded'], airbnb_table['Current_Price'])

# ttest for breakfast
yes_values = airbnb_table['Current_Price'][airbnb_table['Breakfast'] == 'yes']
no_values = airbnb_table['Current_Price'][airbnb_table['Breakfast'] == 'no']
t_statistic, p_value_ttest = ttest_ind(yes_values, no_values)

print(f'Rating Correlation: {rating_corr}')
print(f'Reviews Correlation: {reviews_corr}')
print('='*86)
print('ANOVA for Location:')
print(f'F: {anova_table.loc["Location", "F"]}')
print(f'pvalue: {anova_table.loc["Location","PR(>F)"]}')
print('='*86)
print('Breakfast (Point Biserial Correlation):')
print(f'Point Biserial Correlation: {correlation}')
print(f'P-value: {p_value}')
print('='*86)
print('Breakfast (ttest):')
print(f'T-test Statistic: {t_statistic}')
print(f'P-value: {p_value_ttest}')

Rating Correlation: 0.06895577192252746
Reviews Correlation: 0.05213556033998342
ANOVA for Location:
F: 1.5520984939311815
pvalue: 0.15995265280315513
Breakfast (Point Biserial Correlation):
Point Biserial Correlation: 0.35859819668653276
P-value: 0.000267711848530921
Breakfast (ttest):
T-test Statistic: 3.7834096147557235
P-value: 0.00026771184853092674


In [None]:
# As we can see above, there is no strong correlation between the current price and the rest of the columns,
# so we cannot use them to predict a price of a house
# However, we are going to see some graphs on Tableau Public