# Authors: Juan Andrés Méndez Galvis

In [86]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn as sk
# Import linear regression from scikit-learn
from sklearn.linear_model import LinearRegression
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Import metrics from scikit-learn
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# Import standard scaler
from sklearn.preprocessing import StandardScaler


# 3. Desarrollo de modelos analíticos

## Entendimiento de los datos

In [87]:
# Read the listings CSV from the working directory into a DataFrame,
# then preview its first five rows as the cell output.
df_airbnb_listings = pd.read_csv(filepath_or_buffer='listings.csv')
df_airbnb_listings.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,528022,https://www.airbnb.com/rooms/528022,20221205143541,2022-12-05,city scrape,Somewhere Else,Highlights<br />*Located only 5 minutes from C...,,https://a0.muscache.com/pictures/bf6fa79c-5863...,2594559,...,4.9,4.78,4.72,0363 9289 A94D 5C21 A579,f,1,0,1,0,3.3
1,2818,https://www.airbnb.com/rooms/2818,20221205143541,2022-12-05,city scrape,Quiet Garden View Room & Super Fast Wi-Fi,Quiet Garden View Room & Super Fast Wi-Fi<br /...,"Indische Buurt (""Indies Neighborhood"") is a ne...",https://a0.muscache.com/pictures/10272854/8dcc...,3159,...,4.98,4.69,4.81,0363 5F3A 5684 6750 D14D,f,1,0,1,0,1.88
2,20168,https://www.airbnb.com/rooms/20168,20221205143541,2022-12-05,previous scrape,Studio with private bathroom in the centre 1,17th century Dutch townhouse in the heart of t...,Located just in between famous central canals....,https://a0.muscache.com/pictures/69979628/fd6a...,59484,...,4.62,4.87,4.49,0363 CBB3 2C10 0C2A 1E29,t,2,0,2,0,2.18
3,27886,https://www.airbnb.com/rooms/27886,20221205143541,2022-12-05,city scrape,"Romantic, stylish B&B houseboat in canal district",Stylish and romantic houseboat on fantastic hi...,"Central, quiet, safe, clean and beautiful.",https://a0.muscache.com/pictures/02c2da9d-660e...,97647,...,4.92,4.89,4.79,0363 974D 4986 7411 88D8,t,1,0,1,0,1.83
4,28871,https://www.airbnb.com/rooms/28871,20221205143541,2022-12-05,city scrape,Comfortable double room,<b>The space</b><br />In a monumental house ri...,"Flower market , Leidseplein , Rembrantsplein",https://a0.muscache.com/pictures/160889/362340...,124245,...,4.94,4.97,4.83,0363 607B EA74 0BD8 2F6F,f,2,0,2,0,3.03


In [88]:
# Show every column label of the listings table.
column_labels = df_airbnb_listings.columns
print(column_labels)

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

# Tipos de columnas y categorías

- Numeric:

        - id, scrape_id, host_id, accommodates, bathrooms, bedrooms, beds, price, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, availability_30, availability_60, availability_90, availability_365, number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_private_rooms, calculated_host_listings_count_shared_rooms
- Categorical with levels:

        - host_response_time, neighbourhood, neighbourhood_cleansed, neighbourhood_group_cleansed, property_type, room_type, bathrooms_text, bed_type, calendar_updated
- Categorical:

        - available, host_is_superhost, host_has_profile_pic, host_identity_verified, has_availability, instant_bookable
- Text:

        - listing_url, source, name, description, neighborhood_overview, picture_url, host_url, host_name, host_since, host_location, host_about, host_thumbnail_url, host_picture_url, host_neighbourhood, host_verifications, amenities, license
- Date:

        - last_scraped, first_review, last_review, calendar_last_scraped

In [89]:
# Report the table size as (rows, columns), then display summary statistics.
n_rows, n_cols = df_airbnb_listings.shape
print((n_rows, n_cols))
df_airbnb_listings.describe()

(6809, 75)


Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,6809.0,6809.0,6809.0,6809.0,6809.0,0.0,6809.0,6809.0,6809.0,0.0,...,6157.0,6157.0,6157.0,6157.0,6157.0,6809.0,6809.0,6809.0,6809.0,6161.0
mean,1.543801e+17,20221210000000.0,97577240.0,2.801146,4.819944,,52.366847,4.890363,2.918784,,...,4.760789,4.880387,4.890471,4.790138,4.650797,1.853576,0.975621,0.786606,0.040828,1.226475
std,2.861175e+17,0.0,131518200.0,18.066322,29.766061,,0.017026,0.03555,1.401175,,...,0.316494,0.216868,0.221001,0.247403,0.30507,2.56115,1.292887,2.168739,0.440437,2.335964
min,2818.0,20221210000000.0,3159.0,1.0,1.0,,52.29034,4.75571,0.0,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.01
25%,15109770.0,20221210000000.0,9287585.0,1.0,1.0,,52.35604,4.86689,2.0,,...,4.67,4.85,4.87,4.67,4.51,1.0,0.0,0.0,0.0,0.28
50%,33730180.0,20221210000000.0,33214580.0,1.0,1.0,,52.36619,4.88821,2.0,,...,4.86,4.95,4.97,4.86,4.7,1.0,1.0,0.0,0.0,0.61
75%,53137910.0,20221210000000.0,133682300.0,2.0,3.0,,52.37663,4.90847,4.0,,...,5.0,5.0,5.0,5.0,4.83,1.0,1.0,1.0,0.0,1.41
max,7.743397e+17,20221210000000.0,490155700.0,799.0,799.0,,52.42512,5.02643,16.0,,...,5.0,5.0,5.0,5.0,5.0,22.0,14.0,20.0,8.0,106.74


In [90]:
# Collect the labels of the columns whose dtype is numeric.
numeric_frame = df_airbnb_listings.select_dtypes(include='number')
numerical_columns = numeric_frame.columns
print(numerical_columns)

Index(['id', 'scrape_id', 'host_id', 'host_listings_count',
       'host_total_listings_count', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_

In [91]:

# Identifier, URL and free-text columns that are not used in the analysis.
columns_to_drop = [
    'id', 'source', 'name', 'description', 'picture_url', 'host_url',
    'host_name', 'host_about', 'neighbourhood', 'host_location',
    'neighborhood_overview', 'listing_url', 'host_thumbnail_url',
    'neighbourhood_cleansed', 'host_picture_url', 'host_neighbourhood',
]

# errors='ignore' keeps this cell re-runnable: it reassigns df_airbnb_listings,
# so on a second run the columns are already gone and a plain drop() would
# raise KeyError.
df_airbnb_listings = df_airbnb_listings.drop(columns=columns_to_drop, errors='ignore')

# Remaining columns stored with the generic object dtype.
categorical_columns = df_airbnb_listings.select_dtypes(include='object').columns

# Print each object column's name followed by the frequency of its values.
for col in categorical_columns:
    print(col)
    print(df_airbnb_listings[col].value_counts())

last_scraped
2022-12-05    6808
2022-12-17       1
Name: last_scraped, dtype: int64
host_since
2014-04-22    25
2018-07-20    23
2019-01-17    15
2020-08-25    15
2013-04-06    15
              ..
2012-12-03     1
2015-12-12     1
2012-04-25     1
2016-12-13     1
2012-11-23     1
Name: host_since, Length: 2666, dtype: int64
host_response_time
within an hour        2655
within a day          1020
within a few hours     989
a few days or more      91
Name: host_response_time, dtype: int64
host_response_rate
100%    3653
90%      183
80%      115
96%       68
50%       65
0%        59
67%       54
75%       51
97%       50
86%       43
93%       40
70%       36
98%       34
83%       28
99%       27
88%       24
89%       23
78%       20
92%       19
84%       18
60%       18
95%       17
94%       17
91%       12
63%       10
33%        9
71%        8
81%        6
87%        4
57%        4
20%        4
40%        4
25%        3
30%        3
82%        3
77%        3
56%        3
43%    

# Mapa de calor de correlaciones

Este mapa de calor nos permite ver las correlaciones entre las variables numéricas del dataset. Las variables con mayor correlación se enumeran después de la figura.

In [None]:
# Plot the correlation matrix of the numeric columns as an annotated heatmap.
# Only numeric columns are passed to corr(): the frame still contains object
# columns, which corr() cannot correlate. Columns with no values at all are
# dropped because they would only add empty rows/columns to the figure.
numeric_listings = (
    df_airbnb_listings
    .select_dtypes(include=np.number)
    .dropna(axis=1, how='all')
)
plt.figure(figsize=(10,10))
sns.heatmap(numeric_listings.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()


En este análisis se puede ver que las variables que tienen una correlación más alta son:

- availability_30 y availability_60
- availability_60 y availability_90
- availability_90 y availability_365
- availability_365 y number_of_reviews
- price y accommodates
- price y bedrooms
- price y beds
- rating y number_of_reviews
- rating y number_of_reviews_ltm
- popularity y number_of_reviews
- utilization_index y availability_365


# Explicación de selección de modelo

> We chose the LinearRegression model because it is the simplest model to use and a widely used one for this type of data. We also chose the StandardScaler because it is a widely used scaler for this type of data.

# Create a train and test set

In [None]:
# Build the modelling data, split it, scale it and fit a linear regression.

# Target: 'price' is not among the numeric columns of this table, so it is
# parsed here. Stripping '$' and ',' assumes currency-formatted text such as
# "$1,200.00" (TODO confirm against the raw data); values that still cannot be
# parsed become NaN.
price = pd.to_numeric(
    df_airbnb_listings['price'].astype(str).str.replace(r'[$,]', '', regex=True),
    errors='coerce',
)

# Features: numeric columns only, because StandardScaler and LinearRegression
# cannot consume the remaining text/date columns. Columns without a single
# value carry no information and cannot be imputed, so they are removed.
features = (
    df_airbnb_listings
    .select_dtypes(include=np.number)
    .drop(columns=['price'], errors='ignore')
    .dropna(axis=1, how='all')
)

# Keep only the rows that have a usable target value.
has_price = price.notna()

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(
    features[has_price], price[has_price], test_size=0.2, random_state=42
)

# Fill missing feature values with medians learned from the training split
# only, so no information from the test split leaks into the model.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Create the scaler object (standardizes each feature using statistics
# learned from the data it is fitted on; it does not rescale to a 0-1 range)
scaler = StandardScaler()

# Fit on the training data
scaler.fit(X_train)

# Transform both the training and test data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Create the linear regression model with scikit-learn
reg = LinearRegression()

# Fit the model to the training data
reg.fit(X_train, y_train)

# Metrics for the model

En este caso se utilizaron tres métricas para medir la calidad del modelo: el error cuadrático medio (MSE), el error absoluto medio (MAE) y el coeficiente de determinación (R²).

In [None]:
# Evaluate the fitted regression on the held-out test split.
y_pred = reg.predict(X_test)

# Report the model's score on the test split, then the root of the mean
# squared error between the true and predicted targets.
r_squared = reg.score(X_test, y_test)
print(f"R^2: {r_squared}")
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")

# 5-fold cross-validation on the training split; the per-fold scores are kept
# in cv_scores.
cv_scores = cross_val_score(reg, X_train, y_train, cv=5)
