In [1]:
import pandas as pd

### Airbnb front pages dataset
This dataset contains the information gathered from the rooms at all the search pages.
It contains:
- `roomTitle`: the type of the room.
- `roomPrice`: the price for a night.
- `roomURL`: the url for the room page, used for merging in the next phase.
- `hostType`: the class of the host.

In [2]:
airbnb_frontPage = pd.read_json('airbnb_frontPage.json', encoding='utf-8')
airbnb_frontPage.drop(columns= ["Keyword", "Host", "roomName", "roomRating", "roomReviewcount"], inplace=True)

airbnb_frontPage.head(2)

Unnamed: 0,roomTitle,roomPrice,roomURL,hostType
0,Loft em Campos do Jordão,R$268 por noite,https://www.airbnb.com/rooms/92836566916150808...,Preferido dos hóspedes\nPreferido dos hóspedes
1,Loft em Campos do Jordão,R$99 por noite,https://www.airbnb.com/rooms/53832612?adults=2...,Preferido dos hóspedes\nPreferido dos hóspedes


In [3]:
airbnb_frontPage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   roomTitle  672 non-null    object
 1   roomPrice  672 non-null    object
 2   roomURL    672 non-null    object
 3   hostType   672 non-null    object
dtypes: object(4)
memory usage: 21.1+ KB


### Airbnb room pages
This dataset contains some information scraped from the room's page.
- `Page_URL`: the url for the room page, used for merging in the next phase.
- `Rating`: the rating of the room.
- `Number_of_Reviews`: the number of reviews registered.
- `Amenities`: List of each rooms' amenities.

In [4]:
airbnb_rooms = pd.read_json('airbnb_rooms.json', encoding='utf-8')
airbnb_rooms.drop(columns= ["Title", "Location", "Number_of_Guests", "Number_of_Bedrooms", "Number_of_Beds", "Number_of_Bath", "Price", "Sleeping_Arrangements", "Hosted_by", "Response_Rate", "Image_1", "Image_2", "Image_3", "Current_Time"], inplace=True)
airbnb_rooms.head(2)

Unnamed: 0,Page_URL,Rating,Number_of_Reviews,Amenities
0,https://www.airbnb.com/rooms/11369075570005485...,,,Kitchen\nWifi\nFree parking on premises\nHot t...
1,https://www.airbnb.com/rooms/92836566916150808...,4.89,74.0,


## Merging the DataFrames

In [5]:
merged_df = pd.merge(airbnb_frontPage, airbnb_rooms, left_on='roomURL', right_on='Page_URL')
df = merged_df.drop(columns=["Page_URL"]) # keeping the url for future cleaning
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   roomTitle          669 non-null    object
 1   roomPrice          669 non-null    object
 2   roomURL            669 non-null    object
 3   hostType           669 non-null    object
 4   Rating             669 non-null    object
 5   Number_of_Reviews  669 non-null    object
 6   Amenities          669 non-null    object
dtypes: object(7)
memory usage: 36.7+ KB


## Cleaning and Preprocessing

In [6]:
# Renaming columns for consistency
df.rename(columns={
    'roomTitle' : 'roomType',
    'Rating' : 'rating',
    'Number_of_Reviews' : 'countReviews',
    'Amenities': 'amenities',
}, inplace=True)

In [7]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   roomType      669 non-null    object
 1   roomPrice     669 non-null    object
 2   roomURL       669 non-null    object
 3   hostType      669 non-null    object
 4   rating        669 non-null    object
 5   countReviews  669 non-null    object
 6   amenities     669 non-null    object
dtypes: object(7)
memory usage: 36.7+ KB


In [8]:
# Fixing 'roomType' column
df['roomType'] = df['roomType'].str.split().str[0]  # Selecting the first word
df['roomType'] = df['roomType'].astype('category')  # Transforming in category dtype

df['roomType'].value_counts()  # Checking categories

roomType
Casa           113
Cabana         105
Apartamento    102
Quarto          89
Chalé           83
Loft            45
Microcasa       42
Hotel           21
Pousada         18
Lugar           13
Contêiner       12
Suíte           11
Condomínio      10
Trailer          2
Name: count, dtype: int64

In [9]:
# Fixing 'roomPrice' column
df['roomPrice'] = df['roomPrice'].str.extract('(\d+)').astype(float)  # Keeping just the numerical values
df.head(1)

Unnamed: 0,roomType,roomPrice,roomURL,hostType,rating,countReviews,amenities
0,Loft,268.0,https://www.airbnb.com/rooms/92836566916150808...,Preferido dos hóspedes\nPreferido dos hóspedes,4.89,74,


In [10]:
# Fixing the 'hostType' column
import unicodedata

def normalize_text(text):  # Function to normalize the strings
    text = unicodedata.normalize('NFKD', text)  # Normalize Unicode characters
    text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
    return text.strip()

df['hostType'] = df['hostType'].apply(normalize_text)  # Apply normalization to the hostType column

print(df['hostType'].unique())  # Check unique values to debug

['Preferido dos hospedes\nPreferido dos hospedes' 'Superhost\nSuperhost'
 'De 18 a 20 de set.\n18  20 de set.' '']


In [11]:
# Replace specific problematic strings
df['hostType'] = df['hostType'].replace({
    'Preferido dos hospedes\nPreferido dos hospedes': 'preferido',
    'Superhost\nSuperhost': 'superhost',
    'De 18 a 20 de set.\n18  20 de set.': None,
    '': 'no_class'
}).astype('category')

# Display value counts
print(df['hostType'].value_counts())

hostType
preferido    476
no_class     123
superhost     69
Name: count, dtype: int64


In [12]:
# This code will process the DataFrame, aggregate the amenities, 
# and perform one-hot encoding to prepare the data for model training. 

df['amenities'] = df['amenities'].str.split('\n')

aggregation_map = {
    'wifi': 'WiFi',
    'hd': 'HDTV',
    'tv': 'TV',
    'netflix': 'Streaming Service',
    'prime': 'Streaming Service',
    'roku': 'Streaming Service',
    'disney+': 'Streaming Service',
    'hbo max': 'Streaming Service',
    'streaming': 'Streaming Service',
    'parking': 'Parking',
    'garage': 'Parking',
    'carport': 'Parking',
    'ac': 'Air Conditioning',
    'air conditioning': 'Air Conditioning',
    'pool': 'Pool',
    'hot tub': 'Hot Tub',
    'sauna': 'Sauna',
    'fireplace': 'Fireplace',
    'microwave': 'Microwave',
    'washer': 'Washer',
    'dryer': 'Dryer',
    'refrigerator': 'Refrigerator',
    'smoke alarm': 'Smoke Alarm',
    'carbon monoxide alarm': 'Carbon Monoxide Alarm',
    'bathroom': 'Bathroom',
    'kitchen': 'Kitchen',
    'patio': 'Patio',
    'balcony': 'Balcony',
    'backyard': 'Backyard',
    'view': 'View',
    'security cameras': 'Security Cameras',
    'ev charger': 'EV Charger',
    'breakfast': 'Breakfast',
    'pets allowed': 'Pets Allowed',
    'luggage dropoff allowed': 'Luggage Dropoff Allowed',
    'step-free access': 'Accessible',
    'step-free path': 'Accessible',
    'step-free guest entrance': 'Accessible',
    'crib': 'Crib',
    'high chair': 'High Chair',
    'pack ’n play/travel crib': 'Travel Crib',
}

# Function to aggregate amenities
def aggregate_amenity(amenity):
    for keyword, category in aggregation_map.items():
        if keyword.lower() in amenity.lower():
            return category
    return amenity

# Aggregate the amenities in the DataFrame
df['amenities'] = df['amenities'].apply(lambda amenities: [aggregate_amenity(amenity) for amenity in amenities])

# Flatten the list of amenities and get unique values
unique_amenities = set(amenity for amenities in df['amenities'] for amenity in amenities)

# Create separate columns for each amenity
for amenity in unique_amenities:
    df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)

# Drop the original amenities column
df = df.drop(columns=['amenities'])

In [13]:
# Convert the DataFrame to CSV
df.to_csv('merged_df.csv', encoding='utf-8', sep=';')

## Fine cleaning

In [14]:
# Load DataFrame 'dataset' and drop first column
dataset = pd.read_csv('merged_df.csv', encoding='utf-8', sep=';')
dataset.drop(dataset.columns[0], axis=1, inplace=True)
dataset.drop(dataset.columns[6], axis=1, inplace=True)

dataset.head(3)

Unnamed: 0,roomType,roomPrice,roomURL,hostType,rating,countReviews,Carbon Monoxide Alarm,Lock on bedroom door,Sauna,High Chair,...,Breakfast,Parking,Smoking allowed,Air Conditioning,Dryer,Pets Allowed,Accessible,Waterfront,Fire pit,Washer
0,Loft,268.0,https://www.airbnb.com/rooms/92836566916150808...,preferido,4.89,74.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Loft,99.0,https://www.airbnb.com/rooms/53832612?adults=2...,preferido,4.95,148.0,1,0,0,0,...,0,1,0,1,1,0,0,0,0,0
2,Chalé,329.0,https://www.airbnb.com/rooms/22588205?adults=2...,preferido,4.94,319.0,1,0,0,0,...,0,1,0,1,0,1,0,0,0,0


In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 46 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   roomType                           666 non-null    object 
 1   roomPrice                          668 non-null    float64
 2   roomURL                            669 non-null    object 
 3   hostType                           668 non-null    object 
 4   rating                             637 non-null    float64
 5   countReviews                       637 non-null    float64
 6   Carbon Monoxide Alarm              669 non-null    int64  
 7   Lock on bedroom door               669 non-null    int64  
 8   Sauna                              669 non-null    int64  
 9   High Chair                         669 non-null    int64  
 10  Long term stays allowed            669 non-null    int64  
 11  Crib                               669 non-null    int64  

In [16]:
# Finding registers with null values
nulls = dataset.isnull().sum()
nulls[nulls > 0]

## 'rating' and 'countReviews' columns have too much null values to judt drop
## or to fix by statistical replacements. I'll have to fix them manually...

roomType         3
roomPrice        1
hostType         1
rating          32
countReviews    32
dtype: int64

In [18]:
# Find rows with missing values in any column and extract their indices and URLs
rows_with_any_missing_values = dataset[dataset.isnull().any(axis=1)]

# Extract indices and URLs
indices_and_urls_with_missing_values = rows_with_any_missing_values[['roomURL']].copy()
indices_and_urls_with_missing_values.reset_index(inplace=True)

## Saved the results to a txt file...


In [None]:
corrections = \
{7: {'roomType': 'Chalé',
  'roomPrice': 359,
  'hostType': 'preferido',
  'rating': 4.84,
  'countReviews': 256},
 21: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 31: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 114: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 116: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 129: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 171: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 199: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 201: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 205: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 218: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 233: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 236: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 248: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 258: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 268: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 351: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 353: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 366: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 408: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 436: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 438: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 442: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 455: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 471: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 474: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 486: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 561: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 580: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 582: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 588: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 593: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 608: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 636: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 640: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None},
 642: {'roomType': None,
  'roomPrice': None,
  'hostType': None,
  'rating': None,
  'countReviews': None}}

In [None]:
# Updating the DataFrame using the corrections dictionary
for index, correction in corrections.items():
    for column, value in correction.items():
        if value is not None:
            dataset.at[index, column] = value