# 2. Getting started

## Loading necessary libraries

In [8]:
import pandas as pd

## Loading JSON dataset and dropping some columns

In [13]:
airbnb = pd.read_json("C:\Github\Data_Science_Portfolio\Airbnb-Price-Prediction\octoparse_airbnb_v1.json")
airbnb = airbnb.drop(columns= ["Keyword", "Host", "roomName", "roomRating", "roomReviewcount"])

airbnb.head(2)

Unnamed: 0,roomTitle,roomPrice,roomURL,hostType
0,Loft em Campos do Jordão,R$268 por noite,https://www.airbnb.com/rooms/92836566916150808...,Preferido dos hóspedes\nPreferido dos hóspedes
1,Loft em Campos do Jordão,R$99 por noite,https://www.airbnb.com/rooms/53832612?adults=2...,Preferido dos hóspedes\nPreferido dos hóspedes


In [5]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   roomTitle        672 non-null    object
 1   roomRating       672 non-null    object
 2   roomReviewcount  672 non-null    object
 3   roomPrice        672 non-null    object
 4   roomURL          672 non-null    object
 5   hostType         672 non-null    object
dtypes: object(6)
memory usage: 31.6+ KB


In [14]:
airbnb2 = pd.read_json('C:\\Github\\Data_Science_Portfolio\\Airbnb-Price-Prediction\\octoparse_airbnb_v2.json')
airbnb2 = airbnb2.drop(columns= ["Title", "Location", "Number_of_Guests", "Number_of_Bedrooms", "Number_of_Beds", "Number_of_Bath", "Price", "Sleeping_Arrangements", "Hosted_by", "Response_Rate", "Image_1", "Image_2", "Image_3", "Current_Time"])
airbnb2.head(2)

Unnamed: 0,Page_URL,Rating,Number_of_Reviews,Amenities
0,https://www.airbnb.com/rooms/11369075570005485...,,,Kitchen\nWifi\nFree parking on premises\nHot t...
1,https://www.airbnb.com/rooms/92836566916150808...,4.89,74.0,


# 3. Joining DataFrames

In [19]:
merged_df = pd.merge(airbnb, airbnb2, left_on='roomURL', right_on='Page_URL')
df = merged_df.drop(columns=["roomURL", "Page_URL"])
print(merged_df.info())
print(merged_df.head(2))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   roomTitle          669 non-null    object
 1   roomPrice          669 non-null    object
 2   roomURL            669 non-null    object
 3   hostType           669 non-null    object
 4   Page_URL           669 non-null    object
 5   Rating             669 non-null    object
 6   Number_of_Reviews  669 non-null    object
 7   Amenities          669 non-null    object
dtypes: object(8)
memory usage: 41.9+ KB
None
                  roomTitle        roomPrice  \
0  Loft em Campos do Jordão  R$268 por noite   
1  Loft em Campos do Jordão   R$99 por noite   

                                             roomURL  \
0  https://www.airbnb.com/rooms/92836566916150808...   
1  https://www.airbnb.com/rooms/53832612?adults=2...   

                                         hostType  \
0  Preferido 

# 4. Cleaning

In [23]:
# Renaming columns for consistency
df.rename(columns={
    'roomTitle' : 'roomType',
    'Rating' : 'rating',
    'Number_of_Reviews' : 'numberReviews',
    'Amenities': 'amenities',
}, inplace=True)

In [26]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)

In [27]:
# Find rows with missing values
missing = df[df.isnull().any(axis=1)]

# Display rows with missing values
print(missing)

Empty DataFrame
Columns: [roomType, roomPrice, hostType, rating, numberReviews, amenities]
Index: []


In [28]:
df.head()

Unnamed: 0,roomType,roomPrice,hostType,rating,numberReviews,amenities
0,Loft em Campos do Jordão,R$268 por noite,Preferido dos hóspedes\nPreferido dos hóspedes,4.89,74,
1,Loft em Campos do Jordão,R$99 por noite,Preferido dos hóspedes\nPreferido dos hóspedes,4.95,148,Kitchen\nFast wifi – 354 Mbps\nDedicated works...
2,Chalé em Jd do Pai,R$329 por noite,Preferido dos hóspedes\nPreferido dos hóspedes,4.94,319,Courtyard view\nGarden view\nKitchen\nWifi\nDe...
3,Chalé em Campos do Jordão,R$595 por noite,Superhost\nSuperhost,5.0,3,
4,Quarto de hotel em V Inglesa,R$276 por noite,Preferido dos hóspedes\nPreferido dos hóspedes,4.93,507,Wifi\nFree parking on premises\nPets allowed\n...


In [34]:
df['roomType'] = df['roomType'].str.split().str[0]

df['roomType'] = df['roomType'].astype('category')

df['roomType'].value_counts()

roomType
Casa           80
Apartamento    65
Cabana         63
Quarto         57
Chalé          49
Loft           33
Microcasa      28
Hotel          12
Lugar          10
Contêiner       8
Pousada         7
Condomínio      6
Suíte           6
Trailer         2
Name: count, dtype: int64

In [35]:
df['roomPrice'] = df['roomPrice'].str.extract('(\d+)').astype(float)
df.head()

Unnamed: 0,roomType,roomPrice,hostType,rating,numberReviews,amenities
0,Loft,268.0,Preferido dos hóspedes\nPreferido dos hóspedes,4.89,74,
1,Loft,99.0,Preferido dos hóspedes\nPreferido dos hóspedes,4.95,148,Kitchen\nFast wifi – 354 Mbps\nDedicated works...
2,Chalé,329.0,Preferido dos hóspedes\nPreferido dos hóspedes,4.94,319,Courtyard view\nGarden view\nKitchen\nWifi\nDe...
3,Chalé,595.0,Superhost\nSuperhost,5.0,3,
4,Quarto,276.0,Preferido dos hóspedes\nPreferido dos hóspedes,4.93,507,Wifi\nFree parking on premises\nPets allowed\n...


In [44]:
df['hostType'] = df['hostType'].replace({
    'Preferido dos\xa0hóspedes\nPreferido dos\xa0hóspedes' : 'preferido',
    'Superhost\nSuperhost' : 'superhost',
    'De 18 a 20 de set.\n18 – 20 de set.' : 'no_class',
    '' : 'no_class'
}).astype('category')

df['hostType'].value_counts()

hostType
preferido    309
no_class      75
superhost     45
Name: count, dtype: int64

In [48]:
print(df['rating'].isna().value_counts())

rating
False    429
Name: count, dtype: int64


In [49]:
print(df['numberReviews'].isna().value_counts())

numberReviews
False    429
Name: count, dtype: int64


In [52]:
df['amenities'] = df['amenities'].astype('category')
df['amenities'].unique().value_counts()

                                                                                                                                                                                                                                                                                        1
Bedroom\nStep-free access\nBedroom 2\nStep-free access\nFull bathroom\nStep-free access\nFull bathroom 2\nStep-free access\nHalf bathroom\nStep-free access\nFull kitchen\nStep-free access\nLaundry room\nStep-free access                                                             1
City skyline view\nCourtyard view\nKitchen\nWifi\nDedicated workspace\nFree parking on premises\nPets allowed\nTV\nShared backyard\nUnavailable: Carbon monoxide alarm\nCarbon monoxide alarm                                                                                           1
City skyline view\nCourtyard view\nKitchen\nWifi\nDedicated workspace\nFree parking on premises\nShared indoor pool - available all year, open specific ho

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 429 entries, 0 to 668
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   roomType       426 non-null    category
 1   roomPrice      428 non-null    float64 
 2   hostType       429 non-null    category
 3   rating         429 non-null    object  
 4   numberReviews  429 non-null    object  
 5   amenities      429 non-null    category
dtypes: category(3), float64(1), object(2)
memory usage: 41.9+ KB
