# 2. Getting started

## Loading necessary libraries

In [None]:
import re

import pandas as pd

## Loading JSON dataset and dropping some columns

In [None]:
# Load the first scraped dataset (octoparse_airbnb_v1.json).
# The path is a raw string so the Windows backslashes are taken literally: in a
# normal string literal, sequences such as "\G" or "\D" are invalid escapes and
# trigger a SyntaxWarning on recent Python versions. The resulting path is
# character-for-character the same as before.
airbnb = pd.read_json(
    r"C:\Github\Data_Science_Portfolio\Airbnb-Price-Prediction\octoparse_airbnb_v1.json",
    encoding='latin-1',
)
# Discard the columns listed here before the frame is joined later on.
airbnb = airbnb.drop(columns=["Keyword", "Host", "roomName", "roomRating", "roomReviewcount"])

airbnb.head(2)

In [None]:
# Print a summary of the first frame: column names, non-null counts and dtypes.
airbnb.info()

In [None]:
# Second scraped dataset (octoparse_airbnb_v2.json): read it and, in the same
# expression, discard the columns listed below.
airbnb2 = (
    pd.read_json(
        'C:\\Github\\Data_Science_Portfolio\\Airbnb-Price-Prediction\\octoparse_airbnb_v2.json',
        encoding='latin-1',
    )
    .drop(columns=[
        "Title",
        "Location",
        "Number_of_Guests",
        "Number_of_Bedrooms",
        "Number_of_Beds",
        "Number_of_Bath",
        "Price",
        "Sleeping_Arrangements",
        "Hosted_by",
        "Response_Rate",
        "Image_1",
        "Image_2",
        "Image_3",
        "Current_Time",
    ])
)
airbnb2.head(2)

# 3. Joining DataFrames

In [None]:
# Join the two scrapes on the listing URL. pd.merge defaults to an inner join,
# so only listings present in both frames are kept.
merged_df = pd.merge(airbnb, airbnb2, left_on='roomURL', right_on='Page_URL')

# The URL columns were only needed as join keys; the working frame drops them.
df = merged_df.drop(columns=["roomURL", "Page_URL"])

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() echoed a stray "None" line.
merged_df.info()
print(merged_df.head(2))


# 4. Cleaning

In [None]:
# Rename the columns below so the frame uses one consistent naming scheme.
# The frame is modified in place.
df.rename(
    {
        'roomTitle': 'roomType',
        'Rating': 'rating',
        'Number_of_Reviews': 'numberReviews',
        'Amenities': 'amenities',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Drop duplicate rows.
# With no subset given, rows are compared across all columns and the first
# occurrence is kept; inplace=True mutates df directly instead of returning a copy.
df.drop_duplicates(inplace=True)

In [None]:
# Keep every row in which at least one column holds a null value.
missing = df.loc[df.isna().any(axis='columns')]

# Show the incomplete rows
print(missing)

In [None]:
# Preview the first five rows of the working frame.
df.head()

In [None]:
# Reduce each room title to its first whitespace-delimited word and store the
# result as a categorical column.
df['roomType'] = (
    df['roomType']
    .str.split()
    .str.get(0)
    .astype('category')
)

# Frequency of each resulting room type
df['roomType'].value_counts()

In [None]:
# Pull the first run of digits out of the price text and store it as a float.
# The pattern is a raw string: '\d' inside a normal string literal is an invalid
# escape sequence and raises a SyntaxWarning on recent Python versions.
# expand=False makes extract() return a Series rather than a one-column frame.
# NOTE(review): only the first digit run is captured, so a price written with a
# thousands separator (e.g. "1.250") would be read as 1 — confirm the scraped format.
df['roomPrice'] = df['roomPrice'].str.extract(r'(\d+)', expand=False).astype(float)
df.head()

In [None]:
# Collapse the raw scraped host-badge strings into short labels. Values that
# are not keys of this mapping are left unchanged by replace().
host_type_labels = {
    'Preferido dos\xa0hóspedes\nPreferido dos\xa0hóspedes' : 'preferido',
    'Superhost\nSuperhost' : 'superhost',
    'De 18 a 20 de set.\n18 – 20 de set.' : 'no_class',
    '' : 'no_class'
}
df['hostType'] = df['hostType'].replace(to_replace=host_type_labels)
df['hostType'] = df['hostType'].astype('category')

# Frequency of each host label
df['hostType'].value_counts()

In [None]:
# Count how many ratings are missing (True) versus present (False).
rating_is_missing = df['rating'].isna()
print(rating_is_missing.value_counts())

In [None]:
# Count how many review counts are missing (True) versus present (False).
reviews_is_missing = df['numberReviews'].isna()
print(reviews_is_missing.value_counts())

In [None]:
# Amenities preparation for model training: the steps in this cell split the
# amenities text, aggregate similar entries and one-hot encode the result.
# First step: break each amenities string on newlines into a list of entries.


df['amenities'] = df['amenities'].str.split(pat='\n')

# Keyword -> aggregated category.
# Keywords are tried in insertion order and the first whole-word match wins, so
# a longer phrase must be listed before any shorter keyword it contains
# (the travel-crib phrase therefore comes before 'crib').
aggregation_map = {
    'wifi': 'WiFi',
    'hd': 'HDTV',
    'hdtv': 'HDTV',  # compound spelling; 'hd' on its own only matches the standalone word
    'tv': 'TV',
    'netflix': 'Streaming Service',
    'prime': 'Streaming Service',
    'roku': 'Streaming Service',
    'disney+': 'Streaming Service',
    'hbo max': 'Streaming Service',
    'streaming': 'Streaming Service',
    'parking': 'Parking',
    'garage': 'Parking',
    'carport': 'Parking',
    'ac': 'Air Conditioning',
    'air conditioning': 'Air Conditioning',
    'pool': 'Pool',
    'hot tub': 'Hot Tub',
    'sauna': 'Sauna',
    'fireplace': 'Fireplace',
    'microwave': 'Microwave',
    'washer': 'Washer',
    'dryer': 'Dryer',
    'refrigerator': 'Refrigerator',
    'smoke alarm': 'Smoke Alarm',
    'carbon monoxide alarm': 'Carbon Monoxide Alarm',
    'bathroom': 'Bathroom',
    'kitchen': 'Kitchen',
    'patio': 'Patio',
    'balcony': 'Balcony',
    'backyard': 'Backyard',
    'view': 'View',
    'security cameras': 'Security Cameras',
    'ev charger': 'EV Charger',
    'breakfast': 'Breakfast',
    'pets allowed': 'Pets Allowed',
    'luggage dropoff allowed': 'Luggage Dropoff Allowed',
    'step-free access': 'Accessible',
    'step-free path': 'Accessible',
    'step-free guest entrance': 'Accessible',
    'pack ’n play/travel crib': 'Travel Crib',
    'crib': 'Crib',
    'high chair': 'High Chair',
}


# Function to aggregate amenities
def aggregate_amenity(amenity):
    """Map one raw amenity string to its aggregated category.

    Each keyword of ``aggregation_map`` is searched for, case-insensitively, as
    a whole word or phrase (not preceded or followed by a word character). The
    category of the first keyword that matches is returned.

    Whole-word matching replaces the earlier plain substring test, under which
    the two-letter keyword 'ac' matched inside words such as "fireplace",
    "backyard", "access" or "workspace" and labelled them 'Air Conditioning'.
    As a consequence, compound words are no longer matched by a keyword they
    merely contain (e.g. "Dishwasher" is not mapped to 'Washer').

    Parameters
    ----------
    amenity : str
        Raw amenity text for a single entry.

    Returns
    -------
    str
        The aggregated category, or ``amenity`` unchanged when no keyword
        matches. Non-string input is returned unchanged.
    """
    if not isinstance(amenity, str):
        return amenity
    for keyword, category in aggregation_map.items():
        # Lookarounds are used instead of \b so that keywords ending in a
        # non-word character (such as 'disney+') still match at end of text.
        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
        if re.search(pattern, amenity, flags=re.IGNORECASE):
            return category
    return amenity

# Aggregate the amenities in the DataFrame.
# A listing whose amenities value is not a list (e.g. NaN left by the split of
# a missing value) is treated as having no amenities instead of raising
# "TypeError: 'float' object is not iterable".
df['amenities'] = df['amenities'].apply(
    lambda amenities: [aggregate_amenity(amenity) for amenity in amenities]
    if isinstance(amenities, list)
    else []
)

# Distinct amenity labels across all listings, sorted so that the order of the
# generated columns is the same on every run (set iteration order for strings
# can change between interpreter sessions).
unique_amenities = sorted({amenity for amenities in df['amenities'] for amenity in amenities})

# Build every 0/1 indicator column first and attach them with a single concat;
# inserting the columns one at a time fragments the frame and makes pandas emit
# a PerformanceWarning once there are many of them.
amenity_flags = pd.DataFrame(
    {
        amenity: [int(amenity in amenities) for amenities in df['amenities']]
        for amenity in unique_amenities
    },
    index=df.index,
)

# Replace the original amenities column with the indicator columns
df = pd.concat([df.drop(columns=['amenities']), amenity_flags], axis=1)

# Show a preview rather than dumping the whole frame
df.head()

In [None]:
# Convert the DataFrame to a JSON Lines string (one record per line).
# force_ascii=True makes pandas write every non-ASCII character as a \uXXXX
# escape, so the text is pure ASCII and can always be encoded as latin-1.
# With force_ascii=False the raw characters were written, and any character
# outside Latin-1 (such as the ’ that appears in the amenity keywords) would
# raise UnicodeEncodeError in file.write(). The decoded JSON content is the same.
json_str = df.to_json(orient='records', lines=True, force_ascii=True)

# Save the JSON string to a file with latin-1 encoding
with open('final_df.json', 'w', encoding='latin-1') as file:
    file.write(json_str)

print("The DataFrame has been saved to 'final_df.json' with latin-1 encoding.")