# Exploratory Data Analysis
This dataset was scraped from Airbnb with Octoparse in a two-step process, then cleaned, merged and processed. The full pipeline is documented in the `preprocessing.ipynb` notebook in this repository.

Our objective is to use it to analyse and understand the Airbnb market in Campos do Jordão, a tourist town in the state of São Paulo, Brazil.

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the listings dataset from disk into a DataFrame
DATASET_PATH = 'datasets/final_dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Overview of columns, dtypes and non-null counts
df.info()

## Descriptive Analysis

In [None]:
# Split the columns of interest into a numerical view and a categorical view
numerical_columns = ['roomPrice', 'rating', 'countReviews']
categorical_columns = ['roomType', 'hostType', 'is_new']

num_df = df.loc[:, numerical_columns]
# Cast every selected column to pandas' `category` dtype
cat_df = df.loc[:, categorical_columns].astype('category')

In [None]:
# Summary statistics of the numerical columns.
# The quartiles are requested explicitly (they are also the pandas default),
# which yields the '25%', '50%' and '75%' rows of the result.
num_descriptive = num_df.describe(percentiles=[0.25, 0.5, 0.75])

num_descriptive

### Price

In [None]:
# Boxplot of listing prices, drawn on an explicit figure/axes pair
sns.set_theme(context='notebook', style='whitegrid')

fig, ax = plt.subplots(figsize=(14, 4))
sns.boxplot(x=df['roomPrice'], ax=ax)

ax.set_title('Boxplot of Room Prices')
ax.set_xlabel('Price (R$)')
# Grid lines only along the price axis
ax.grid(True, axis='x')

plt.show()

In [None]:
# Histogram of listing prices (30 bins) with a KDE curve overlaid
sns.set_theme(context='notebook', style='whitegrid')

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['roomPrice'], bins=30, kde=True, ax=ax)

ax.set(
    title='Distribution of Room Prices',
    xlabel='Price (R$)',
    ylabel='Frequency',
)
# Grid lines only along the count axis
ax.grid(True, axis='y')

plt.show()

In [None]:
# Quartile cut-offs used to split listings into 3 price groups,
# read from the descriptive-statistics table computed earlier
price_p25 = num_descriptive.loc['25%', 'roomPrice']
price_p75 = num_descriptive.loc['75%', 'roomPrice']

def categorize_price(price, low_threshold=None, high_threshold=None):
    """Assign a price to one of three price groups.

    Parameters
    ----------
    price : float
        Price of a single listing (one value of ``roomPrice``).
    low_threshold : float, optional
        Inclusive upper bound of the low group. When omitted, the
        notebook-level ``price_p25`` is used.
    high_threshold : float, optional
        Inclusive upper bound of the medium group. When omitted, the
        notebook-level ``price_p75`` is used.

    Returns
    -------
    str or float
        ``'Low price group'``, ``'Medium price group'`` or
        ``'High price group'``; NaN when ``price`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # price would fall through to the last branch and be labelled "High".
    if pd.isna(price):
        return np.nan

    # Fall back to the quartiles computed in the notebook when no explicit
    # thresholds are supplied (this is the path taken by `.apply(categorize_price)`).
    if low_threshold is None:
        low_threshold = price_p25
    if high_threshold is None:
        high_threshold = price_p75

    if price <= low_threshold:
        return 'Low price group'
    if price <= high_threshold:
        return 'Medium price group'
    return 'High price group'

# Label every listing with its price group (element-wise over `roomPrice`)
df['priceCategory'] = df['roomPrice'].map(categorize_price)

# Number of listings that fall into each group
price_category_counts = df['priceCategory'].value_counts()
print(price_category_counts)

In [None]:
# Summary statistics of price, rating and review count within each price group
grouped_description = df.groupby('priceCategory')[['roomPrice', 'rating', 'countReviews']].describe()

# Grouping by a single key yields a flat row index (one entry per price group);
# only a MultiIndex exposes `.levels`, so iterate over the index itself.
for group in grouped_description.index:
    print(f"Descrição para {group}:")
    print(grouped_description.loc[group])
    print("\n" + "="*80 + "\n")

### Room types

In [None]:
# Relative frequency of each room type (fractions instead of raw counts)
room_types = (
    df['roomType']
    .value_counts(normalize=True)
)
room_types