In [None]:
#importando librerias
import pandas as pd
import numpy as np
import seaborn as sns
from Database import Airbnbs, Hosts, Airbnb_Details, Neighbourhoods, creating_engine, creating_session, closing_session

## Proceso de Exploración

### Leemos el archivo como un dataframe usando pandas

In [None]:
# Load the raw Airbnb listings CSV; empty strings are read as missing values.
df_airbnb = pd.read_csv(
    "Airbnb_Open_Data.csv",
    na_values=[""],
)
# Preview the first two rows.
df_airbnb.head(n=2)

### ¿Qué tipo de dato son las variables del conjunto de datos?

In [None]:
# Show the dtype pandas assigned to each column of the raw frame.
df_airbnb.dtypes

### ¿Cuántas variables de cada tipo de dato tenemos en el conjunto de datos?

In [None]:
# Count how many columns share each dtype.
df_airbnb.dtypes.value_counts()

### ¿Cuáles son las dimensiones del Dataset?

In [None]:
# (number of rows, number of columns) of the raw frame.
df_airbnb.shape

### ¿Existen valores nulos explícitos en el conjunto de datos?

In [None]:
# Per column: True when at least one value is missing.
df_airbnb.isna().any()

### De tener observaciones con valores nulos, ¿cuántas tenemos por cada variable?

In [None]:
# Number of missing values per column, largest count first.
df_airbnb.isna().sum().sort_values(ascending=False)


### ¿Cuál es la proporción de valores nulos por cada variable? 

In [None]:
# Melt the boolean null mask to long form (columns "variable" and "missing"),
# then draw one filled bar per variable showing its missing / present share.
sns.displot(
    data=df_airbnb.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=2,
)

We can clearly see that it makes no sense to keep the "license" column, so we delete it. It is also known that all the data comes from the USA, so the "country" and "country code" columns do not provide any value either.

In [None]:
# Remove the three columns judged uninformative, then list what remains.
df_airbnb.drop(
    columns=["license", "country", "country code"],
    inplace=True,
)
print(df_airbnb.columns)

### ¿Cuántos valores nulos tenemos en total en el conjunto de datos?

In [None]:
# Total number of missing cells: per-column counts summed across all columns.
df_airbnb.isna().sum().sum()

## Proceso de Limpieza 

### Creación de las dimensiones "Hosts" y "Neighbourhoods"

In [None]:
# Build "host_table", one of the Data Warehouse dimensions.
# .copy() makes it an independent frame, so the later in-place edits
# (null filling, column renaming) do not act on a slice of df_airbnb and
# do not raise SettingWithCopyWarning.
# NOTE(review): rows are not de-duplicated by "host id" here — confirm whether
# the load step expects one row per host.
host_table = df_airbnb[["host id", "host name", "host_identity_verified"]].copy()
host_table.head(2)

In [None]:
# "host_identity_verified" contains nulls; fill them with "unverified".
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selected from the frame is chained assignment and is not guaranteed
# to modify host_table (it never does under pandas copy-on-write).
# NOTE(review): a later output of this table shows the value "unconfirmed" —
# confirm that "unverified" is the intended label for missing values.
host_table = host_table.assign(
    host_identity_verified=host_table["host_identity_verified"].fillna("unverified")
)
host_table

In [None]:
# Build "neighbourhood_table", another Data Warehouse dimension.
# rename() without inplace returns a new frame, so the column is renamed on
# an independent object instead of on a slice of df_airbnb (which raises
# SettingWithCopyWarning).
neighbourhood_table = (
    df_airbnb[["neighbourhood group", "neighbourhood", "lat", "long"]]
    .rename(columns={"neighbourhood group": "neighbourhood_group"})
)
neighbourhood_table.head(2)

### Proceso de limpieza a la dimensión de Neighbourhoods

#### Agrupamos con el fin de no tener datos redundantes, calculando además promedio de latitud y longitud

In [231]:
# Final column layout of the dimension: surrogate key first.
column_order = ["neighbourhood_id", "neighbourhood_group", "neighbourhood", "lat", "long"]

# One row per (group, neighbourhood) pair with the mean coordinates, plus a
# sequential 1-based id, laid out in the order defined above.
neighbourhood_table = (
    neighbourhood_table
    .groupby(["neighbourhood_group", "neighbourhood"], as_index=False)
    .agg(lat=("lat", "mean"), long=("long", "mean"))
    .assign(neighbourhood_id=lambda table: range(1, len(table) + 1))
    .loc[:, column_order]
)

neighbourhood_table.head(2)

Unnamed: 0,neighbourhood_id,neighbourhood_group,neighbourhood,lat,long
0,1,Bronx,Allerton,40.86473,-73.859631
1,2,Bronx,Baychester,40.873964,-73.84308


### Proceso de limpieza a la dimensión de Airbnb Details

#### Creamos el dataframe "airbnb_detail", esta será otra dimension del Data Warehouse

In [None]:
# Build "airbnb_detail", the listing-detail dimension.
# .copy() makes it an independent frame: the following cells assign into its
# columns, which on a bare slice of df_airbnb raises SettingWithCopyWarning
# and is not guaranteed to take effect.
airbnb_detail = df_airbnb[
    ["id", "NAME", "instant_bookable",
     "cancellation_policy", "room type", "Construction year",
     "price", "service fee", "minimum nights", "number of reviews",
     "last review", "reviews per month", "review rate number", "calculated host listings count",
     "availability 365", "house_rules"]].copy()

# Standardise the column names: lower case, spaces replaced by underscores.
new_column_names = [label.lower().replace(" ", "_") for label in airbnb_detail.columns]
airbnb_detail.columns = new_column_names

airbnb_detail.head(2)

#### Es necesario eliminar los símbolos de dólar de las columnas "price" y "service_fee"

In [None]:
# Strip the currency formatting from "price" and "service_fee" and convert
# both to float.
# regex=False is required: with regex matching (the default before pandas 2.0)
# the pattern '$' is the end-of-string anchor, so the dollar sign would not be
# removed and the float conversion would fail.
for money_column in ("price", "service_fee"):
    airbnb_detail[money_column] = (
        airbnb_detail[money_column]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.strip()
        .astype(float)
    )

# Sanity check: count the rows where the service fee is greater than the price.
counter = airbnb_detail["price"] < airbnb_detail["service_fee"]
count_greater_service_fee = counter.sum()
count_greater_service_fee

#### Limpia los valores no finitos en la columna "Construction year"

In [None]:
# Turn +/-infinity into NaN, then replace every missing year with 0.
airbnb_detail["construction_year"] = (
    airbnb_detail["construction_year"]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

#### Cambiamos la columna "Construction year" de tipo float64 a int32, ya que los años son enteros.

In [None]:

# Cast the year to an integer dtype (nulls were already replaced by 0 above).
# NOTE(review): the heading says int32, but `int` maps to NumPy's platform
# default integer (commonly int64) — confirm which width is intended.
airbnb_detail["construction_year"] = airbnb_detail["construction_year"].astype(int)

In [None]:
# Listings with zero reviews.
filtered_df = airbnb_detail.loc[airbnb_detail["number_of_reviews"].eq(0)]
print(filtered_df)

# Within those listings, count the missing values in the two review columns.
null_last_review = filtered_df["last_review"].isna().sum()
null_reviews_per_month = filtered_df["reviews_per_month"].isna().sum()

# Report both counts.
print("Registros nulos en last review:", null_last_review)
print("Registros nulos en reviews per month:", null_reviews_per_month)

#### Reemplazamos por 0 los valores de las columnas "last review" y "reviews per month" cuando la columna "number of reviews" es 0

In [None]:
# For listings with zero reviews, set both review columns to 0.
# NOTE(review): this writes 0 to every such row (not only to nulls) and puts
# the integer 0 into "last_review", which otherwise holds date strings —
# confirm this is what the load step expects.
airbnb_detail.loc[airbnb_detail["number_of_reviews"].eq(0), "last_review"] = 0
airbnb_detail.loc[airbnb_detail["number_of_reviews"].eq(0), "reviews_per_month"] = 0

#### Reemplazamos los nulos de la columna "house_rules"

In [None]:
# Listings without house rules get an explicit placeholder text.
airbnb_detail["house_rules"] = airbnb_detail["house_rules"].fillna(
    value="No se Especificaron Las Reglas"
)

In [230]:
# Display the airbnb_detail frame after the cleaning steps above.
airbnb_detail

Unnamed: 0,id,name,instant_bookable,cancellation_policy,room_type,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules
0,1001254,Clean & quiet apt home by the park,False,strict,Private room,2020,966.0,193.0,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...
1,1002102,Skylit Midtown Castle,False,moderate,Entire home/apt,2007,142.0,28.0,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,True,flexible,Private room,2005,620.0,124.0,3.0,0.0,0,0.00,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and..."
3,1002755,,True,moderate,Entire home/apt,2005,368.0,74.0,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,No se Especificaron Las Reglas
4,1003689,Entire Apt: Spacious Studio/Loft by central park,False,moderate,Entire home/apt,2009,204.0,41.0,10.0,9.0,11/19/2018,0.10,3.0,1.0,289.0,"Please no smoking in the house, porch or on th..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102594,6092437,Spare room in Williamsburg,False,flexible,Private room,2003,844.0,169.0,1.0,0.0,0,0.00,3.0,1.0,227.0,No Smoking No Parties or Events of any kind Pl...
102595,6092990,Best Location near Columbia U,True,moderate,Private room,2016,837.0,167.0,1.0,1.0,7/6/2015,0.02,2.0,2.0,395.0,House rules: Guests agree to the following ter...
102596,6093542,"Comfy, bright room in Brooklyn",True,moderate,Private room,2009,988.0,198.0,3.0,0.0,0,0.00,5.0,1.0,342.0,No se Especificaron Las Reglas
102597,6094094,Big Studio-One Stop from Midtown,True,strict,Entire home/apt,2015,546.0,109.0,2.0,5.0,10/11/2015,0.10,3.0,1.0,386.0,No se Especificaron Las Reglas


### Proceso de limpieza a la dimensión de Hosts

In [229]:
# Standardise the column names: lower case, spaces replaced by underscores.
new_column_names = (
    host_table.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .tolist()
)
host_table.columns = new_column_names
host_table.head(2)

Unnamed: 0,host_id,host_name,host_identity_verified
0,80014485718,Madaline,unconfirmed
1,52335172823,Jenna,verified


### Proceso de limpieza a nuestra tabla de hechos

In [None]:
# Remove from df_airbnb the columns that were already copied into the
# neighbourhood, host and airbnb-detail tables.
df_airbnb.drop(
    columns=[
        # neighbourhood coordinates
        "lat", "long",
        # host attributes
        "host name", "host_identity_verified",
        # listing details
        "NAME", "instant_bookable",
        "cancellation_policy", "room type", "Construction year",
        "price", "service fee", "minimum nights", "number of reviews",
        "last review", "reviews per month", "review rate number",
        "calculated host listings count",
        "availability 365", "house_rules",
    ],
    inplace=True,
)

In [None]:
# Old column label -> new column label.
renaming_columns = {"id": "airbnb_id", "neighbourhood group": "neighbourhood_group"}

# Apply the mapping to the column axis in place, then preview two rows.
df_airbnb.rename(renaming_columns, axis="columns", inplace=True)
df_airbnb.head(2)

#### Se cambian las columnas neighbourhood_group y neighbourhood por un identificador numérico

In [None]:
def _build_neighbourhood_lookup(table):
    """Map each (neighbourhood_group, neighbourhood) pair of `table` to its id.

    If a pair occurs more than once, the first occurrence wins.
    """
    lookup = {}
    for group, name, neighbourhood_id in zip(
        table["neighbourhood_group"], table["neighbourhood"], table["neighbourhood_id"]
    ):
        lookup.setdefault((group, name), neighbourhood_id)
    return lookup


# Built once per run of this cell, so each row costs one dictionary lookup
# instead of a boolean scan of the whole neighbourhood_table.
# Re-run the cell if neighbourhood_table changes.
_NEIGHBOURHOOD_IDS = _build_neighbourhood_lookup(neighbourhood_table)


def get_neighbourhood_id(row):
    """Return the neighbourhood_id matching a row's group and neighbourhood.

    Parameters:
        row: a mapping-like row exposing "neighbourhood_group" and
            "neighbourhood".

    Returns:
        The matching id, or None when the pair is not in the lookup.
    """
    return _NEIGHBOURHOOD_IDS.get((row["neighbourhood_group"], row["neighbourhood"]))


# Apply the function row by row to obtain "neighbourhood_id".
df_airbnb["neighbourhood_id"] = df_airbnb.apply(get_neighbourhood_id, axis=1)

#### Una vez asignados los IDs de forma numérica podemos prescindir de las columnas iniciales

In [None]:
# The text columns are now redundant with "neighbourhood_id"; remove them
# and display the resulting frame.
df_airbnb.drop(
    columns=["neighbourhood_group", "neighbourhood"],
    inplace=True,
)
df_airbnb

## Proceso de Carga

In [None]:
# Create the database engine with the helper imported from the project's
# Database module.
engine1 = creating_engine()

# Create a session from that engine, using the helper from the same module.
session1 = creating_session(engine1)

