# **PREPROCESAMIENTO Y NORMALIZACIÓN EN VAL**

In [1]:
# Mount Google Drive under /content/drive; every path used below lives on that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [None]:
# Load the fd_val validation split (semicolon-separated CSV stored on Drive)
ruta_fd_val = "/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val.csv"
fd_val = pd.read_csv(ruta_fd_val, sep=';')

# Report how many records were read
print(f"Registros en fd_val: {len(fd_val)}")

Registros en fd_val: 374


In [None]:
# Display (rows, columns) of the validation frame as loaded
fd_val.shape

(374, 72)

## Normalización y redimensionado de los datos tabulares (numéricos y categóricos) de fd_val, basados en TRAIN

In [None]:
#

# Columns that, per the original notebook, add no useful information about the
# price or the property; grouped here by theme for readability.
not_useful_columns = [
    # review dates
    'first_review', 'last_review',
    # host listing counters
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'calculated_host_listings_count',
    # licence / calendar / short-term availability
    'license', 'calendar_last_scraped', 'has_availability',
    'availability_30', 'availability_60', 'availability_90',
    # derived min/max night limits
    'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'minimum_minimum_nights',
    # host profile metadata
    'host_listings_count', 'host_neighbourhood', 'host_picture_url', 'host_about',
    'host_location', 'host_thumbnail_url', 'host_name', 'host_url', 'host_id',
    # scrape bookkeeping and remaining columns
    'source', 'last_scraped', 'scrape_id', 'neighbourhood_group_cleansed', 'calendar_updated',
]

# Rebind fd_val to a frame without those columns (KeyError if any of them is absent)
fd_val = fd_val.drop(columns=not_useful_columns)

fd_val.shape

(374, 40)

## Nulos en review_scores_* y reviews_per_month

In [None]:
import pandas as pd
import joblib

# Review-related columns whose missing values get filled
review_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]

# Medians previously saved to disk (looked up by column name below)
medianas_reviews = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/medianas_reviews.pkl')

# Fill every review column in one call, each with its own stored median
relleno_reviews = {columna: medianas_reviews[columna] for columna in review_columns}
fd_val = fd_val.fillna(value=relleno_reviews)

# Show how many nulls remain after the imputation
print("Número de nulos restantes después de la imputación en val:")
print(fd_val[review_columns].isnull().sum())


Número de nulos restantes después de la imputación en val:
review_scores_rating           0
review_scores_accuracy         0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_scores_location         0
review_scores_value            0
reviews_per_month              0
dtype: int64


## Imputación de 'bathrooms' y 'bathrooms_text'

In [None]:
import re
import joblib

# Load the imputation information saved by the train notebook
bathrooms_info = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/bathrooms_info.pkl')
# Value stored under 'bathrooms_median'; used below as the last-resort fill for 'bathrooms'
bathrooms_median = bathrooms_info['bathrooms_median']
# Value that is rounded and used below to fill 'bathrooms_shared'
# NOTE(review): presumably the share of shared bathrooms observed in train — confirm its scale (0-1 vs 0-100)
porcentaje_bathrooms_shared = bathrooms_info['porcentaje_bathrooms_shared']

# Parse a bathroom count out of a free-text 'bathrooms_text' value (validation split)
def extract_bathrooms_val(text):
    """Return the number of bathrooms described by ``text``.

    Args:
        text: Raw 'bathrooms_text' value; may be null.

    Returns:
        0.5 when the text mentions "half" (case-insensitive), otherwise the
        first integer/decimal number found in the text as a float, or None
        when the value is null or contains no number.
    """
    if pd.isnull(text):
        return None

    lowered = str(text).lower()

    # Entries such as "Half-bath" carry no digit, so they are special-cased
    if "half" in lowered:
        return 0.5

    found = re.search(r'(\d+(\.\d+)?)', lowered)
    if found is None:
        return None
    return float(found.group(1))

# Flag listings whose text mentions a shared bathroom. A missing text becomes the
# string 'nan', which does not contain 'shared', so the flag is 0 in that case.
fd_val['bathrooms_shared'] = fd_val['bathrooms_text'].apply(
    lambda x: 1 if 'shared' in str(x).lower() else 0
)

# Fill missing 'bathrooms' with the number parsed from 'bathrooms_text'
fd_val['bathrooms'] = fd_val['bathrooms'].combine_first(
    fd_val['bathrooms_text'].apply(extract_bathrooms_val)
)

# Fill whatever is still missing with the median computed on train.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form raises a FutureWarning and will stop
# modifying fd_val in pandas 3.0.
fd_val['bathrooms'] = fd_val['bathrooms'].fillna(bathrooms_median)

# Safety net only: the flag built above is always 0 or 1, so nothing is expected
# to be filled here.
fd_val['bathrooms_shared'] = fd_val['bathrooms_shared'].fillna(round(porcentaje_bathrooms_shared))

# 'bathrooms_text' is only dropped once both derived columns are free of nulls
if fd_val['bathrooms'].isnull().sum() == 0 and fd_val['bathrooms_shared'].isnull().sum() == 0:
    fd_val = fd_val.drop(columns=['bathrooms_text'])
    print("'bathrooms_text' ha sido eliminada porque ya no hay nulos en 'bathrooms' y 'bathrooms_shared'.")
else:
    print("'bathrooms_text' no ha sido eliminada porque aún hay nulos en 'bathrooms' o 'bathrooms_shared'.")

# Imputation summary
print("Nulos en 'bathrooms' después de la imputación:", fd_val['bathrooms'].isnull().sum())
print("Nulos en 'bathrooms_shared':", fd_val['bathrooms_shared'].isnull().sum())


'bathrooms_text' ha sido eliminada porque ya no hay nulos en 'bathrooms' y 'bathrooms_shared'.
Nulos en 'bathrooms' después de la imputación: 0
Nulos en 'bathrooms_shared': 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_val['bathrooms'].fillna(bathrooms_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_val['bathrooms_shared'].fillna(round(porcentaje_bathrooms_shared), inplace=True)


# Imputar 'beds'

In [None]:
import pandas as pd
import joblib

# Keys that identify a group of comparable listings for the 'beds' lookup
group_columns = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_shared']

# Load the per-group medians and the global median previously saved to disk
median_beds_grouped = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/median_beds_grouped.pkl')
mediana_global_beds = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/mediana_global_beds.pkl')

# Attach the per-group median ('beds_median_grouped') to every validation row.
# validate='many_to_one' makes pandas raise MergeError if the lookup table has
# duplicated keys; without it a left merge would silently duplicate validation rows.
fd_val = fd_val.merge(median_beds_grouped, on=group_columns, how='left', validate='many_to_one')
fd_val['beds'] = fd_val['beds'].fillna(fd_val['beds_median_grouped'])

# Groups with no match in the lookup fall back to the global 'beds' median
fd_val['beds'] = fd_val['beds'].fillna(mediana_global_beds)

# Remove the temporary lookup column once the imputation is done
fd_val = fd_val.drop(columns=['beds_median_grouped'])

# Report how many nulls remain after the imputation
print("Nulos restantes en 'beds' después de la imputación en val:", fd_val['beds'].isnull().sum())

Nulos restantes en 'beds' después de la imputación en val: 0


# Imputar 'bedrooms'


In [None]:
import numpy as np
import pandas as pd
from difflib import get_close_matches

# Known values of 'room_type'
known_room_types = ["Entire home/apt", "Private room", "Hotel room", "Shared room"]


def _resolver_room_type(room_type, accommodates):
    """Map a raw 'room_type' to one of ``known_room_types`` or to "Unknown".

    A case-insensitive fuzzy match (cutoff 0.8) is tried first. When the value is
    null or nothing matches, the type is guessed from ``accommodates``; if that
    is null as well the result is "Unknown".
    """
    if pd.notnull(room_type):
        canonicos = {rt.lower(): rt for rt in known_room_types}
        match = get_close_matches(str(room_type).lower(), list(canonicos), n=1, cutoff=0.8)
        if match:
            return canonicos[match[0]]

    if pd.notnull(accommodates):
        # Capacity of up to 2 is assumed to be a private room, anything larger a whole home
        return "Private room" if accommodates <= 2 else "Entire home/apt"

    return "Unknown"


def _bedrooms_por_regla(room_type_final, accommodates):
    """Return the rule-based bedroom count for a resolved room type (NaN if undecidable)."""
    if room_type_final == "Entire home/apt":
        if pd.isnull(accommodates):
            return np.nan
        if accommodates <= 2:
            return 1
        if 3 <= accommodates <= 4:
            return 2
        if 5 <= accommodates <= 6:
            return 3
        return np.ceil(accommodates / 2)

    if room_type_final in ("Private room", "Hotel room", "Shared room"):
        return 1

    # "Unknown": fall back on capacity when available, otherwise a single bedroom
    return np.ceil(accommodates / 2) if pd.notnull(accommodates) else 1


# Rule-based assignment of 'bedrooms' from 'room_type' and 'accommodates'
def imputar_bedrooms(df, solo_nulos=False):
    """Set 'bedrooms' from rules on 'room_type' and 'accommodates'.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with 'room_type', 'accommodates' and 'bedrooms' columns.
        solo_nulos: When False (default, the behaviour this notebook has always
            had) the rule value replaces 'bedrooms' on every row, including rows
            that already hold a value. When True only null 'bedrooms' are filled.
            NOTE(review): the notebook states the same function is used on train,
            so switch this on in both places or in neither.

    Returns:
        The same DataFrame object that was passed in.
    """
    for index, row in df.iterrows():
        if solo_nulos and pd.notnull(row['bedrooms']):
            continue

        # A null 'room_type' is resolved from 'accommodates' instead of being
        # handed to get_close_matches, which cannot handle None.
        room_type_final = _resolver_room_type(row['room_type'], row['accommodates'])
        valor = _bedrooms_por_regla(room_type_final, row['accommodates'])

        # Never replace an existing value with NaN when the rule cannot decide
        if pd.isnull(valor):
            continue

        df.at[index, 'bedrooms'] = valor

    return df

# Apply the rule-based function to fd_val
fd_val = imputar_bedrooms(fd_val)

# Report how many nulls remain in 'bedrooms' for the validation split
print("Nulos restantes en 'bedrooms' después de la imputación en fd_val:", fd_val['bedrooms'].isnull().sum())

Nulos restantes en 'bedrooms' después de la imputación en fd_val: 0


In [None]:
# Columns whose name contains the substring 'host'
host_columns = list(filter(lambda nombre: 'host' in nombre, fd_val.columns))

# Null count per host column
null_counts = fd_val[host_columns].isna().sum()

print("Valores nulos en las columnas que contienen 'host':")
print(null_counts)


Valores nulos en las columnas que contienen 'host':
host_since                    0
host_response_time           34
host_response_rate           34
host_acceptance_rate         30
host_is_superhost             3
host_total_listings_count     0
host_verifications            0
host_has_profile_pic          0
host_identity_verified        0
dtype: int64


### Imputación de NULLS en columnas categóricas:
Imputar columnas host_

In [None]:
import joblib
import numpy as np
import pandas as pd

# Statistics saved by the train notebook for the host_* columns
host_info = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/host_info.pkl')
mean_host_response_rate = host_info['mean_host_response_rate']
mean_host_acceptance_rate = host_info['mean_host_acceptance_rate']
mode_host_is_superhost = host_info['mode_host_is_superhost']

# Missing 'host_is_superhost' takes the stored mode
fd_val['host_is_superhost'] = fd_val['host_is_superhost'].fillna(mode_host_is_superhost)

# Both rate columns get the same treatment, each with its own lookup of means
for columna_tasa, medias_por_superhost in (
    ('host_response_rate', mean_host_response_rate),
    ('host_acceptance_rate', mean_host_acceptance_rate),
):
    # Text percentages such as "95%" become fractions (0.95)
    if fd_val[columna_tasa].dtype == 'O':
        fd_val[columna_tasa] = fd_val[columna_tasa].str.rstrip('%').astype(float) / 100

    # Missing rates take the mean stored for the row's 'host_is_superhost' value;
    # values without an entry in the lookup stay NaN
    relleno = fd_val['host_is_superhost'].map(medias_por_superhost)
    fd_val[columna_tasa] = fd_val[columna_tasa].fillna(relleno)

# Fill 'host_response_time' from conditional rules on 'host_is_superhost' and 'host_response_rate'
def imputar_host_response_time(row):
    """Return the response-time label for one row, imputing it when missing.

    Args:
        row: Mapping/Series with 'host_response_time', 'host_is_superhost'
            and 'host_response_rate'.

    Returns:
        The existing 'host_response_time' when it is not null. Otherwise
        'within an hour' for superhosts ('t') or a rate >= 0.9, 'within a day'
        for a rate in [0.5, 0.9), and 'a few days or more' in every other case,
        including a missing rate.
    """
    if not pd.isnull(row['host_response_time']):
        return row['host_response_time']

    tasa = row['host_response_rate']
    # pd.notnull covers None, NaN and pd.NA; an `is not None` test lets NaN/pd.NA
    # through to the comparisons, and pd.NA raises there when used as a boolean.
    tasa_conocida = pd.notnull(tasa)

    if row['host_is_superhost'] == 't' or (tasa_conocida and tasa >= 0.9):
        return 'within an hour'
    if tasa_conocida and 0.5 <= tasa < 0.9:
        return 'within a day'
    return 'a few days or more'

# Apply the conditional imputation to 'host_response_time', one row at a time
fd_val['host_response_time'] = fd_val.apply(imputar_host_response_time, axis=1)

# Report the nulls left in each of the imputed host_* columns
print("Nulos restantes en host_response_time (val):", fd_val['host_response_time'].isnull().sum())
print("Nulos restantes en host_response_rate (val):", fd_val['host_response_rate'].isnull().sum())
print("Nulos restantes en host_acceptance_rate (val):", fd_val['host_acceptance_rate'].isnull().sum())
print("Nulos restantes en host_is_superhost (val):", fd_val['host_is_superhost'].isnull().sum())


Nulos restantes en host_response_time (val): 0
Nulos restantes en host_response_rate (val): 0
Nulos restantes en host_acceptance_rate (val): 0
Nulos restantes en host_is_superhost (val): 0


## NULLS en 'neighbourhood': primero, CODIFICACIÓN

In [None]:
# List the distinct categories present in 'neighbourhood_cleansed' and count them
unique_neighbourhood_cleansed = fd_val['neighbourhood_cleansed'].unique()
print(unique_neighbourhood_cleansed)
print(f'Número total de categorías únicas en neighbourhood_cleansed: {len(unique_neighbourhood_cleansed)}')

['Dorchester' 'North End' 'Roxbury' 'Brighton' 'Beacon Hill' 'South End'
 'Allston' 'Downtown' 'Jamaica Plain' 'Fenway' 'Back Bay' 'South Boston'
 'Roslindale' 'Charlestown' 'Hyde Park' 'East Boston' 'Mission Hill'
 'Bay Village' 'West Roxbury' 'South Boston Waterfront'
 'Longwood Medical Area' 'Mattapan' 'West End']
Número total de categorías únicas en neighbourhood_cleansed: 23


In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

# Load the previously saved LabelEncoder and neighbourhood centroids.
# Below, `centroides` is iterated row-wise, read through its 'latitude' and
# 'longitude' fields, and its index labels are used as neighbourhood names.
le_combined = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/label_encoder_combined.pkl")
centroides = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/centroides_neighbourhood.pkl")


# Great-circle distance between two latitude/longitude points (haversine formula)
def haversine(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometres, using an Earth radius of 6371 km.
    """
    radio_tierra_km = 6371
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    dif_lat_rad = np.radians(lat2 - lat1)
    dif_lon_rad = np.radians(lon2 - lon1)

    # Haversine term: sin²(Δφ/2) + cos φ1 · cos φ2 · sin²(Δλ/2)
    a = np.sin(dif_lat_rad / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dif_lon_rad / 2) ** 2

    angulo_central = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radio_tierra_km * angulo_central

# Clean-up and standardisation of 'neighbourhood' and 'neighbourhood_cleansed' (validation)
def _normalizar_texto_barrio(serie):
    """Apply the clean-up steps shared by both neighbourhood columns, in order."""
    serie = serie.str.strip()
    serie = serie.str.replace(r"\s+", " ", regex=True)            # collapse whitespace runs
    serie = serie.str.replace(r", ,", ",", regex=True)            # drop empty list items
    serie = serie.str.replace(r"Dorchster", "Dorchester", regex=False)  # known misspelling
    serie = serie.str.replace(r"\s+,", ",", regex=True)           # no space before a comma
    serie = serie.str.replace(r"\(.*\)", "", regex=True).str.strip()    # remove parenthesised text
    serie = serie.str.replace(r", Massachusetts, United States", "", regex=False)
    return serie.str.strip()


# 'neighbourhood' gets one extra substitution on top of the shared steps
fd_val['neighbourhood'] = _normalizar_texto_barrio(fd_val['neighbourhood']).str.replace(
    r"Jamaica Plain, Boston", "Jamaica Plain", regex=False
)

fd_val['neighbourhood_cleansed'] = _normalizar_texto_barrio(fd_val['neighbourhood_cleansed'])

# Replace 'neighbourhood' values unknown to the encoder with the label of the closest centroid
def imputar_barrio_mas_cercano(row, centroides):
    """Return the row's neighbourhood, or the nearest centroid's label if it is unknown.

    Args:
        row: Row with 'neighbourhood', 'latitude' and 'longitude'.
        centroides: Table with 'latitude' and 'longitude' per row; the index
            label of the closest row is what gets returned.

    Returns:
        The original value when it is one of ``le_combined.classes_``; otherwise
        the index label of the centroid with the smallest haversine distance.
    """
    if row['neighbourhood'] in le_combined.classes_:
        return row['neighbourhood']

    # Distance from this listing to every centroid
    distancias = centroides.apply(
        lambda centro: haversine(row['latitude'], row['longitude'], centro['latitude'], centro['longitude']),
        axis=1,
    )
    return distancias.idxmin()

# Unknown neighbourhoods take the label of their nearest centroid
fd_val['neighbourhood'] = fd_val.apply(lambda fila: imputar_barrio_mas_cercano(fila, centroides), axis=1)


def _codificar_barrio(nombre):
    """Return the encoder's integer code for ``nombre``, or the "<UNK>" code if it is not a known class."""
    etiqueta = nombre if nombre in le_combined.classes_ else "<UNK>"
    return le_combined.transform([etiqueta])[0]


# Encode both neighbourhood columns with the same LabelEncoder
for columna_barrio in ('neighbourhood', 'neighbourhood_cleansed'):
    fd_val[columna_barrio] = fd_val[columna_barrio].apply(_codificar_barrio)

valores_codificados = pd.concat([fd_val['neighbourhood'], fd_val['neighbourhood_cleansed']]).unique()
print("Valores únicos resultantes de 'neighbourhood' y 'neighbourhood_cleansed' en validación:",
      valores_codificados)


Valores únicos resultantes de 'neighbourhood' y 'neighbourhood_cleansed' en validación: [12  5  8  9 17 25 16 11 23  6 30  7 22 24  4 28  1 13 15  2 26 14 21  3
 27 19 20 29]


##Imputación 'neighbourhood'

In [None]:
from sklearn.impute import KNNImputer

import joblib

# Load the KNN imputer previously fitted on the training split
imputer = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/knn_imputer_trained.pkl")

# Columns handed to the imputer, in this order
columns_for_imputation = ['latitude', 'longitude', 'neighbourhood', 'neighbourhood_cleansed']

# transform() returns a plain array, not a DataFrame. It is kept under its own
# name so that `fd_val_imputed` only ever refers to the DataFrame reloaded from
# CSV further down.
# NOTE(review): the encoding cell above already maps every 'neighbourhood' value
# to an integer code, so this step may have nothing left to fill — confirm it is
# still needed.
valores_knn = imputer.transform(fd_val[columns_for_imputation])

# Write back only 'neighbourhood', located by name rather than by a literal position
fd_val['neighbourhood'] = valores_knn[:, columns_for_imputation.index('neighbourhood')]

# Persist the imputed validation set
fd_val.to_csv('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed.csv', sep=';', index=False)

In [None]:
# Object-typed (categorical) columns that still contain nulls
categorical_columns = fd_val.select_dtypes(include=['object']).columns
tiene_nulos = fd_val[categorical_columns].isnull().any()
categorical_null_columns = categorical_columns[tiene_nulos]

# Null count for each of those columns
categorical_null_counts = fd_val[categorical_null_columns].isnull().sum()
print(categorical_null_counts)


Series([], dtype: float64)


## Ya no tenemos NULLS; ahora convertiremos a numéricas las columnas categóricas restantes:

In [None]:
# Reload the imputed validation set from the CSV written above
fd_val_imputed = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed.csv", sep=';')

##Comenzaré con las categorias host_

In [None]:
# Derive 'years_being_host' (host seniority in calendar years) from 'host_since'
from datetime import datetime

# Parse all 'YYYY-MM-DD' dates in one vectorised call instead of one strptime per row
fecha_alta_host = pd.to_datetime(fd_val_imputed['host_since'].astype(str), format='%Y-%m-%d')

# Reference year, evaluated once rather than once per row.
# NOTE(review): this feature depends on the year in which the notebook is run;
# pin it to the value used when the train split was processed to keep both
# splits comparable.
anio_referencia = datetime.now().year
fd_val_imputed['years_being_host'] = (anio_referencia - fecha_alta_host.dt.year).astype('int64')

fd_val_imputed = fd_val_imputed.drop(columns=['host_since'])

print(fd_val_imputed['years_being_host'].dtype)

int64


##'host_response_time' valores unicos

In [None]:
# Ordinal encoding of 'host_response_time': labels listed from slowest to fastest,
# scored 1..4 so that faster responses get the higher number
niveles_respuesta = ['a few days or more', 'within a day', 'within a few hours', 'within an hour']
response_time_mapping = {nivel: puntuacion for puntuacion, nivel in enumerate(niveles_respuesta, start=1)}

# Replace the text labels with their scores (labels missing from the mapping become NaN)
codigos_respuesta = fd_val_imputed['host_response_time'].map(response_time_mapping)
fd_val_imputed['host_response_time'] = codigos_respuesta

# Check the distinct values after the transformation
print(fd_val_imputed['host_response_time'].unique())

[3 4 2 1]


##'host_is_superhost' ,'host_has_profile_pic', 'host_identity_verified' e 'instant_bookable'

In [None]:
# Turn the 't'/'f' text flags into 1/0; anything other than 't' becomes 0
boolean_columns = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']

es_verdadero = fd_val_imputed[boolean_columns].eq('t')
for col in boolean_columns:
    fd_val_imputed[col] = es_verdadero[col].astype('int64')

# Show the distinct values per column to confirm the conversion
for col in boolean_columns:
    print(f"Valores únicos en {col} (val):", fd_val_imputed[col].unique())

Valores únicos en host_is_superhost (val): [1 0]
Valores únicos en host_has_profile_pic (val): [1 0]
Valores únicos en host_identity_verified (val): [1 0]
Valores únicos en instant_bookable (val): [0 1]


##'host_verifications'

In [None]:
# Inspect the raw values of 'host_verifications' before converting them
print("Valores únicos en 'host_verifications':")
print(fd_val_imputed['host_verifications'].unique())


Valores únicos en 'host_verifications':
["['email', 'phone']" "['email', 'phone', 'work_email']" "['phone']"
 "['phone', 'work_email']"]


In [None]:
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# Load the scaler previously fitted on train for the verification count
scaler = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/scaler_verifications.pkl')

# Parse the stringified lists (e.g. "['email', 'phone']") into real lists and
# normalise every entry (trimmed, lower-case)
verificaciones = fd_val_imputed['host_verifications'].apply(literal_eval)
verificaciones = verificaciones.apply(lambda x: [item.strip().lower() for item in x])
fd_val_imputed['host_verifications'] = verificaciones

# Number of verification methods per host (an empty list counts as 0)
conteo_verificaciones = verificaciones.apply(len)

# Clip to the min/max the scaler recorded when it was fitted
conteo_verificaciones = np.clip(conteo_verificaciones, scaler.data_min_[0], scaler.data_max_[0])
fd_val_imputed['num_host_verifications'] = conteo_verificaciones

# Scale with the train scaler
fd_val_imputed['num_host_verifications'] = scaler.transform(
    fd_val_imputed[['num_host_verifications']]
)

# The original list column is no longer needed
fd_val_imputed = fd_val_imputed.drop(columns=['host_verifications'])

# Show the first rows to check the transformation
print(fd_val_imputed[['num_host_verifications']].head())


   num_host_verifications
0                0.663333
1                0.990000
2                0.663333
3                0.336667
4                0.663333


##'property_type'

In [None]:
import joblib

# Load the per-'property_type' price means computed on the training split
property_price_means = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/property_price_means.pkl")

# Encode each property type with its train price mean
fd_val_imputed['property_type_encoded'] = fd_val_imputed['property_type'].map(property_price_means)

# Property types not present in the lookup fall back to the train global mean price.
# The value stored in room_price_means.pkl is used instead of averaging the
# per-category means: that unweighted mean of means gives rare property types the
# same weight as common ones and is not the global mean, and the 'room_type' cell
# already relies on this stored value.
global_mean_price_train = joblib.load(
    "/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/room_price_means.pkl"
)['global_mean_price_train']
fd_val_imputed['property_type_encoded'] = fd_val_imputed['property_type_encoded'].fillna(global_mean_price_train)

# Drop the original 'property_type' column
fd_val_imputed = fd_val_imputed.drop(columns=['property_type'])

# Show the first rows to check the transformation
print(fd_val_imputed[['property_type_encoded']].head())


   property_type_encoded
0              82.168798
1             195.894336
2              86.526829
3             195.894336
4             195.894336


##'room_type'

In [None]:
import joblib

# Load the per-'room_type' price means and the global mean saved from training
room_data = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/room_price_means.pkl")
room_price_means, global_mean_price_train = (
    room_data['room_price_means'],
    room_data['global_mean_price_train'],
)

# Encode with the train means; room types missing from the lookup get the global mean
room_type_codificado = fd_val_imputed['room_type'].map(room_price_means)
fd_val_imputed['room_type_encoded'] = room_type_codificado.fillna(global_mean_price_train)

# The original 'room_type' column is now redundant
fd_val_imputed = fd_val_imputed.drop(columns=['room_type'])

# Show the first rows to check the transformation
print(fd_val_imputed[['room_type_encoded']].head())

   room_type_encoded
0         112.701754
1         223.597325
2         112.701754
3         223.597325
4         223.597325


In [None]:
# Remove the URL, amenities and id columns from the validation set
columnas_sobrantes = ['listing_url', 'picture_url', 'amenities', 'id']
fd_val_imputed = fd_val_imputed.drop(columns=columnas_sobrantes)

# List what is left
print("Columnas restantes en fd_val:", fd_val_imputed.columns)

Columnas restantes en fd_val: Index(['host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable', 'reviews_per_month',
       'bathrooms_shared', 'years_being_host', 'num_host_verifications',
       'property_type_encoded', 'room_type_encoded'],
      dtype='object')


In [None]:
# Row count check after the column-level transformations
print(len(fd_val_imputed))

374


In [None]:
# Check whether any non-numeric column is left in fd_val_imputed
non_numeric_columns = fd_val_imputed.select_dtypes(exclude=['number']).columns

# Print the non-numeric columns (an empty Index means everything is numeric)
print("Columnas no numéricas en fd_val_imputed:", non_numeric_columns)


Columnas no numéricas en fd_val_imputed: Index([], dtype='object')


In [None]:
# Save the imputed validation set, now with every column numeric
fd_val_imputed.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed_num.csv", sep=';', index=False)

# Confirm the save
print("El archivo CSV con los datos imputados y numéricos ha sido guardado como 'fd_val_imputed_num.csv'")

El archivo CSV con los datos imputados y numéricos ha sido guardado como 'fd_val_imputed_num.csv'


In [None]:
# Reload the imputed, all-numeric validation set from its CSV file
fd_val_imputed_num = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed_num.csv", sep=';')

# Confirm the load
print("El archivo CSV 'fd_val_imputed_num.csv' ha sido cargado exitosamente.")

El archivo CSV 'fd_val_imputed_num.csv' ha sido cargado exitosamente.


## **OUTLIERS**
"price"

In [None]:
# Row count before removing outliers
initial_val_count = fd_val_imputed_num.shape[0]

# Listings priced above this cut-off are treated as outliers
umbral_precio = 3000
es_outlier = fd_val_imputed_num['price'] > umbral_precio
outliers_val = fd_val_imputed_num.index[es_outlier.to_numpy()]

# Drop those rows and count again
fd_val_imputed_num = fd_val_imputed_num.drop(index=outliers_val)
final_val_count = fd_val_imputed_num.shape[0]

# Report how many rows were removed and how many are left
print(f"Registros eliminados en val: {initial_val_count - final_val_count}")
print(f"Registros restantes en val después del filtro: {final_val_count}")


Registros eliminados en val: 1
Registros restantes en val después del filtro: 373


## ⏰ **OJO SE ELIMINA 1 REGISTRO**

Capping a 'number_of_reviews', 'reviews_per_month', 'host_total_listings_count', 'minimum_nights', 'maximum_nights'

In [None]:
import numpy as np
import pickle

# Columns whose values are capped at a stored upper threshold
columnas_capping = ['number_of_reviews', 'reviews_per_month', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']

# Load the thresholds previously computed on train.
# NOTE: pickle.load executes code embedded in the file; only load files produced by this project.
with open('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/capping_thresholds.pkl', 'rb') as f:
    capping_thresholds = pickle.load(f)

# Values above the column's threshold are replaced by the threshold itself
for column in columnas_capping:
    threshold = capping_thresholds[column]
    valores = fd_val_imputed_num[column]
    fd_val_imputed_num[column] = np.where(valores > threshold, threshold, valores)

# Spot-check the largest values after capping
print("Valores después del capping en 'maximum_nights' (val):")
print(fd_val_imputed_num['maximum_nights'].sort_values(ascending=False).head())


Valores después del capping en 'maximum_nights' (val):
97     1125.0
285    1125.0
254    1125.0
259    1125.0
128    1125.0
Name: maximum_nights, dtype: float64


In [None]:
# Save the preprocessed validation set, including the outlier handling, to CSV
fd_val_imputed_num.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed_num_out.csv", sep=';', index=False)

In [3]:
# Load the preprocessed validation set (outliers already handled) from its CSV file
import pandas as pd
fd_val_imputed_num_out = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_imputed_num_out.csv", sep=';')

In [4]:
# Display (rows, columns) of the reloaded frame
fd_val_imputed_num_out.shape

(373, 36)

Eliminar 'neighbourhood' porque 'neighbourhood_cleansed' tiene más correlación con 'price_log'.

In [5]:
# Remove the 'neighbourhood' column from the validation frame (in place).
fd_val_imputed_num_out.drop(['neighbourhood'], axis=1, inplace=True)

In [6]:
# Print the first six rows in full width (to_string avoids column truncation).
print(fd_val_imputed_num_out.iloc[:6].to_string())

   host_response_time  host_response_rate  host_acceptance_rate  host_is_superhost  host_total_listings_count  host_has_profile_pic  host_identity_verified  neighbourhood_cleansed  latitude  longitude  accommodates  bathrooms  bedrooms  beds  price  minimum_nights  maximum_nights  availability_365  number_of_reviews  number_of_reviews_ltm  number_of_reviews_l30d  review_scores_rating  review_scores_accuracy  review_scores_cleanliness  review_scores_checkin  review_scores_communication  review_scores_location  review_scores_value  instant_bookable  reviews_per_month  bathrooms_shared  years_being_host  num_host_verifications  property_type_encoded  room_type_encoded
0                   3                1.00                  0.99                  1                        3.0                     1                       1                      12  42.30816  -71.07790             2        1.0       1.0   1.0   65.0             1.0            14.0                 0              179.0         

# **NORMALIZACIÓN**

#### Normalización de 'bedrooms', 'beds', 'accommodates' y 'bathrooms'

In [7]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Columns handled by the scaler, in the order passed to transform().
room_columns = ['bedrooms', 'beds', 'accommodates', 'bathrooms']

# Load the scaler saved by the train notebook; it is only applied here, never refitted.
scaler_bedrooms_beds_adjusted = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/scaler_bedrooms_beds_adjusted.pkl')

# Transform the validation columns and write the scaled values back.
scaled_room_values = scaler_bedrooms_beds_adjusted.transform(fd_val_imputed_num_out[room_columns])
fd_val_imputed_num_out[room_columns] = scaled_room_values

# Per-column min/max after scaling, printed as a sanity check.
adjusted_bedrooms_beds_stats_val = fd_val_imputed_num_out[room_columns].agg(['min', 'max'])
print(adjusted_bedrooms_beds_stats_val)


     bedrooms  beds  accommodates  bathrooms
min      0.01  0.01          0.01   0.010000
max      0.99  0.71          0.99   0.663333


In [8]:
# Print the first five rows in full width to inspect the scaled room columns.
print(fd_val_imputed_num_out.iloc[:5].to_string())

   host_response_time  host_response_rate  host_acceptance_rate  host_is_superhost  host_total_listings_count  host_has_profile_pic  host_identity_verified  neighbourhood_cleansed  latitude  longitude  accommodates  bathrooms  bedrooms      beds  price  minimum_nights  maximum_nights  availability_365  number_of_reviews  number_of_reviews_ltm  number_of_reviews_l30d  review_scores_rating  review_scores_accuracy  review_scores_cleanliness  review_scores_checkin  review_scores_communication  review_scores_location  review_scores_value  instant_bookable  reviews_per_month  bathrooms_shared  years_being_host  num_host_verifications  property_type_encoded  room_type_encoded
0                   3                1.00                  0.99                  1                        3.0                     1                       1                      12  42.30816  -71.07790      0.075333   0.173333      0.01  0.010000   65.0             1.0            14.0                 0              179.0 

# **Columnas Numéricas para Normalización**
## **Transformación Logarítmica (log1p)**


In [10]:
import numpy as np
import joblib

# Load the transform information saved by the train notebook.
# NOTE(review): `price_transform` is not referenced again in this cell — confirm whether it is still needed.
price_transform = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/price_transform.pkl")

# Replace the target 'price' in the validation frame with log(1 + price).
target_column = 'price'
raw_price = fd_val_imputed_num_out[target_column]
fd_val_imputed_num_out[target_column] = np.log1p(raw_price)

print("✅ Transformación logarítmica aplicada en `price` de Val")



✅ Transformación logarítmica aplicada en `price` de Val


In [11]:
# Load the collection of column names that the train notebook log-transformed.
columns_to_log_transform = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/log_transform_columns.pkl")

# Apply log(1 + x) to each of those columns in the validation frame.
for col in columns_to_log_transform:
    untransformed = fd_val_imputed_num_out[col]
    fd_val_imputed_num_out[col] = np.log1p(untransformed)

print("✅ Transformación logarítmica aplicada en `val` usando los mismos parámetros de `train`.")


✅ Transformación logarítmica aplicada en `val` usando los mismos parámetros de `train`.


# **Embeddings**


In [12]:
import warnings

# Categorical variables that will be used for embeddings.
categorical_columns = ['host_response_time', 'neighbourhood_cleansed',
                       'property_type_encoded', 'room_type_encoded']

# Load the per-column mappings saved by the train notebook (indexed by column name below).
category_mappings = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/category_mappings.pkl")

# Apply the train mappings to the validation frame.
for col in categorical_columns:
    original_values = fd_val_imputed_num_out[col]
    mapped_values = original_values.map(category_mappings[col])

    # Series.map returns NaN for values missing from the mapping, so a category
    # that never appeared in train would silently become NaN. Surface it here
    # instead of letting it propagate unnoticed.
    unseen_mask = mapped_values.isna() & original_values.notna()
    if unseen_mask.any():
        unseen_values = original_values[unseen_mask].unique().tolist()
        warnings.warn(
            f"Columna '{col}': {int(unseen_mask.sum())} registros de Val tienen categorías "
            f"no vistas en Train y quedaron como NaN: {unseen_values}"
        )

    fd_val_imputed_num_out[col] = mapped_values

print("Transformación de variables categóricas a índices en Val completada.")


Transformación de variables categóricas a índices en Val completada.


## **StandardScaler para variables con alta varianza**

## **StandardScaler para Geo**

In [13]:
# Load the geo scaler saved by the train notebook; it is only applied here, never refitted.
scaler_geo = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/standard_scaler_geo.pkl")

# Transform the coordinate columns of the validation frame and write them back.
geo_columns = ['latitude', 'longitude']
scaled_geo_values = scaler_geo.transform(fd_val_imputed_num_out[geo_columns])
fd_val_imputed_num_out[geo_columns] = scaled_geo_values



## **MinMaxScaler**

In [14]:

# The seven review-score columns, built from their shared 'review_scores_' prefix.
columns_to_minmax = [
    f'review_scores_{aspect}'
    for aspect in ('rating', 'accuracy', 'cleanliness', 'checkin',
                   'communication', 'location', 'value')
]

# Load the scaler saved by the train notebook.
scaler_minmax = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/minmax_scaler.pkl")

# Apply it (transform only) to the validation frame.
scaled_review_scores = scaler_minmax.transform(fd_val_imputed_num_out[columns_to_minmax])
fd_val_imputed_num_out[columns_to_minmax] = scaled_review_scores

print("MinMaxScaler aplicado en Val correctamente.")

MinMaxScaler aplicado en Val correctamente.


In [15]:
# Columns scaled with the availability / host-tenure scaler.
columns_to_scale = ['availability_365', 'years_being_host']

# Load the scaler saved by the train notebook.
scaler = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/minmax_scaler_availability_host.pkl")

# Apply it (transform only); this cell touches the validation frame only.
scaled_availability_host = scaler.transform(fd_val_imputed_num_out[columns_to_scale])
fd_val_imputed_num_out[columns_to_scale] = scaled_availability_host

print("✔️ MinMaxScaler aplicado correctamente en val y test.")

✔️ MinMaxScaler aplicado correctamente en val y test.


### *'neighbourhood_cleansed'* ya pasó por LabelEncoder y está codificada.

Por ahora no necesita normalización; puede trabajarse más adelante según el modelo que se use (redes neuronales, árboles, etc.).

In [17]:
# Persist the normalized validation frame for the correlation stage (';'-separated, no index).
val_correlation_csv_path = "/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_acorrelacion.csv"
fd_val_imputed_num_out.to_csv(val_correlation_csv_path, sep=';', index=False)

In [18]:
# Reload the normalized validation frame (fd_val_acorrelacion.csv) for the correlation stage.
import pandas as pd

val_correlation_csv_path = "/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_acorrelacion.csv"
fd_val_acorrelacion = pd.read_csv(val_correlation_csv_path, sep=';')

In [19]:
# Display (rows, columns) of the reloaded validation frame.
fd_val_acorrelacion.shape

(373, 35)

In [20]:
import pandas as pd

# Per-column minimum and maximum of the validation frame.
min_max_val = fd_val_acorrelacion.agg(['min', 'max'])

# Temporarily show every column so the summary is not truncated.
pd.set_option('display.max_columns', None)

print("\nMínimo y máximo en cada columna de fd_val_finpreprocesado:")
print(min_max_val)

# Put the display option back to its default to keep later outputs compact.
pd.reset_option('display.max_columns')


Mínimo y máximo en cada columna de fd_val_finpreprocesado:
     host_response_time  host_response_rate  host_acceptance_rate  \
min                   0                 0.0                   0.0   
max                   3                 1.0                   1.0   

     host_is_superhost  host_total_listings_count  host_has_profile_pic  \
min                  0                   0.693147                     0   
max                  1                   7.270313                     1   

     host_identity_verified  neighbourhood_cleansed  latitude  longitude  \
min                       0                       0 -3.576252  -2.803618   
max                       1                      24  1.958795   2.485527   

     accommodates  bathrooms  bedrooms  beds     price  minimum_nights  \
min          0.01   0.010000      0.01  0.01  3.583519        0.693147   
max          0.99   0.663333      0.99  0.71  7.601402        4.521789   

     maximum_nights  availability_365  number_of_revie

# **CORRELACIONAR**

**review_scores_rating vs. review_scores_accuracy vs. review_scores_value**

In [21]:
# Review columns that are extremely correlated with each other.
review_columns_extreme = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_value']

# Load the mean computed on train, used as the fill value for missing scores.
review_overall_mean_train = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/review_overall_mean.pkl")

# Combine the three columns into their row-wise mean and fill missing results
# with the train mean. The filled Series is assigned back explicitly:
# calling fillna(..., inplace=True) on `df[col]` operates on an intermediate
# object, raises a FutureWarning and will not update the frame in pandas 3.0.
fd_val_acorrelacion['review_overall_score'] = (
    fd_val_acorrelacion[review_columns_extreme]
    .mean(axis=1)
    .fillna(review_overall_mean_train)
)
fd_val_acorrelacion.drop(columns=review_columns_extreme, inplace=True)

# Summary statistics of the new column as a sanity check.
print(fd_val_acorrelacion[['review_overall_score']].describe())


       review_overall_score
count            373.000000
mean               0.928653
std                0.083326
min                0.283333
25%                0.924000
50%                0.952500
75%                0.969000
max                1.000000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_val_acorrelacion['review_overall_score'].fillna(review_overall_mean_train, inplace=True)


In [22]:
# Remove the review-count columns that are most correlated with each other (in place).
review_count_columns = ['number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d']
fd_val_acorrelacion.drop(review_count_columns, axis=1, inplace=True)

## **Creación de nuevas variables**

In [23]:
# Build 'price_per_person' as price divided by accommodates (same formula as in train, per the notebook).
# NOTE(review): by this point 'price' has been log1p-transformed and 'accommodates' rescaled by the
# train scaler, so the ratio is not a per-guest price in currency units. It is also derived from the
# target — confirm it is handled consistently with train.
fd_val_acorrelacion['price_per_person'] = fd_val_acorrelacion['price'].div(fd_val_acorrelacion['accommodates'])

# Show the inputs next to the new column for the first rows.
preview_columns = ['price', 'accommodates', 'price_per_person']
print(fd_val_acorrelacion[preview_columns].head())


      price  accommodates  price_per_person
0  4.189655      0.075333         55.614886
1  5.758902      0.206000         27.955834
2  4.110874      0.010000        411.087386
3  6.214608      0.336667         18.459232
4  4.709530      0.206000         22.861797


In [24]:
# Attach to every row the mean 'price' of its 'neighbourhood_cleansed' group.
# NOTE(review): the group means are computed from the validation rows' own 'price' (the target),
# not loaded from train like the scalers above — confirm this is intended.
price_per_neighborhood_val = (
    fd_val_acorrelacion['price']
    .groupby(fd_val_acorrelacion['neighbourhood_cleansed'])
    .transform('mean')
)
fd_val_acorrelacion['price_per_neighborhood'] = price_per_neighborhood_val

# Show the inputs next to the new column for the first rows.
preview_columns = ['price', 'neighbourhood_cleansed', 'price_per_neighborhood']
print(fd_val_acorrelacion[preview_columns].head())


      price  neighbourhood_cleansed  price_per_neighborhood
0  4.189655                       7                4.527022
1  5.758902                      17                5.301024
2  4.110874                      19                4.552039
3  6.214608                       4                4.639863
4  4.709530                       3                4.839999


In [25]:
# Attach to every row the mean 'price' of its 'property_type_encoded' group.
# NOTE(review): the group means are computed from the validation rows' own 'price' (the target),
# not loaded from train like the scalers above — confirm this is intended.
avg_price_by_property_type_val = (
    fd_val_acorrelacion['price']
    .groupby(fd_val_acorrelacion['property_type_encoded'])
    .transform('mean')
)
fd_val_acorrelacion['avg_price_by_property_type'] = avg_price_by_property_type_val

# Show the inputs next to the new column for the first rows.
preview_columns = ['price', 'property_type_encoded', 'avg_price_by_property_type']
print(fd_val_acorrelacion[preview_columns].head())


      price  property_type_encoded  avg_price_by_property_type
0  4.189655                      9                    4.135118
1  5.758902                     22                    5.069406
2  4.110874                     11                    4.556527
3  6.214608                     22                    5.069406
4  4.709530                     22                    5.069406


# **Guardar el fd_val_acorrelacion**

In [8]:
# Save fd_val_acorrelacion (preprocessing and normalization finished) as fd_val_finpreprocesado.csv.
# The frame built in the cells above is `fd_val_acorrelacion`; `fd_val_finpreprocesado` only exists
# after the next cell reads this file back, so referencing it here fails with NameError when the
# notebook is run top to bottom on a fresh kernel.
fd_val_acorrelacion.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_finpreprocesado.csv", sep=';', index=False)


In [3]:
# Reload fd_val_finpreprocesado.csv into its own frame.
import pandas as pd

val_final_csv_path = "/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_val_finpreprocesado.csv"
fd_val_finpreprocesado = pd.read_csv(val_final_csv_path, sep=';')

In [4]:
import numpy as np

# Add a natural-log version of each engineered price feature as '<name>_log',
# so the new variables share the same scale.
for engineered_column in ('price_per_person', 'price_per_neighborhood', 'avg_price_by_property_type'):
    fd_val_finpreprocesado[f'{engineered_column}_log'] = np.log(fd_val_finpreprocesado[engineered_column])

In [5]:
# Remove the un-logged engineered columns now that their '_log' versions exist (in place).
raw_engineered_columns = ['price_per_person', 'price_per_neighborhood', 'avg_price_by_property_type']
fd_val_finpreprocesado.drop(raw_engineered_columns, axis=1, inplace=True)

In [6]:
# Display (rows, columns) of the final validation frame.
fd_val_finpreprocesado.shape

(373, 33)

In [7]:
# Count missing values per column of the final validation frame.
fd_val_finpreprocesado.isna().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0
