#**PREPROCESAMIENTO Y NORMALIZACIÓN EN TEST**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

In [69]:

## Cargar datos fd_val
fd_test = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test.csv", sep=';')

print(f"Registros en fd_test: {len(fd_test)}")

Registros en fd_test: 622


In [70]:
fd_test.shape

(622, 72)

## **Normalización y redimensionado de datos numéricos y categóricos DATOS TABULARES fd_test basado en TRAIN**

In [71]:

# Definir las columnas que no aportan información relevante
not_useful_columns = [
    'first_review', 'last_review', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
    'calculated_host_listings_count', 'license', 'calendar_last_scraped',
    'has_availability', 'availability_30', 'availability_60', 'availability_90',
    'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'minimum_minimum_nights', 'host_listings_count',
    'host_neighbourhood', 'host_picture_url', 'host_about', 'host_location',
    'host_thumbnail_url', 'host_name', 'host_url', 'host_id', 'source',
    'last_scraped', 'scrape_id', 'neighbourhood_group_cleansed', 'calendar_updated'
]

# Hacemos un drop de todas las columnas que no aportan información relevante a precio o inmueble en fd_test
fd_test.drop(not_useful_columns, axis=1, inplace=True)

fd_test.shape

(622, 40)

##nulls en review_scores_* y reviews_per_month

In [72]:
import pandas as pd
import joblib

# Definir las columnas a imputar
review_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                  'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                  'review_scores_value', 'reviews_per_month']

# Cargar las medianas previamente guardadas del archivo
medianas_reviews = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/medianas_reviews.pkl')

# Imputar valores nulos en el conjunto de prueba con las medianas
for column in review_columns:
    fd_test[column] = fd_test[column].fillna(medianas_reviews[column])

# Imprimir el número de nulos restantes después de la imputación
print("Número de nulos restantes después de la imputación en test:")
print(fd_test[review_columns].isnull().sum())


Número de nulos restantes después de la imputación en test:
review_scores_rating           0
review_scores_accuracy         0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_scores_location         0
review_scores_value            0
reviews_per_month              0
dtype: int64


##Imputacion 'bathrooms' y 'bathrooms_text'

In [73]:
import re

# Cargar la información de imputación guardada en train
bathrooms_info = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/bathrooms_info.pkl')
bathrooms_median = bathrooms_info['bathrooms_median']
porcentaje_bathrooms_shared = bathrooms_info['porcentaje_bathrooms_shared']

# Función para imputar 'bathrooms' en test
def extract_bathrooms_test(text):
    if pd.isnull(text):
        return None
    text = str(text).lower()
    if "half" in text:  # Manejar específicamente "Half-bath"
        return 0.5
    match = re.search(r'(\d+(\.\d+)?)', text)  # Buscar números con decimales
    return float(match.group(1)) if match else None

# Crear columna 'bathrooms_shared' para test
fd_test['bathrooms_shared'] = fd_test['bathrooms_text'].apply(
    lambda x: 1 if 'shared' in str(x).lower() else 0
)

# Imputar bathrooms usando bathrooms_text
fd_test['bathrooms'] = fd_test['bathrooms'].combine_first(
    fd_test['bathrooms_text'].apply(extract_bathrooms_test)
)

# Imputar valores restantes con la mediana calculada en train
fd_test['bathrooms'].fillna(bathrooms_median, inplace=True)

# Imputar bathrooms_shared restantes con el porcentaje calculado en train
fd_test['bathrooms_shared'].fillna(round(porcentaje_bathrooms_shared), inplace=True)

# Verificar si 'bathrooms_text' puede eliminarse
if fd_test['bathrooms'].isnull().sum() == 0 and fd_test['bathrooms_shared'].isnull().sum() == 0:
    fd_test.drop(columns=['bathrooms_text'], inplace=True)
    print("'bathrooms_text' ha sido eliminada porque ya no hay nulos en 'bathrooms' y 'bathrooms_shared'.")
else:
    print("'bathrooms_text' no ha sido eliminada porque aún hay nulos en 'bathrooms' o 'bathrooms_shared'.")

# Imprimir resumen de imputación
print("Nulos en 'bathrooms' después de la imputación:", fd_test['bathrooms'].isnull().sum())
print("Nulos en 'bathrooms_shared':", fd_test['bathrooms_shared'].isnull().sum())


'bathrooms_text' ha sido eliminada porque ya no hay nulos en 'bathrooms' y 'bathrooms_shared'.
Nulos en 'bathrooms' después de la imputación: 0
Nulos en 'bathrooms_shared': 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_test['bathrooms'].fillna(bathrooms_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_test['bathrooms_shared'].fillna(round(porcentaje_bathrooms_shared), inplace=True)


In [74]:
fd_test.shape

(622, 40)

# Imputar 'beds'

In [75]:
import pandas as pd
import joblib

# Definir las columnas a imputar
group_columns = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_shared']

# Cargar las medianas previamente guardadas del archivo
median_beds_grouped = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/median_beds_grouped.pkl')
mediana_global_beds = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/mediana_global_beds.pkl')

# Imputar valores nulos en el conjunto de prueba con las medianas calculadas por grupo
fd_test = fd_test.merge(median_beds_grouped, on=group_columns, how='left')
fd_test['beds'] = fd_test['beds'].fillna(fd_test['beds_median_grouped'])

# Imputar cualquier valor restante con la mediana global de 'beds'
fd_test['beds'] = fd_test['beds'].fillna(mediana_global_beds)

# Eliminar la columna temporal 'beds_median_grouped' después de la imputación
fd_test.drop(columns=['beds_median_grouped'], inplace=True)

# Imprimir el número de nulos restantes después de la imputación
print("Nulos restantes en 'beds' después de la imputación en test:", fd_test['beds'].isnull().sum())

Nulos restantes en 'beds' después de la imputación en test: 0


# Imputar 'bedrooms'


In [76]:
import numpy as np
import pandas as pd
from difflib import get_close_matches

# Definir los valores conocidos de 'room_type'
known_room_types = ["Entire home/apt", "Private room", "Hotel room", "Shared room"]

# Función de imputación de 'bedrooms' basada en reglas mejoradas (misma que en train)
def imputar_bedrooms(df):
    for index, row in df.iterrows():
        room_type = row['room_type']

        # Normalizar valor: Convertir a minúsculas para evitar problemas de mayúsculas/minúsculas
        room_type_normalized = room_type.lower() if pd.notnull(room_type) else None

        # Intentar encontrar coincidencias aproximadas con los valores conocidos
        match = get_close_matches(room_type_normalized, [rt.lower() for rt in known_room_types], n=1, cutoff=0.8)

        # Si encontramos una coincidencia, utilizamos el valor conocido
        if match:
            room_type_final = match[0].capitalize()
        else:
            # Si no hay coincidencias, intentamos hacer una clasificación basada en accommodates
            accommodates = row['accommodates']
            if pd.notnull(accommodates):
                if accommodates <= 2:
                    room_type_final = "Private room"  # Suposición razonable si solo tiene capacidad para 2
                elif accommodates >= 3 and accommodates <= 6:
                    room_type_final = "Entire home/apt"  # Suposición razonable para acomodar 3 a 6 personas
                else:
                    room_type_final = "Entire home/apt"  # Suposición razonable para más de 6 personas
            else:
                # Si no podemos hacer una clasificación basada en accommodates, asignamos un valor por defecto
                room_type_final = "Unknown"

        # Aplicar reglas para imputar 'bedrooms'
        if room_type_final == "Entire home/apt":
            # Reglas para 'Entire home/apt'
            if row['accommodates'] <= 2:
                df.at[index, 'bedrooms'] = 1
            elif 3 <= row['accommodates'] <= 4:
                df.at[index, 'bedrooms'] = 2
            elif 5 <= row['accommodates'] <= 6:
                df.at[index, 'bedrooms'] = 3
            else:
                df.at[index, 'bedrooms'] = np.ceil(row['accommodates'] / 2)

        elif room_type_final == "Private room":
            # Regla para 'Private room'
            df.at[index, 'bedrooms'] = 1

        elif room_type_final in ["Hotel room", "Shared room"]:
            # Regla para 'Hotel room' y 'Shared room'
            df.at[index, 'bedrooms'] = 1

        elif room_type_final == "Unknown":
            # Valor predeterminado para un 'room_type' desconocido, basándose en el número de accommodates
            df.at[index, 'bedrooms'] = np.ceil(row['accommodates'] / 2) if pd.notnull(row['accommodates']) else 1

    return df

# Aplicar la función a fd_test (misma lógica que en val y train)
fd_test = imputar_bedrooms(fd_test)

# Verificar nulos restantes en 'bedrooms' después de la imputación en test
print("Nulos restantes en 'bedrooms' después de la imputación en fd_test:", fd_test['bedrooms'].isnull().sum())

Nulos restantes en 'bedrooms' después de la imputación en fd_test: 0


In [77]:
# Identificar columnas que contienen la palabra 'host' en su nombre
host_columns = [col for col in fd_test.columns if 'host' in col]

# Mostrar valores nulos en estas columnas
null_counts = fd_test[host_columns].isnull().sum()

print("Valores nulos en las columnas que contienen 'host':")
print(null_counts)


Valores nulos en las columnas que contienen 'host':
host_since                    0
host_response_time           64
host_response_rate           64
host_acceptance_rate         56
host_is_superhost             8
host_total_listings_count     0
host_verifications            0
host_has_profile_pic          0
host_identity_verified        0
dtype: int64


###Imputacion de NULLS columnas catégoricas:
Imputar columnas host_

In [78]:
import joblib
import numpy as np
import pandas as pd

# Cargar la información guardada de train
host_info = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/host_info.pkl')
mean_host_response_rate = host_info['mean_host_response_rate']
mean_host_acceptance_rate = host_info['mean_host_acceptance_rate']
mode_host_is_superhost = host_info['mode_host_is_superhost']

# Imputar 'host_is_superhost' con el valor más frecuente (moda) en el dataset
fd_test['host_is_superhost'] = fd_test['host_is_superhost'].fillna(mode_host_is_superhost)

# Convertir 'host_response_rate' y 'host_acceptance_rate' de texto a valores numéricos quitando el % y dividiendo el número por 100
if fd_test['host_response_rate'].dtype == 'O':
    fd_test['host_response_rate'] = fd_test['host_response_rate'].str.rstrip('%').astype(float) / 100
if fd_test['host_acceptance_rate'].dtype == 'O':
    fd_test['host_acceptance_rate'] = fd_test['host_acceptance_rate'].str.rstrip('%').astype(float) / 100

# Imputar 'host_response_rate' y 'host_acceptance_rate' basado en la media dentro de los grupos de 'host_is_superhost'
fd_test['host_response_rate'] = fd_test.apply(
    lambda row: mean_host_response_rate.get(row['host_is_superhost'], np.nan) if pd.isnull(row['host_response_rate']) else row['host_response_rate'], axis=1)
fd_test['host_acceptance_rate'] = fd_test.apply(
    lambda row: mean_host_acceptance_rate.get(row['host_is_superhost'], np.nan) if pd.isnull(row['host_acceptance_rate']) else row['host_acceptance_rate'], axis=1)

# Imputar 'host_response_time' basado en reglas condicionales en función de 'host_is_superhost' y 'host_response_rate'
def imputar_host_response_time(row):
    if pd.isnull(row['host_response_time']):
        if row['host_is_superhost'] == 't' or (row['host_response_rate'] is not None and row['host_response_rate'] >= 0.9):
            return 'within an hour'
        elif row['host_response_rate'] is not None and 0.5 <= row['host_response_rate'] < 0.9:
            return 'within a day'
        else:
            return 'a few days or more'
    return row['host_response_time']

# Aplicar la imputación condicional a 'host_response_time'
fd_test['host_response_time'] = fd_test.apply(imputar_host_response_time, axis=1)

# Verificar nulos restantes en las columnas host_*
print("Nulos restantes en host_response_time (test):", fd_test['host_response_time'].isnull().sum())
print("Nulos restantes en host_response_rate (test):", fd_test['host_response_rate'].isnull().sum())
print("Nulos restantes en host_acceptance_rate (test):", fd_test['host_acceptance_rate'].isnull().sum())
print("Nulos restantes en host_is_superhost (test):", fd_test['host_is_superhost'].isnull().sum())

Nulos restantes en host_response_time (test): 0
Nulos restantes en host_response_rate (test): 0
Nulos restantes en host_acceptance_rate (test): 0
Nulos restantes en host_is_superhost (test): 0


##NULLS 'neighbourhood', primero CODIFICACIÓN

In [79]:
# Verificar las categorías únicas en 'neighbourhood_cleansed'
unique_neighbourhood_cleansed = fd_test['neighbourhood_cleansed'].unique()
print(unique_neighbourhood_cleansed)
print(f'Número total de categorías únicas en neighbourhood_cleansed: {len(unique_neighbourhood_cleansed)}')

['South End' 'Dorchester' 'Brighton' 'Back Bay' 'West Roxbury'
 'Beacon Hill' 'Charlestown' 'North End' 'Jamaica Plain' 'Fenway'
 'Allston' 'East Boston' 'Roxbury' 'Leather District' 'Downtown'
 'Roslindale' 'Bay Village' 'Chinatown' 'South Boston' 'Mattapan'
 'South Boston Waterfront' 'Hyde Park' 'West End' 'Mission Hill'
 'Longwood Medical Area']
Número total de categorías únicas en neighbourhood_cleansed: 25


In [80]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

# Cargar el LabelEncoder y los centroides previamente guardados
le_combined = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/label_encoder_combined.pkl")
centroides = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/centroides_neighbourhood.pkl")

# Definición de la función haversine
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radio de la Tierra en kilómetros
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    return R * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))


# Limpieza y estandarización de las columnas 'neighbourhood' y 'neighbourhood_cleansed' en prueba
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.strip()
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r"\s+", " ", regex=True)
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r", ,", ",", regex=True)
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r"Dorchster", "Dorchester", regex=False)
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r"\s+,", ",", regex=True)
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r"\(.*\)", "", regex=True).str.strip()
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r", Massachusetts, United States", "", regex=False)
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.strip()
fd_test['neighbourhood'] = fd_test['neighbourhood'].str.replace(r"Jamaica Plain, Boston", "Jamaica Plain", regex=False)

fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.strip()
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r"\s+", " ", regex=True)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r", ,", ",", regex=True)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r"Dorchster", "Dorchester", regex=False)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r"\s+,", ",", regex=True)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r"\(.*\)", "", regex=True).str.strip()
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.replace(r", Massachusetts, United States", "", regex=False)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].str.strip()

# Manejar valores desconocidos para 'neighbourhood' usando los centroides
def imputar_barrio_mas_cercano(row, centroides):
    if row['neighbourhood'] not in le_combined.classes_:
        # Calcular distancias geográficas a los centroides y asignar el barrio más cercano
        distancias = centroides.apply(lambda x: haversine(row['latitude'], row['longitude'], x['latitude'], x['longitude']), axis=1)
        barrio_mas_cercano = distancias.idxmin()
        return barrio_mas_cercano
    return row['neighbourhood']

# Aplicar la imputación para barrios desconocidos
fd_test['neighbourhood'] = fd_test.apply(imputar_barrio_mas_cercano, axis=1, args=(centroides,))

# Codificar las columnas utilizando el LabelEncoder
fd_test['neighbourhood'] = fd_test['neighbourhood'].apply(
    lambda x: le_combined.transform([x])[0] if x in le_combined.classes_ else le_combined.transform(["<UNK>"])[0]
)
fd_test['neighbourhood_cleansed'] = fd_test['neighbourhood_cleansed'].apply(
    lambda x: le_combined.transform([x])[0] if x in le_combined.classes_ else le_combined.transform(["<UNK>"])[0]
)

print("Valores únicos resultantes de 'neighbourhood' y 'neighbourhood_cleansed' en test:")
print(fd_test['neighbourhood'].unique())
print(fd_test['neighbourhood_cleansed'].unique())


Valores únicos resultantes de 'neighbourhood' y 'neighbourhood_cleansed' en test:
[ 5  8 30  9 16 25  6 23 17 14 12 11  7]
[28 12  8  2 30  4  9 22 17 15  1 14 24 18 13 23  3 10 26 20 27 16 29 21
 19]


##Imputación 'neighbourhood'

In [81]:
from sklearn.impute import KNNImputer

import joblib

# Cargar el imputador KNN previamente ajustado en el conjunto de entrenamiento
imputer = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/knn_imputer_trained.pkl")

# Definir las columnas que se utilizarán para la imputación
columns_for_imputation = ['latitude', 'longitude', 'neighbourhood', 'neighbourhood_cleansed']

# Aplicar el imputador al conjunto de prueba
fd_test_imputed = imputer.transform(fd_test[columns_for_imputation])

# Actualizar el DataFrame con los valores imputados en 'neighbourhood'
fd_test['neighbourhood'] = fd_test_imputed[:, 2]

# Guardar el dataset de prueba imputado si es necesario
fd_test.to_csv('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed.csv', sep=';', index=False)

In [82]:
# Identificar columnas categóricas con valores nulos
categorical_columns = fd_test.select_dtypes(include=['object']).columns
categorical_null_columns = categorical_columns[fd_test[categorical_columns].isnull().any()]

# Contar los valores nulos en cada una de las columnas categóricas con nulos
categorical_null_counts = fd_test[categorical_null_columns].isnull().sum()
print(categorical_null_counts)


Series([], dtype: float64)


In [83]:
fd_test.shape

(622, 40)

##Ya no tenemos NULLS ahora convertiremos las categoricas a numericas restantes:

In [84]:
#cargar dataset fd_test_imputed
fd_test_imputed = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed.csv", sep=';')

##Comenzaré con las categorias host_

In [85]:
from datetime import datetime

fd_test_imputed['host_since'] = fd_test_imputed['host_since'].apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d'))
fd_test_imputed['years_being_host'] = fd_test_imputed['host_since'].apply(lambda x: datetime.now().year - x.year)

fd_test_imputed.drop(['host_since'], axis=1, inplace=True)

print(fd_test_imputed['years_being_host'].dtype)

int64


##'host_response_time' valores unicos

In [86]:
# Mapping para convertir los valores de 'host_response_time' en valores numéricos relevancia
response_time_mapping = {
    'within an hour': 4,
    'within a few hours': 3,
    'within a day': 2,
    'a few days or more': 1
}

# Aplicar la transformación a la columna 'host_response_time'
fd_test_imputed['host_response_time'] = fd_test_imputed['host_response_time'].map(response_time_mapping)

# Verificar los valores únicos después de la transformación
print(fd_test_imputed['host_response_time'].unique())

[4 3 1 2]


##'host_is_superhost' ,'host_has_profile_pic', 'host_identity_verified' e 'instant_bookable'

In [87]:
# Convertir las cadenas 't' y 'f' a valores numéricos binarios (0 y 1) usando .apply()

boolean_columns = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
# Convertir las cadenas 't' y 'f' a valores numéricos binarios (0 y 1) usando .apply()
for col in boolean_columns:
    fd_test_imputed[col] = fd_test_imputed[col].apply(lambda x: 1 if x == 't' else 0)

# Verificar los valores únicos para confirmar la transformación
for col in boolean_columns:
    print(f"Valores únicos en {col} (test):", fd_test_imputed[col].unique())


Valores únicos en host_is_superhost (test): [0 1]
Valores únicos en host_has_profile_pic (test): [1 0]
Valores únicos en host_identity_verified (test): [1 0]
Valores únicos en instant_bookable (test): [0 1]


##'host_verifications'

In [88]:
print("Valores únicos en 'host_verifications':")
print(fd_test_imputed['host_verifications'].unique())


Valores únicos en 'host_verifications':
["['email', 'phone']" "['phone', 'work_email']" "['phone']"
 "['email', 'phone', 'work_email']" "['email']"]


In [89]:
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# Cargar el escalador guardado desde train
scaler = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/scaler_verifications.pkl')

# Convertir las cadenas de texto en listas reales usando literal_eval
fd_test_imputed['host_verifications'] = fd_test_imputed['host_verifications'].apply(literal_eval)

# Normalizar las listas eliminando espacios innecesarios y asegurando un formato consistente
fd_test_imputed['host_verifications'] = fd_test_imputed['host_verifications'].apply(
    lambda x: [item.strip().lower() for item in x]
)

# Contar el número de verificaciones para cada anfitrión, asegurando que las listas vacías sean tratadas como 0
fd_test_imputed['num_host_verifications'] = fd_test_imputed['host_verifications'].apply(
    lambda x: len(x) if len(x) > 0 else 0  # Listas vacías -> 0
)

# Clipping en el rango esperado antes del escalado
fd_test_imputed['num_host_verifications'] = np.clip(
    fd_test_imputed['num_host_verifications'], scaler.data_min_[0], scaler.data_max_[0]
)

# Escalar los valores usando el escalador de train
fd_test_imputed['num_host_verifications'] = scaler.transform(
    fd_test_imputed[['num_host_verifications']]
)

# Eliminar la columna original 'host_verifications' para evitar redundancia
fd_test_imputed.drop('host_verifications', axis=1, inplace=True)

# Verificar las primeras filas para asegurarnos de que la transformación se realizó correctamente
print(fd_test_imputed[['num_host_verifications']].agg(['min', 'max']))


     num_host_verifications
min                0.336667
max                0.990000


In [90]:
# Ver las categorías únicas de 'num_host_verifications' y su frecuencia
unique_categories = fd_test_imputed['num_host_verifications'].value_counts()

print("Categorías únicas en 'num_host_verifications' y sus frecuencias:")
print(unique_categories)

# Si solo quieres los valores únicos sin la frecuencia:
unique_values = fd_test_imputed['num_host_verifications'].unique()
print("\nValores únicos en 'num_host_verifications':")
print(unique_values)


Categorías únicas en 'num_host_verifications' y sus frecuencias:
num_host_verifications
0.663333    490
0.990000     70
0.336667     62
Name: count, dtype: int64

Valores únicos en 'num_host_verifications':
[0.66333333 0.33666667 0.99      ]


In [91]:
# Verificar mínimo y máximo de la columna 'num_host_verifications' en fd_test_finpreprocesado
min_max_verifications = fd_test_imputed['num_host_verifications'].agg(['min', 'max'])
print("Mínimo y máximo de 'num_host_verifications' en fd_test_imputado:")
print(min_max_verifications)

Mínimo y máximo de 'num_host_verifications' en fd_test_imputado:
min    0.336667
max    0.990000
Name: num_host_verifications, dtype: float64


In [92]:
fd_test_imputed.shape

(622, 40)

##'property_type'

In [93]:
import joblib

# Cargar las medias calculadas para 'property_type' desde train
property_price_means = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/property_price_means.pkl")

# Crear una nueva columna 'property_type_encoded' en test
fd_test_imputed['property_type_encoded'] = fd_test_imputed['property_type'].map(property_price_means)

# Manejar valores no vistos en el diccionario, asignándoles la media global calculada en train
global_mean_price_train = sum(property_price_means.values()) / len(property_price_means)
fd_test_imputed['property_type_encoded'] = fd_test_imputed['property_type_encoded'].fillna(global_mean_price_train)

# Eliminar la columna original 'property_type'
fd_test_imputed.drop('property_type', axis=1, inplace=True)

# Verificar las primeras filas para asegurarnos de que la transformación se realizó correctamente
print(fd_test_imputed[['property_type_encoded']].head())


   property_type_encoded
0             195.894336
1              82.168798
2              86.526829
3             195.894336
4              86.526829


##'room_type'

In [94]:
import joblib

# Cargar las medias calculadas previamente y la media global desde el entrenamiento
room_data = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/room_price_means.pkl")
room_price_means = room_data['room_price_means']
global_mean_price_train = room_data['global_mean_price_train']

# Aplicar el promedio calculado en train para codificar la columna en test
fd_test_imputed['room_type_encoded'] = fd_test_imputed['room_type'].map(room_price_means)

# Manejar valores no vistos en train asignándoles la media global
fd_test_imputed['room_type_encoded'] = fd_test_imputed['room_type_encoded'].fillna(global_mean_price_train)

# Eliminar la columna original 'room_type' para evitar redundancia
fd_test_imputed.drop('room_type', axis=1, inplace=True)

# Verificar las primeras filas para asegurarnos de que la transformación se realizó correctamente
print(fd_test_imputed[['room_type_encoded']].head())

   room_type_encoded
0         223.597325
1         112.701754
2         112.701754
3         223.597325
4         112.701754


###Eliminamos columnas que finalmente son innecesarias

In [95]:
# Eliminar columnas innecesarias del conjunto test
fd_test_imputed.drop(columns=['listing_url', 'picture_url', 'amenities', 'id'], inplace=True)

# Verificar que las columnas han sido eliminadas
print("Columnas restantes en fd_test_imputed:", fd_test.columns)

Columnas restantes en fd_test_imputed: Index(['id', 'listing_url', 'picture_url', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'instant_bookable',
       'reviews_per_month', 'bathrooms_shared'],
      dtype='object')


In [96]:
fd_test_imputed.shape

(622, 36)

In [97]:
# Verificar si existen columnas no numéricas en fd_train_imputed
non_numeric_columns = fd_test_imputed.select_dtypes(exclude=['number']).columns

# Imprimir las columnas no numéricas
print("Columnas no numéricas en fd_test_imputed:", non_numeric_columns)


Columnas no numéricas en fd_test_imputed: Index([], dtype='object')


In [98]:
# Guardar el conjunto test imputado y con todas las columnas convertidas a numéricas
fd_test_imputed.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed_num.csv", sep=';', index=False)

# Confirmación del guardado
print("El archivo CSV con los datos imputados y numéricos ha sido guardado como 'fd_test_imputed_num.csv'")

El archivo CSV con los datos imputados y numéricos ha sido guardado como 'fd_test_imputed_num.csv'


In [99]:
# Cargar el conjunto test imputado y numérico desde el archivo CSV
fd_test_imputed_num = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed_num.csv", sep=';')

# Confirmación de la carga
print("El archivo CSV 'fd_test_imputed_num.csv' ha sido cargado exitosamente.")

El archivo CSV 'fd_test_imputed_num.csv' ha sido cargado exitosamente.


In [120]:
fd_test_imputed_num.isnull().sum(

)

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood,0
neighbourhood_cleansed,0
latitude,0


## **OUTLIERS**
"price"

In [101]:
# Contar registros antes de eliminar outliers
initial_test_count = fd_test_imputed_num.shape[0]

# Identificar índices de los registros que serán eliminados
outliers_test = fd_test_imputed_num[fd_test_imputed_num['price'] > 3000].index

# Eliminar registros
fd_test_imputed_num = fd_test_imputed_num.drop(outliers_test)

# Contar registros después de eliminar outliers
final_test_count = fd_test_imputed_num.shape[0]

# Mostrar cuántos registros se eliminaron
print(f"Registros eliminados en test: {initial_test_count - final_test_count}")
print(f"Registros restantes en test después del filtro: {final_test_count}")


Registros eliminados en test: 0
Registros restantes en test después del filtro: 622


Capping a 'number_of_reviews', 'reviews_per_month', 'host_total_listings_count', 'minimum_nights', 'maximum_nights'

In [102]:
import numpy as np
import pickle

# Cargar los thresholds calculados previamente en train
with open('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/capping_thresholds.pkl', 'rb') as f:
    capping_thresholds = pickle.load(f)

# Aplicar capping a las columnas en el conjunto de prueba
for column in ['number_of_reviews', 'reviews_per_month', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']:
    # Obtener el threshold para la columna
    threshold = capping_thresholds[column]

    # Aplicar capping en los valores que superan el threshold
    fd_test_imputed_num[column] = np.where(fd_test_imputed_num[column] > threshold, threshold, fd_test_imputed_num[column])

# Verificar algunos valores después del capping en fd_test
print("Valores después del capping en 'host_total_listings_count' (test):")
print(fd_test_imputed_num['host_total_listings_count'].sort_values(ascending=False).head())

Valores después del capping en 'host_total_listings_count' (test):
357    1436.0
254    1436.0
336    1436.0
239    1436.0
483    1436.0
Name: host_total_listings_count, dtype: float64


In [103]:
# Guardar el fd_test_imputed_num_out preprocesado CON TRABAJO DE OUTLIERS en un archivo CSV
fd_test_imputed_num.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed_num_out.csv", sep=';', index=False)

In [104]:
# Cargar el fd preprocesado CON TRABAJO DE OUTLIERS desde un archivo CSV
import pandas as pd
fd_test_imputed_num_out = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_imputed_num_out.csv", sep=';')

In [105]:
fd_test_imputed_num_out.shape

(622, 36)

Eliminar 'neighbourhood por que tiene mas correlacion con 'price' 'neighbourhood_cleansed'

In [106]:
#drop de 'neighbourhood'
fd_test_imputed_num_out.drop(columns=['neighbourhood'], inplace=True)

In [107]:
fd_test_imputed_num_out.shape

(622, 35)

In [121]:
fd_test_imputed_num_out.isnull().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0


# **NORMALIZACIÓN**

####Normalizare 'bedrooms', 'beds', 'accommodates', 'bathrooms'

In [108]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Cargar el escalador entrenado en train
scaler_bedrooms_beds_adjusted = joblib.load('/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/scaler_bedrooms_beds_adjusted.pkl')

# Aplicar el escalador a 'bedrooms', 'beds', 'accommodates', y 'bathrooms' en prueba
fd_test_imputed_num_out[['bedrooms', 'beds', 'accommodates', 'bathrooms']] = scaler_bedrooms_beds_adjusted.transform(
    fd_test_imputed_num_out[['bedrooms', 'beds', 'accommodates', 'bathrooms']]
)

# Verificar los valores después del ajuste
adjusted_bedrooms_beds_stats_test = fd_test_imputed_num_out[['bedrooms', 'beds', 'accommodates', 'bathrooms']].agg(['min', 'max'])

# Mostrar los resultados
print(adjusted_bedrooms_beds_stats_test)

     bedrooms      beds  accommodates  bathrooms
min      0.01  0.010000          0.01   0.010000
max      0.99  0.616667          0.99   0.826667


# **Columnas Numéricas para Normalización**
## **Transformación Logarítmica (log1p)**


In [109]:
import joblib

# Cargar la información de la transformación desde Train
price_transform = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/price_transform.pkl")

# Aplicar log1p a la variable objetivo 'price' en Val y Test
fd_test_imputed_num_out['price'] = np.log1p(fd_test_imputed_num_out['price'])

print("✅ Transformación logarítmica aplicada en `price` de Test.")



✅ Transformación logarítmica aplicada en `price` de Test.


In [110]:
# Cargar las columnas a transformar
columns_to_log_transform = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/log_transform_columns.pkl")

# Aplicar transformación en test
for col in columns_to_log_transform:
    fd_test_imputed_num_out[col] = np.log1p(fd_test_imputed_num_out[col])  # log(1 + x)

print("✅ Transformación logarítmica aplicada en `test` usando los mismos parámetros de `train`.")


✅ Transformación logarítmica aplicada en `test` usando los mismos parámetros de `train`.


# **Embeddings**


In [111]:
# Variables categóricas que usaremos en embeddings
categorical_columns = ['host_response_time', 'neighbourhood_cleansed',
                       'property_type_encoded', 'room_type_encoded']

# Cargar los mapeos guardados en train
category_mappings = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/category_mappings.pkl")

# Aplicar los mapeos en test
for col in categorical_columns:
    fd_test_imputed_num_out[col] = fd_test_imputed_num_out[col].map(category_mappings[col])

print("Transformación de variables categóricas a índices en Test completada.")


Transformación de variables categóricas a índices en Test completada.


## **StandardScaler para variables con alta varianza**

## **StandardScaler para Geo**

In [112]:
# Cargar el scaler de train
scaler_geo = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/standard_scaler_geo.pkl")

# Aplicar transformación en test

fd_test_imputed_num_out[['latitude', 'longitude']] = scaler_geo.transform(fd_test_imputed_num_out[['latitude', 'longitude']])


## **MinMaxScaler**

In [113]:

columns_to_minmax = [
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value'
]

# Cargar scaler de train
scaler_minmax = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/minmax_scaler.pkl")

# Aplicar MinMaxScaler en test

fd_test_imputed_num_out[columns_to_minmax] = scaler_minmax.transform(fd_test_imputed_num_out[columns_to_minmax])

print("MinMaxScaler aplicado en Test correctamente.")


MinMaxScaler aplicado en Test correctamente.


In [114]:
# Definir las columnas
columns_to_scale = ['availability_365', 'years_being_host']


# Cargar el scaler guardado de train
scaler = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/minmax_scaler_availability_host.pkl")

# Aplicar el escalado en val y test
fd_test_imputed_num_out[columns_to_scale] = scaler.transform(fd_test_imputed_num_out[columns_to_scale])

print("✔️ MinMaxScaler aplicado correctamente test.")


✔️ MinMaxScaler aplicado correctamente en val y test.


###*'neighbourhood_cleansed'* ya paso por labelencoder y esta codficada.

No necesita en este momento normalizacion, puede trabajarse mas segun el modelo que haga si es con redes neuronales o arboles, etc.

In [115]:
# Guardar el fd_test_imputed_num_out PARA CORRELACIONAR
fd_test_imputed_num_out.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_acorrelacion.csv", sep=';', index=False)

In [116]:
# Cargar el fd_test_acorrelacion
import pandas as pd
fd_test_acorrelacion= pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_acorrelacion.csv", sep=';')

In [117]:
fd_test_acorrelacion.shape

(622, 35)

In [118]:
fd_test_acorrelacion.isnull().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0


In [54]:
import pandas as pd

# Columnas sin truncar
pd.set_option('display.max_columns', None)

print("\nMínimo y máximo en cada columna de fd_test_finpreprocesado:")
min_max_test = fd_test_acorrelacion.agg(['min', 'max'])
print(min_max_test)

# Opcional: Restaurar configuración para evitar demasiadas columnas en futuras salidas
pd.reset_option('display.max_columns')


Mínimo y máximo en cada columna de fd_test_finpreprocesado:
     host_response_time  host_response_rate  host_acceptance_rate  \
min                   0                 0.0                   0.0   
max                   3                 1.0                   1.0   

     host_is_superhost  host_total_listings_count  host_has_profile_pic  \
min                  0                   0.693147                     0   
max                  1                   7.270313                     1   

     host_identity_verified  neighbourhood_cleansed  latitude  longitude  \
min                       0                       0 -3.077416  -2.684634   
max                       1                      24  1.994752   2.519020   

     accommodates  bathrooms  bedrooms      beds     price  minimum_nights  \
min          0.01   0.010000      0.01  0.010000  3.433987        0.693147   
max          0.99   0.826667      0.99  0.616667  7.378384        4.521789   

     maximum_nights  availability_365  nu

# **CORRELACIONAR**

**review_scores_rating vs. review_scores_accuracy vs. review_scores_value**

In [55]:
# Definir las columnas de reviews con correlación extrema
review_columns_extreme = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_value']


# Cargar la media calculada en train
review_overall_mean_train = joblib.load("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/pkl/review_overall_mean.pkl")

# Aplicar en test
fd_test_acorrelacion['review_overall_score'] = fd_test_acorrelacion[review_columns_extreme].mean(axis=1)
fd_test_acorrelacion['review_overall_score'].fillna(review_overall_mean_train, inplace=True)
fd_test_acorrelacion.drop(columns=review_columns_extreme, inplace=True)

# Verificar
print(fd_test_acorrelacion[['review_overall_score']].describe())


       review_overall_score
count            622.000000
mean               0.935571
std                0.078716
min                0.066667
25%                0.923417
50%                0.952500
75%                0.971667
max                1.000000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_test_acorrelacion['review_overall_score'].fillna(review_overall_mean_train, inplace=True)


In [56]:
# Eliminar las columnas mas correlacionadas entre si de las reseñas
fd_test_acorrelacion.drop(columns=['number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d'], inplace=True)

## **Creación de nuevas variables**

In [57]:
# Aplicar la misma fórmula que en train para test
fd_test_acorrelacion['price_per_person'] = fd_test_acorrelacion['price'] / fd_test_acorrelacion['accommodates']

# Verificar que la nueva variable fue creada correctamente en test
print(fd_test_acorrelacion[['price', 'accommodates', 'price_per_person']].head())


      price  accommodates  price_per_person
0  4.672829      0.075333         62.028701
1  3.931826      0.010000        393.182563
2  4.094345      0.075333         54.349707
3  4.836282      0.075333         64.198432
4  3.688879      0.010000        368.887945


In [58]:
# Aplicar la misma lógica para test
price_per_neighborhood_test = fd_test_acorrelacion.groupby('neighbourhood_cleansed')['price'].transform('mean')
fd_test_acorrelacion['price_per_neighborhood'] = price_per_neighborhood_test

# Verificar que la nueva variable fue creada correctamente en test
print(fd_test_acorrelacion[['price', 'neighbourhood_cleansed', 'price_per_neighborhood']].head())


      price  neighbourhood_cleansed  price_per_neighborhood
0  4.672829                      22                5.066246
1  3.931826                       7                4.648171
2  4.094345                       4                4.686187
3  4.836282                       1                4.990125
4  3.688879                       7                4.648171


In [59]:
# Aplicar la misma lógica para test
avg_price_by_property_type_test = fd_test_acorrelacion.groupby('property_type_encoded')['price'].transform('mean')
fd_test_acorrelacion['avg_price_by_property_type'] = avg_price_by_property_type_test

# Verificar que la nueva variable fue creada correctamente en test
print(fd_test_acorrelacion[['price', 'property_type_encoded', 'avg_price_by_property_type']].head())


      price  property_type_encoded  avg_price_by_property_type
0  4.672829                   22.0                    5.011296
1  3.931826                    9.0                    4.199577
2  4.094345                   11.0                    4.318779
3  4.836282                   22.0                    5.011296
4  3.688879                   11.0                    4.318779


# **Guardar el fd_test_acorrelacion**

In [64]:
print(fd_test_finpreprocesado.isnull().sum().sum())

2


In [122]:
fd_test_finpreprocesado.isnull().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0


In [124]:
# Filtrar los registros con valores nulos en las columnas específicas
df_with_nulls = fd_test_finpreprocesado[fd_test_finpreprocesado['property_type_encoded'].isnull() | fd_test_finpreprocesado['avg_price_by_property_type'].isnull()]

# Ver los registros con valores nulos
df_with_nulls


Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,...,reviews_per_month,bathrooms_shared,years_being_host,num_host_verifications,property_type_encoded,room_type_encoded,review_overall_score,price_per_person,price_per_neighborhood,avg_price_by_property_type
201,3,1.0,1.0,0,1.94591,1,1,7,-1.187409,0.293601,...,1.088562,1,0.5,0.663333,,0,0.9085,59.878666,4.648171,


In [125]:
# Cargar fd_train_finpreprocesado para usar caluclo de media
fd_train_finpreprocesado = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_train_finpreprocesado.csv", sep=';')

In [126]:
# Imputar los valores nulos en test usando las medias calculadas en train
fd_test_finpreprocesado['property_type_encoded'].fillna(fd_train_finpreprocesado['property_type_encoded'].mean(), inplace=True)
fd_test_finpreprocesado['avg_price_by_property_type'].fillna(fd_train_finpreprocesado['avg_price_by_property_type'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_test_finpreprocesado['property_type_encoded'].fillna(fd_train_finpreprocesado['property_type_encoded'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fd_test_finpreprocesado['avg_price_by_property_type'].fillna(fd_train_finpreprocesado['avg_price_by_prop

In [10]:
fd_test_finpreprocesado.isnull().sum()

Unnamed: 0,0
host_response_time,0
host_response_rate,0
host_acceptance_rate,0
host_is_superhost,0
host_total_listings_count,0
host_has_profile_pic,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0


In [9]:
# Guardar el fd_test_acorrelacion con final de preprocesado y NORMALIZACION en un archivo CSV fd_test_finpreprocesado.
fd_test_finpreprocesado.to_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_finpreprocesado.csv", sep=';', index=False)


In [4]:
# Cargar el fd_test_finpreprocesado.csv
import pandas as pd
fd_test_finpreprocesado = pd.read_csv("/content/drive/MyDrive/Deep_Learning_2024/Notebooks/data/fd_test_finpreprocesado.csv", sep=';')

In [6]:
import numpy as np
# Aplicar log a las nuevas variables para que tengan la misma escala
fd_test_finpreprocesado['price_per_person_log'] = np.log(fd_test_finpreprocesado['price_per_person'])
fd_test_finpreprocesado['price_per_neighborhood_log'] = np.log(fd_test_finpreprocesado['price_per_neighborhood'])
fd_test_finpreprocesado['avg_price_by_property_type_log'] = np.log(fd_test_finpreprocesado['avg_price_by_property_type'])

In [7]:
# Eliminar las columnas mas correlacionadas entre si de las reseñas
fd_test_finpreprocesado.drop(columns=['price_per_person', 'price_per_neighborhood','avg_price_by_property_type'], inplace=True)

In [8]:
fd_test_finpreprocesado.shape

(622, 33)