# Análisis de Regresión Logística - 10 Casos Corregidos sobre Datos de Cambridge

In [None]:
# Importamos librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score


In [None]:
# Load the listings data
df = pd.read_csv("Datos_limpios_Cambridge.csv")

# Basic imputation: propagate the previous row's value forward, then fill
# backward so that gaps at the very top of a column are covered as well.
# DataFrame.ffill/bfill are used instead of fillna(method=...), which is
# deprecated in recent pandas releases.
df.ffill(inplace=True)
df.bfill(inplace=True)

# Column-specific cleaning: strip "$" and "," from price before casting.
# A raw string keeps the "\$" escape from triggering an invalid-escape warning.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

def limpiar_bathrooms(bathroom):
    """Extract a numeric bathroom count from a free-text value.

    Every character that is a digit or a '.' is kept (in order) and the
    result is parsed as a float, so "1.5 baths" becomes 1.5.

    Args:
        bathroom: Raw cell value; it is converted with ``str()`` before
            filtering, so numbers are accepted as well as text.

    Returns:
        The parsed float, or ``np.nan`` when the value is null or when the
        kept characters do not form a valid float (for example no digits at
        all, or more than one '.').
    """
    if pd.isnull(bathroom):
        return np.nan
    number = ''.join(c for c in str(bathroom) if c.isdigit() or c == '.')
    try:
        return float(number)
    except ValueError:
        # Only a failed parse is treated as "missing"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return np.nan

# Turn the free-text bathroom description into a float (NaN when unparseable)
df['bathrooms_text'] = df['bathrooms_text'].apply(limpiar_bathrooms)

# Rate columns: remove the "%" sign, then coerce to numbers (bad values -> NaN)
for columna_tasa in ('host_acceptance_rate', 'host_response_rate'):
    sin_porcentaje = df[columna_tasa].replace('%', '', regex=True)
    df[columna_tasa] = pd.to_numeric(sin_porcentaje, errors='coerce')

df['calculated_host_listings_count'] = pd.to_numeric(
    df['calculated_host_listings_count'], errors='coerce'
)

# Binary flags: recode 't'/'f' as 1/0 (any other value becomes NaN via map)
codigos_binarios = {'t': 1, 'f': 0}
for columna_bandera in ('host_is_superhost', 'instant_bookable'):
    df[columna_bandera] = df[columna_bandera].map(codigos_binarios)


In [None]:
# Función para convertir variable continua a dicotómica
def convertir_dicotomica(df, columna, umbral=None):
    """Dichotomize a column into 0/1 around a cut-off.

    Args:
        df: DataFrame holding the column.
        columna: Name of the column to dichotomize.
        umbral: Cut-off value. When ``None``, the column's median is used.

    Returns:
        An integer Series with 1 where the value is greater than or equal to
        the cut-off and 0 otherwise. Missing values compare as False and
        therefore come out as 0.
    """
    valores = df[columna]
    corte = valores.median() if umbral is None else umbral
    por_encima = valores >= corte
    return por_encima.astype(int)

# Build the 0/1 indicator columns (new column <- source column), each one
# split at the default cut-off of convertir_dicotomica
columnas_binarias = {
    'reviews_per_month_bin': 'reviews_per_month',
    'availability_bin': 'availability_365',
    'review_score_cleanliness_bin': 'review_scores_cleanliness',
    'review_score_location_bin': 'review_scores_location',
    'price_bin': 'price',
}
for columna_nueva, columna_origen in columnas_binarias.items():
    df[columna_nueva] = convertir_dicotomica(df, columna_origen)


In [None]:
# Accumulator for the per-case results: each case appends one dict with its
# description and its precision, accuracy and recall
resultados = []


In [None]:
# Case 1: model host_is_superhost from host_acceptance_rate and number_of_reviews
predictoras1 = ['host_acceptance_rate', 'number_of_reviews']
objetivo1 = 'host_is_superhost'
X1, y1 = df[predictoras1], df[objetivo1]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool1 = y1.dtype == 'int64' or y1.dtype == 'bool'
if not es_entero_o_bool1 and y1.nunique() > 2:
    y1 = convertir_dicotomica(df, objetivo1)

# Keep only the rows where the predictors and the target are all present
data1 = pd.concat([X1, y1], axis=1).dropna()
X1, y1 = data1[predictoras1], data1[objetivo1]

# 70/30 train/test split; no seed is fixed here
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler1 = StandardScaler()
X1_train = scaler1.fit_transform(X1_train)
X1_test = scaler1.transform(X1_test)

modelo1 = LogisticRegression().fit(X1_train, y1_train)
y1_pred = modelo1.predict(X1_test)

metricas1 = {
    'Precisión': precision_score(y1_test, y1_pred),
    'Exactitud': accuracy_score(y1_test, y1_pred),
    'Sensibilidad': recall_score(y1_test, y1_pred),
}
resultados.append({
    'Caso': 1,
    'Dependiente': objetivo1,
    'Independientes': ', '.join(predictoras1),
    **metricas1,
})


In [None]:
# Case 2: model instant_bookable from host_response_rate and beds
predictoras2 = ['host_response_rate', 'beds']
objetivo2 = 'instant_bookable'
X2, y2 = df[predictoras2], df[objetivo2]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool2 = y2.dtype == 'int64' or y2.dtype == 'bool'
if not es_entero_o_bool2 and y2.nunique() > 2:
    y2 = convertir_dicotomica(df, objetivo2)

# Keep only the rows where the predictors and the target are all present
data2 = pd.concat([X2, y2], axis=1).dropna()
X2, y2 = data2[predictoras2], data2[objetivo2]

# 70/30 train/test split; no seed is fixed here
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler2 = StandardScaler()
X2_train = scaler2.fit_transform(X2_train)
X2_test = scaler2.transform(X2_test)

modelo2 = LogisticRegression().fit(X2_train, y2_train)
y2_pred = modelo2.predict(X2_test)

metricas2 = {
    'Precisión': precision_score(y2_test, y2_pred),
    'Exactitud': accuracy_score(y2_test, y2_pred),
    'Sensibilidad': recall_score(y2_test, y2_pred),
}
resultados.append({
    'Caso': 2,
    'Dependiente': objetivo2,
    'Independientes': ', '.join(predictoras2),
    **metricas2,
})


In [None]:
# Case 3: model reviews_per_month_bin from availability_365 and bedrooms
predictoras3 = ['availability_365', 'bedrooms']
objetivo3 = 'reviews_per_month_bin'
X3, y3 = df[predictoras3], df[objetivo3]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool3 = y3.dtype == 'int64' or y3.dtype == 'bool'
if not es_entero_o_bool3 and y3.nunique() > 2:
    y3 = convertir_dicotomica(df, objetivo3)

# Keep only the rows where the predictors and the target are all present
data3 = pd.concat([X3, y3], axis=1).dropna()
X3, y3 = data3[predictoras3], data3[objetivo3]

# 70/30 train/test split; no seed is fixed here
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler3 = StandardScaler()
X3_train = scaler3.fit_transform(X3_train)
X3_test = scaler3.transform(X3_test)

modelo3 = LogisticRegression().fit(X3_train, y3_train)
y3_pred = modelo3.predict(X3_test)

metricas3 = {
    'Precisión': precision_score(y3_test, y3_pred),
    'Exactitud': accuracy_score(y3_test, y3_pred),
    'Sensibilidad': recall_score(y3_test, y3_pred),
}
resultados.append({
    'Caso': 3,
    'Dependiente': objetivo3,
    'Independientes': ', '.join(predictoras3),
    **metricas3,
})


In [None]:
# Case 4: model availability_bin from accommodates and price
predictoras4 = ['accommodates', 'price']
objetivo4 = 'availability_bin'
X4, y4 = df[predictoras4], df[objetivo4]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool4 = y4.dtype == 'int64' or y4.dtype == 'bool'
if not es_entero_o_bool4 and y4.nunique() > 2:
    y4 = convertir_dicotomica(df, objetivo4)

# Keep only the rows where the predictors and the target are all present
data4 = pd.concat([X4, y4], axis=1).dropna()
X4, y4 = data4[predictoras4], data4[objetivo4]

# 70/30 train/test split; no seed is fixed here
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler4 = StandardScaler()
X4_train = scaler4.fit_transform(X4_train)
X4_test = scaler4.transform(X4_test)

modelo4 = LogisticRegression().fit(X4_train, y4_train)
y4_pred = modelo4.predict(X4_test)

metricas4 = {
    'Precisión': precision_score(y4_test, y4_pred),
    'Exactitud': accuracy_score(y4_test, y4_pred),
    'Sensibilidad': recall_score(y4_test, y4_pred),
}
resultados.append({
    'Caso': 4,
    'Dependiente': objetivo4,
    'Independientes': ', '.join(predictoras4),
    **metricas4,
})


In [None]:
# Case 5: model review_score_cleanliness_bin from bedrooms and number_of_reviews
predictoras5 = ['bedrooms', 'number_of_reviews']
objetivo5 = 'review_score_cleanliness_bin'
X5, y5 = df[predictoras5], df[objetivo5]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool5 = y5.dtype == 'int64' or y5.dtype == 'bool'
if not es_entero_o_bool5 and y5.nunique() > 2:
    y5 = convertir_dicotomica(df, objetivo5)

# Keep only the rows where the predictors and the target are all present
data5 = pd.concat([X5, y5], axis=1).dropna()
X5, y5 = data5[predictoras5], data5[objetivo5]

# 70/30 train/test split; no seed is fixed here
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5, y5, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler5 = StandardScaler()
X5_train = scaler5.fit_transform(X5_train)
X5_test = scaler5.transform(X5_test)

modelo5 = LogisticRegression().fit(X5_train, y5_train)
y5_pred = modelo5.predict(X5_test)

metricas5 = {
    'Precisión': precision_score(y5_test, y5_pred),
    'Exactitud': accuracy_score(y5_test, y5_pred),
    'Sensibilidad': recall_score(y5_test, y5_pred),
}
resultados.append({
    'Caso': 5,
    'Dependiente': objetivo5,
    'Independientes': ', '.join(predictoras5),
    **metricas5,
})


In [None]:
# Case 6: model review_score_location_bin from beds and bathrooms_text
predictoras6 = ['beds', 'bathrooms_text']
objetivo6 = 'review_score_location_bin'
X6, y6 = df[predictoras6], df[objetivo6]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool6 = y6.dtype == 'int64' or y6.dtype == 'bool'
if not es_entero_o_bool6 and y6.nunique() > 2:
    y6 = convertir_dicotomica(df, objetivo6)

# Keep only the rows where the predictors and the target are all present
data6 = pd.concat([X6, y6], axis=1).dropna()
X6, y6 = data6[predictoras6], data6[objetivo6]

# 70/30 train/test split; no seed is fixed here
X6_train, X6_test, y6_train, y6_test = train_test_split(
    X6, y6, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler6 = StandardScaler()
X6_train = scaler6.fit_transform(X6_train)
X6_test = scaler6.transform(X6_test)

modelo6 = LogisticRegression().fit(X6_train, y6_train)
y6_pred = modelo6.predict(X6_test)

metricas6 = {
    'Precisión': precision_score(y6_test, y6_pred),
    'Exactitud': accuracy_score(y6_test, y6_pred),
    'Sensibilidad': recall_score(y6_test, y6_pred),
}
resultados.append({
    'Caso': 6,
    'Dependiente': objetivo6,
    'Independientes': ', '.join(predictoras6),
    **metricas6,
})


In [None]:
# Case 7: model price_bin from accommodates and reviews_per_month
predictoras7 = ['accommodates', 'reviews_per_month']
objetivo7 = 'price_bin'
X7, y7 = df[predictoras7], df[objetivo7]

# Dichotomize the target only when it is neither int64 nor bool and takes
# more than two distinct values
es_entero_o_bool7 = y7.dtype == 'int64' or y7.dtype == 'bool'
if not es_entero_o_bool7 and y7.nunique() > 2:
    y7 = convertir_dicotomica(df, objetivo7)

# Keep only the rows where the predictors and the target are all present
data7 = pd.concat([X7, y7], axis=1).dropna()
X7, y7 = data7[predictoras7], data7[objetivo7]

# 70/30 train/test split; no seed is fixed here
X7_train, X7_test, y7_train, y7_test = train_test_split(
    X7, y7, test_size=0.3, random_state=None
)

# Standardize using statistics fitted on the training rows only
scaler7 = StandardScaler()
X7_train = scaler7.fit_transform(X7_train)
X7_test = scaler7.transform(X7_test)

modelo7 = LogisticRegression().fit(X7_train, y7_train)
y7_pred = modelo7.predict(X7_test)

metricas7 = {
    'Precisión': precision_score(y7_test, y7_pred),
    'Exactitud': accuracy_score(y7_test, y7_pred),
    'Sensibilidad': recall_score(y7_test, y7_pred),
}
resultados.append({
    'Caso': 7,
    'Dependiente': objetivo7,
    'Independientes': ', '.join(predictoras7),
    **metricas7,
})


In [None]:
# Case 8: model a dichotomized host_acceptance_rate from number_of_reviews
# and bedrooms
predictoras8 = ['number_of_reviews', 'bedrooms']
objetivo8 = 'host_acceptance_rate'

# Drop incomplete rows BEFORE dichotomizing: convertir_dicotomica compares
# with ">=", which is False for NaN, so a missing target would otherwise be
# labelled 0 instead of being removed.
data8 = df[predictoras8 + [objetivo8]].dropna()
X8 = data8[predictoras8]
y8 = data8[objetivo8]

# Dichotomize whenever the target is not already coded 0/1. The dtype is
# deliberately not consulted: an int64 column with many distinct values is
# still multiclass, and precision_score/recall_score (binary averaging by
# default) raise ValueError on such a target.
if not y8.isin([0, 1]).all():
    # Cut at the median of the full column, the same default threshold
    # convertir_dicotomica would compute on df
    y8 = convertir_dicotomica(data8, objetivo8, umbral=df[objetivo8].median())

# 70/30 train/test split; no seed is fixed here
X8_train, X8_test, y8_train, y8_test = train_test_split(X8, y8, test_size=0.3, random_state=None)

# Standardize using statistics fitted on the training rows only
scaler8 = StandardScaler()
X8_train = scaler8.fit_transform(X8_train)
X8_test = scaler8.transform(X8_test)

modelo8 = LogisticRegression()
modelo8.fit(X8_train, y8_train)
y8_pred = modelo8.predict(X8_test)

resultados.append({
    'Caso': 8,
    'Dependiente': 'host_acceptance_rate',
    'Independientes': 'number_of_reviews, bedrooms',
    'Precisión': precision_score(y8_test, y8_pred),
    'Exactitud': accuracy_score(y8_test, y8_pred),
    'Sensibilidad': recall_score(y8_test, y8_pred)
})


In [None]:
# Case 9: model a dichotomized host_response_rate from availability_365
# and bathrooms_text
predictoras9 = ['availability_365', 'bathrooms_text']
objetivo9 = 'host_response_rate'

# Drop incomplete rows BEFORE dichotomizing: convertir_dicotomica compares
# with ">=", which is False for NaN, so a missing target would otherwise be
# labelled 0 instead of being removed.
data9 = df[predictoras9 + [objetivo9]].dropna()
X9 = data9[predictoras9]
y9 = data9[objetivo9]

# Dichotomize whenever the target is not already coded 0/1. The dtype is
# deliberately not consulted: an int64 column with many distinct values is
# still multiclass, and precision_score/recall_score (binary averaging by
# default) raise ValueError on such a target.
if not y9.isin([0, 1]).all():
    # Cut at the median of the full column, the same default threshold
    # convertir_dicotomica would compute on df
    y9 = convertir_dicotomica(data9, objetivo9, umbral=df[objetivo9].median())

# 70/30 train/test split; no seed is fixed here
X9_train, X9_test, y9_train, y9_test = train_test_split(X9, y9, test_size=0.3, random_state=None)

# Standardize using statistics fitted on the training rows only
scaler9 = StandardScaler()
X9_train = scaler9.fit_transform(X9_train)
X9_test = scaler9.transform(X9_test)

modelo9 = LogisticRegression()
modelo9.fit(X9_train, y9_train)
y9_pred = modelo9.predict(X9_test)

resultados.append({
    'Caso': 9,
    'Dependiente': 'host_response_rate',
    'Independientes': 'availability_365, bathrooms_text',
    'Precisión': precision_score(y9_test, y9_pred),
    'Exactitud': accuracy_score(y9_test, y9_pred),
    'Sensibilidad': recall_score(y9_test, y9_pred)
})


In [None]:
# Case 10: model a dichotomized calculated_host_listings_count from
# reviews_per_month and bedrooms
predictoras10 = ['reviews_per_month', 'bedrooms']
objetivo10 = 'calculated_host_listings_count'

# Drop incomplete rows BEFORE dichotomizing: convertir_dicotomica compares
# with ">=", which is False for NaN, so a missing target would otherwise be
# labelled 0 instead of being removed.
data10 = df[predictoras10 + [objetivo10]].dropna()
X10 = data10[predictoras10]
y10 = data10[objetivo10]

# Dichotomize whenever the target is not already coded 0/1. The dtype is
# deliberately not consulted: an int64 column with many distinct values is
# still multiclass, and precision_score/recall_score (binary averaging by
# default) raise ValueError on such a target.
if not y10.isin([0, 1]).all():
    # Cut at the median of the full column, the same default threshold
    # convertir_dicotomica would compute on df
    y10 = convertir_dicotomica(data10, objetivo10, umbral=df[objetivo10].median())

# 70/30 train/test split; no seed is fixed here
X10_train, X10_test, y10_train, y10_test = train_test_split(X10, y10, test_size=0.3, random_state=None)

# Standardize using statistics fitted on the training rows only
scaler10 = StandardScaler()
X10_train = scaler10.fit_transform(X10_train)
X10_test = scaler10.transform(X10_test)

modelo10 = LogisticRegression()
modelo10.fit(X10_train, y10_train)
y10_pred = modelo10.predict(X10_test)

resultados.append({
    'Caso': 10,
    'Dependiente': 'calculated_host_listings_count',
    'Independientes': 'reviews_per_month, bedrooms',
    'Precisión': precision_score(y10_test, y10_pred),
    'Exactitud': accuracy_score(y10_test, y10_pred),
    'Sensibilidad': recall_score(y10_test, y10_pred)
})


In [None]:
# Show the final results: one row per dict accumulated in `resultados`.
# The bare expression on the last line is what makes the notebook render the table.
tabla_resultados = pd.DataFrame(resultados)
tabla_resultados
