In [50]:
# =========================================================
# IMPORTS + DATA
# =========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from imblearn.over_sampling import RandomOverSampler

# --- Load the dataset (adjust the file name if needed) ---
df = pd.read_csv('estocolmofinal.csv')

# Lookup table used by encode_bool: boolean-like strings -> 0/1
BOOL_MAP = {
    't': 1, 'f': 0,
    'true': 1, 'false': 0,
    'yes': 1, 'no': 0,
    'True': 1, 'False': 0,
    'Yes': 1, 'No': 0,
}

def clean_money(series):
    """Parse currency-formatted values (e.g. ``$1,234.50``) into numbers.

    Every value is stringified, dollar signs and commas are removed, and the
    remainder is parsed with ``pd.to_numeric``. Values that still cannot be
    parsed come back as NaN.
    """
    as_text = series.astype(str)
    stripped = as_text.str.replace(r'[\$,]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')

def clean_percent(series):
    """Parse percentage-formatted values (e.g. ``95%``) into numbers.

    Every value is stringified, each literal ``%`` character is removed, and
    the remainder is parsed with ``pd.to_numeric``. Unparsable values become
    NaN. The result is on the same scale as the input text (``95%`` -> 95).
    """
    as_text = series.astype(str)
    without_sign = as_text.str.replace('%', '', regex=False)
    return pd.to_numeric(without_sign, errors='coerce')

def encode_bool(series, mapping=None):
    """Encode boolean-like values as numbers (typically 0/1).

    Values are stringified, stripped of surrounding whitespace and lower-cased
    before being looked up, so ``'t'``, ``'T'``, ``' True '`` and ``'TRUE'``
    are all treated the same way. Values not found in the lookup table fall
    back to a numeric parse of the original value; anything that is neither a
    known label nor numeric becomes NaN.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    mapping : dict, optional
        Label -> number lookup table. Defaults to the module-level
        ``BOOL_MAP``. Keys are compared case-insensitively; if two keys differ
        only by case, the later one wins.

    Returns
    -------
    pandas.Series
        Numeric series aligned with ``series``.
    """
    lookup_source = BOOL_MAP if mapping is None else mapping
    # Normalise keys the same way the values are normalised below.
    lookup = {str(key).strip().lower(): value for key, value in lookup_source.items()}

    normalized = series.astype(str).str.strip().str.lower()
    mapped = pd.to_numeric(normalized.map(lookup), errors='coerce')

    # Unmapped entries keep their numeric value if they have one (e.g. 1, 0, '1').
    fallback = pd.to_numeric(series, errors='coerce')
    return mapped.fillna(fallback)


def split_scale(X, y, test_size=0.3, random_state=42, try_stratify=True):
    """Split into train/test partitions and standardise the features.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    test_size : fraction (or count) of rows assigned to the test partition.
    random_state : seed forwarded to ``train_test_split``.
    try_stratify : when True, stratify on ``y`` if that is feasible; otherwise
        fall back to a plain random split.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
        The two feature partitions are the outputs of ``StandardScaler``; the
        scaler is fitted on the training partition only and then applied to
        the test partition.
    """
    strat = None
    if try_stratify:
        class_counts = pd.Series(y).value_counts()
        # Stratification needs at least two classes, and a stratified split
        # is rejected when any class has fewer than two members, so fall back
        # to an unstratified split instead of failing in that case.
        if len(class_counts) >= 2 and class_counts.min() >= 2:
            strat = y

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=strat
    )

    # Fit on train only so no test-set statistics leak into the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

def eval_and_print(y_test, y_pred, title=None):
    """Print a classification report for one model.

    Prints the optional ``title``, then the confusion matrix, followed by
    precision, accuracy, recall and F1. Precision, recall and F1 are computed
    for the positive class (label 1) and report 0 instead of warning when they
    are undefined. Nothing is returned.
    """
    if title:
        print(title)
    print('Matriz de Confusión:\n', confusion_matrix(y_test, y_pred))

    positive_class = {'pos_label': 1, 'zero_division': 0}
    report_rows = (
        ('Precisión:', precision_score, positive_class),
        ('Exactitud:', accuracy_score, {}),
        ('Sensibilidad:', recall_score, positive_class),
        ('F1:', f1_score, positive_class),
    )
    # Each score is computed right before it is printed, in report order.
    for label, scorer, extra_kwargs in report_rows:
        print(label, scorer(y_test, y_pred, **extra_kwargs))



In [51]:
# =========================================================
# CASE 1: instant_bookable ~ price + reviews_per_month  (ADJUSTMENT: class_weight)
# =========================================================
# Encode the boolean target, parse the currency-formatted price, then drop
# rows with a missing value in any of the three columns.
data = (
    df[['instant_bookable', 'price', 'reviews_per_month']]
    .assign(
        instant_bookable=lambda frame: encode_bool(frame['instant_bookable']),
        price=lambda frame: clean_money(frame['price']),
    )
    .dropna(subset=['instant_bookable', 'price', 'reviews_per_month'])
)

X = data.loc[:, ['price', 'reviews_per_month']]
y = data['instant_bookable'].astype(int)

X_train, X_test, y_train, y_test = split_scale(X, y)

# class_weight='balanced' is the class-imbalance adjustment used in this case.
modelo = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 1')

CASO 1
Matriz de Confusión:
 [[1101  255]
 [ 150   89]]
Precisión: 0.25872093023255816
Exactitud: 0.7460815047021944
Sensibilidad: 0.3723849372384937
F1: 0.3053173241852487


In [52]:

# =========================================================
# CASE 2: host_is_superhost ~ host_acceptance_rate + host_response_rate  (ADJUSTMENT: oversampling)
# =========================================================
# Encode the boolean target, parse both percentage columns, then drop rows
# with a missing value in any of the three columns.
data = (
    df[['host_is_superhost', 'host_acceptance_rate', 'host_response_rate']]
    .assign(
        host_is_superhost=lambda frame: encode_bool(frame['host_is_superhost']),
        host_acceptance_rate=lambda frame: clean_percent(frame['host_acceptance_rate']),
        host_response_rate=lambda frame: clean_percent(frame['host_response_rate']),
    )
    .dropna(subset=['host_is_superhost', 'host_acceptance_rate', 'host_response_rate'])
)

X = data.loc[:, ['host_acceptance_rate', 'host_response_rate']]
y = data['host_is_superhost'].astype(int)

# Split before scaling; stratify only when both classes are present.
strat = None
if y.nunique() >= 2:
    strat = y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=strat, test_size=0.3, random_state=42
)

# Oversample the TRAINING partition only; the test partition is left as drawn.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# A single scaler: fitted on the resampled training data, applied to both partitions.
scaler = StandardScaler().fit(X_train_res)
X_train_res = scaler.transform(X_train_res)
X_test = scaler.transform(X_test)

modelo = LogisticRegression(max_iter=1000).fit(X_train_res, y_train_res)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 2')


CASO 2
Matriz de Confusión:
 [[417 371]
 [ 37 213]]
Precisión: 0.3647260273972603
Exactitud: 0.6069364161849711
Sensibilidad: 0.852
F1: 0.5107913669064749


In [53]:
# =========================================================
# CASE 3: high_availability (availability_30 > 15 OR availability_90 > 45) ~ price + availability_30 + availability_90
# =========================================================
# NOTE(review): the target is computed directly from availability_30 and
# availability_90, which are also used as features below, so the model is
# largely re-learning the labelling rule -- confirm this is intended.
data = (
    df[['price', 'availability_30', 'availability_90']]
    .assign(
        price=lambda frame: clean_money(frame['price']),
        high_availability=lambda frame: (
            (frame['availability_30'] > 15) | (frame['availability_90'] > 45)
        ).astype(int),
    )
    # Rows with a missing availability value are removed here, after labelling.
    .dropna(subset=['price', 'availability_30', 'availability_90'])
)

X = data.loc[:, ['price', 'availability_30', 'availability_90']]
y = data['high_availability'].astype(int)

X_train, X_test, y_train, y_test = split_scale(X, y)
modelo = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 3')


CASO 3
Matriz de Confusión:
 [[1002   23]
 [  27  543]]
Precisión: 0.9593639575971732
Exactitud: 0.9686520376175548
Sensibilidad: 0.9526315789473684
F1: 0.9559859154929577


In [54]:
# =========================================================
# CASE 4: host_has_profile_pic ~ host_identity_verified
# (Note: if the target has a single class after cleaning, training is skipped)
# =========================================================
data = (
    df[['host_has_profile_pic', 'host_identity_verified']]
    .assign(
        host_has_profile_pic=lambda frame: encode_bool(frame['host_has_profile_pic']),
        host_identity_verified=lambda frame: encode_bool(frame['host_identity_verified']),
    )
    .dropna(subset=['host_has_profile_pic', 'host_identity_verified'])
)

X = data.loc[:, ['host_identity_verified']]
y = data['host_has_profile_pic'].astype(int)

# A binary classifier needs both classes; report and skip otherwise.
if y.nunique() < 2:
    print('CASO 4: target con una sola clase tras limpieza; no es posible entrenar un clasificador binario.')
else:
    X_train, X_test, y_train, y_test = split_scale(X, y)
    modelo = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    eval_and_print(y_test, y_pred, title='CASO 4')


CASO 4
Matriz de Confusión:
 [[   0   68]
 [   0 1468]]
Precisión: 0.9557291666666666
Exactitud: 0.9557291666666666
Sensibilidad: 1.0
F1: 0.9773635153129161


In [55]:
# =========================================================
# CASE 5: property_type (contains "Entire" = 1, otherwise 0) ~ accommodates + bathrooms
# =========================================================
# Drop incomplete rows first, then flag property types whose text contains
# "Entire" (case-insensitive).
data = (
    df[['property_type', 'accommodates', 'bathrooms']]
    .dropna(subset=['property_type', 'accommodates', 'bathrooms'])
    .assign(
        property_type_bin=lambda frame: (
            frame['property_type'].astype(str).str.contains('Entire', case=False).astype(int)
        )
    )
)

X = data.loc[:, ['accommodates', 'bathrooms']]
y = data['property_type_bin'].astype(int)

X_train, X_test, y_train, y_test = split_scale(X, y)
modelo = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 5')

CASO 5
Matriz de Confusión:
 [[ 103  190]
 [  34 1268]]
Precisión: 0.869684499314129
Exactitud: 0.8595611285266458
Sensibilidad: 0.9738863287250384
F1: 0.9188405797101449


In [56]:
# =========================================================
# CASE 6: room_type (contains "Private room" = 1, otherwise 0) ~ price + accommodates
# =========================================================
# Parse the price, drop incomplete rows, then flag room types whose text
# contains "Private room" (case-insensitive).
data = (
    df[['room_type', 'price', 'accommodates']]
    .assign(price=lambda frame: clean_money(frame['price']))
    .dropna(subset=['room_type', 'price', 'accommodates'])
    .assign(
        room_type_bin=lambda frame: (
            frame['room_type'].astype(str).str.contains('Private room', case=False).astype(int)
        )
    )
)

X = data.loc[:, ['price', 'accommodates']]
y = data['room_type_bin'].astype(int)

X_train, X_test, y_train, y_test = split_scale(X, y)
modelo = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 6')


CASO 6
Matriz de Confusión:
 [[1301   28]
 [ 163  103]]
Precisión: 0.7862595419847328
Exactitud: 0.8802507836990595
Sensibilidad: 0.38721804511278196
F1: 0.5188916876574308


In [57]:


# =========================================================
# CASE 7: host_identity_verified ~ host_is_superhost + host_total_listings_count
# =========================================================
data = (
    df[['host_identity_verified', 'host_is_superhost', 'host_total_listings_count']]
    .assign(
        host_identity_verified=lambda frame: encode_bool(frame['host_identity_verified']),
        host_is_superhost=lambda frame: encode_bool(frame['host_is_superhost']),
    )
    .dropna(subset=['host_identity_verified', 'host_is_superhost', 'host_total_listings_count'])
)

X = data.loc[:, ['host_is_superhost', 'host_total_listings_count']]
y = data['host_identity_verified'].astype(int)

# A binary classifier needs both classes; report and skip otherwise.
if y.nunique() < 2:
    print('CASO 7: target con una sola clase tras limpieza; no es posible entrenar un clasificador binario.')
else:
    X_train, X_test, y_train, y_test = split_scale(X, y)
    modelo = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    eval_and_print(y_test, y_pred, title='CASO 7')

CASO 7
Matriz de Confusión:
 [[   0  179]
 [   0 1348]]
Precisión: 0.8827766863130321
Exactitud: 0.8827766863130321
Sensibilidad: 1.0
F1: 0.9377391304347826


In [58]:
# =========================================================
# CASE 8: review_scores_rating_bin (rating > 90) ~ reviews_per_month + price
# (falls back to a median-based split when the fixed threshold gives one class)
# =========================================================
data = df[['review_scores_rating','reviews_per_month','price']].copy()
data['price'] = clean_money(data['price'])
data = data.dropna(subset=['review_scores_rating','reviews_per_month','price'])

rating = data['review_scores_rating']

# First choice: fixed threshold.
umbral = 90
y_tmp = (rating > umbral).astype(int)

# Fallback 1: strictly above the median.
if y_tmp.nunique() < 2:
    umbral = float(rating.median())
    y_tmp = (rating > umbral).astype(int)

# Fallback 2: at or above the median. "> median" yields a single class only
# when no rating exceeds the median (median == max). In that situation a
# two-quantile pd.qcut would have duplicate bin edges and raise, so the ties at
# the median are counted as the positive class instead.
if y_tmp.nunique() < 2:
    y_tmp = (rating >= umbral).astype(int)

# All ratings identical (or no rows left): no binary target can be built.
if y_tmp.nunique() < 2:
    raise ValueError("CASO 8: 'review_scores_rating' no permite formar dos clases. Elige otro target.")

X = data[['reviews_per_month','price']]
y = y_tmp

X_train, X_test, y_train, y_test = split_scale(X, y)
modelo = LogisticRegression(max_iter=1000)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 8')

CASO 8
Matriz de Confusión:
 [[972   0]
 [623   0]]
Precisión: 0.0
Exactitud: 0.6094043887147336
Sensibilidad: 0.0
F1: 0.0


In [59]:
# =========================================================
# CASE 9: availability_365_bin (> 200 = 1) ~ price + accommodates + bathrooms
# =========================================================
data = df[['availability_365','price','accommodates','bathrooms']].copy()
data['price'] = clean_money(data['price'])

# Coerce the raw column and drop missing values BEFORE binarising. Comparing
# NaN with "> 200" evaluates to False, so binarising first would label rows
# with an unknown availability as class 0 and they could never be dropped.
data['availability_365'] = pd.to_numeric(data['availability_365'], errors='coerce')
data = (
    data
    .dropna(subset=['availability_365','price','accommodates','bathrooms'])
    .assign(availability_365_bin=lambda frame: (frame['availability_365'] > 200).astype(int))
)

X = data[['price','accommodates','bathrooms']]
y = data['availability_365_bin'].astype(int)

X_train, X_test, y_train, y_test = split_scale(X, y)
modelo = LogisticRegression(max_iter=1000)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
eval_and_print(y_test, y_pred, title='CASO 9')

CASO 9
Matriz de Confusión:
 [[1030    3]
 [ 559    3]]
Precisión: 0.5
Exactitud: 0.6476489028213166
Sensibilidad: 0.005338078291814947
F1: 0.01056338028169014


In [60]:


# =========================================================
# CASE 10 (NEW): high_demand (number_of_reviews_ltm > 0) ~ price + accommodates
# Falls back to number_of_reviews > 0 if the ltm target has a single class
# Class adjustment: class_weight='balanced'
# =========================================================

# Prepare the data
cols = ['number_of_reviews_ltm', 'number_of_reviews', 'price', 'accommodates']
data = df[cols].copy()

# Parse price and coerce both review counters to numbers
data['price'] = clean_money(data['price'])
data['number_of_reviews_ltm'] = pd.to_numeric(data['number_of_reviews_ltm'], errors='coerce')
data['number_of_reviews'] = pd.to_numeric(data['number_of_reviews'], errors='coerce')

# Drop rows with missing features
data = data.dropna(subset=['price', 'accommodates'])

# Main target: high_demand_ltm. Rows with a missing counter are dropped before
# binarising; comparing NaN with "> 0" evaluates to False, which would
# otherwise label unknown values as class 0.
target_used = 'high_demand_ltm'
labeled = data.dropna(subset=['number_of_reviews_ltm'])
y = (labeled['number_of_reviews_ltm'] > 0).astype(int)

# Fallback if single-class: use number_of_reviews > 0 (same missing-value rule)
if y.nunique() < 2:
    target_used = 'high_demand_all'
    labeled = data.dropna(subset=['number_of_reviews'])
    y = (labeled['number_of_reviews'] > 0).astype(int)

# Final check
if y.nunique() < 2:
    raise ValueError("CASO 10: Tanto 'number_of_reviews_ltm' como 'number_of_reviews' quedaron monoclase. Elige otro target.")

# Keep only the labelled rows and store the target under its name
data = labeled.assign(**{target_used: y})

# Features
X = data[['price', 'accommodates']]

# Split + scale
X_train, X_test, y_train, y_test = split_scale(X, y)

# Balanced model
modelo10 = LogisticRegression(max_iter=1000, class_weight='balanced')
modelo10.fit(X_train, y_train)
y_pred10 = modelo10.predict(X_test)

# Report
eval_and_print(y_test, y_pred10, title=f"CASO 10 (target = {target_used})")


CASO 10 (target = high_demand_ltm)
Matriz de Confusión:
 [[390 274]
 [478 453]]
Precisión: 0.6231086657496562
Exactitud: 0.5285266457680251
Sensibilidad: 0.4865735767991407
F1: 0.5464414957780458


In [61]:
# ============================
# MÉTRICAS Y UTILIDADES
# ============================
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import numpy as np

def get_metrics_dict(y_true, y_pred):
    """Return precision, accuracy, recall and F1 as a dictionary.

    The keys are the Spanish metric names used as row labels in the report
    tables, in the order precision, accuracy, recall, F1. Precision, recall
    and F1 are computed for the positive class (label 1) and evaluate to 0
    when they are undefined.
    """
    positive_class = {"pos_label": 1, "zero_division": 0}

    metrics = {}
    metrics["Precisión"] = precision_score(y_true, y_pred, **positive_class)
    metrics["Exactitud"] = accuracy_score(y_true, y_pred)
    metrics["Sensibilidad"] = recall_score(y_true, y_pred, **positive_class)
    metrics["F1"] = f1_score(y_true, y_pred, **positive_class)
    return metrics



In [62]:
# ============================
# CASE 1: instant_bookable (balanced)
# ============================
data = (
    df[['instant_bookable', 'price', 'reviews_per_month']]
    .assign(
        instant_bookable=lambda frame: encode_bool(frame['instant_bookable']),
        price=lambda frame: clean_money(frame['price']),
    )
    .dropna(subset=['instant_bookable', 'price', 'reviews_per_month'])
)

X = data.loc[:, ['price', 'reviews_per_month']]
y = data['instant_bookable'].astype(int)

# Split + scale (both models share the same partitions)
X_train, X_test, y_train, y_test = split_scale(X, y)

# WITHOUT adjustment: plain logistic regression
m1_sin = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y1_sin = m1_sin.predict(X_test)
met1_sin = get_metrics_dict(y_test, y1_sin)

# WITH adjustment: balanced class weights
m1_con = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)
y1_con = m1_con.predict(X_test)
met1_con = get_metrics_dict(y_test, y1_con)

# One row per metric, one column per variant, rounded to 3 decimals
df_caso1 = pd.DataFrame(
    [(nombre, met1_sin[nombre], met1_con[nombre]) for nombre in met1_sin],
    columns=["Métrica", "SIN AJUSTE", "CON AJUSTE"],
).round(3)

print("\n📊 CASO 1: instant_bookable ~ price + reviews_per_month")
display(df_caso1)



📊 CASO 1: instant_bookable ~ price + reviews_per_month


Unnamed: 0,Métrica,SIN AJUSTE,CON AJUSTE
0,Precisión,0.364,0.259
1,Exactitud,0.848,0.746
2,Sensibilidad,0.017,0.372
3,F1,0.032,0.305


In [63]:

# ============================
# CASE 2: host_is_superhost (oversampling)
# ============================
data = (
    df[['host_is_superhost', 'host_acceptance_rate', 'host_response_rate']]
    .assign(
        host_is_superhost=lambda frame: encode_bool(frame['host_is_superhost']),
        host_acceptance_rate=lambda frame: clean_percent(frame['host_acceptance_rate']),
        host_response_rate=lambda frame: clean_percent(frame['host_response_rate']),
    )
    .dropna(subset=['host_is_superhost', 'host_acceptance_rate', 'host_response_rate'])
)

X = data.loc[:, ['host_acceptance_rate', 'host_response_rate']]
y = data['host_is_superhost'].astype(int)

# Split without scaling yet, so oversampling can be applied to raw training rows
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=(y if y.nunique() >= 2 else None), test_size=0.3, random_state=42
)

# WITHOUT adjustment: scale the original training partition
sc_sin = StandardScaler().fit(X_tr)
X_tr_sin = sc_sin.transform(X_tr)
X_te_sin = sc_sin.transform(X_te)
m2_sin = LogisticRegression(max_iter=1000).fit(X_tr_sin, y_tr)
y2_sin = m2_sin.predict(X_te_sin)
met2_sin = get_metrics_dict(y_te, y2_sin)

# WITH adjustment: oversample the TRAINING partition only, then scale
ros = RandomOverSampler(random_state=42)
X_tr_ros, y_tr_ros = ros.fit_resample(X_tr, y_tr)
sc_con = StandardScaler().fit(X_tr_ros)
X_tr_con = sc_con.transform(X_tr_ros)
X_te_con = sc_con.transform(X_te)
m2_con = LogisticRegression(max_iter=1000).fit(X_tr_con, y_tr_ros)
y2_con = m2_con.predict(X_te_con)
met2_con = get_metrics_dict(y_te, y2_con)

# One row per metric, one column per variant, rounded to 3 decimals
df_caso2 = pd.DataFrame(
    [(nombre, met2_sin[nombre], met2_con[nombre]) for nombre in met2_sin],
    columns=["Métrica", "SIN AJUSTE", "CON AJUSTE"],
).round(3)

print("\n📊 CASO 2: host_is_superhost ~ host_acceptance_rate + host_response_rate (Oversampling)")
display(df_caso2)



📊 CASO 2: host_is_superhost ~ host_acceptance_rate + host_response_rate (Oversampling)


Unnamed: 0,Métrica,SIN AJUSTE,CON AJUSTE
0,Precisión,0.0,0.365
1,Exactitud,0.759,0.607
2,Sensibilidad,0.0,0.852
3,F1,0.0,0.511


In [64]:

# ============================
# CASE 10 (NEW): high_demand ~ price + accommodates (balanced)
# Target: number_of_reviews_ltm > 0; falls back to number_of_reviews > 0 if single-class
# ============================
cols = ['number_of_reviews_ltm','number_of_reviews','price','accommodates']
data = df[cols].copy()
data['price'] = clean_money(data['price'])
data['number_of_reviews_ltm'] = pd.to_numeric(data['number_of_reviews_ltm'], errors='coerce')
data['number_of_reviews'] = pd.to_numeric(data['number_of_reviews'], errors='coerce')
data = data.dropna(subset=['price','accommodates'])

# Main target. Rows with a missing counter are dropped before binarising;
# comparing NaN with "> 0" evaluates to False, which would otherwise label
# unknown values as class 0.
target_used = 'high_demand_ltm'
labeled = data.dropna(subset=['number_of_reviews_ltm'])
y = (labeled['number_of_reviews_ltm'] > 0).astype(int)

# Fallback if single-class (same missing-value rule)
if y.nunique() < 2:
    target_used = 'high_demand_all'
    labeled = data.dropna(subset=['number_of_reviews'])
    y = (labeled['number_of_reviews'] > 0).astype(int)

if y.nunique() < 2:
    raise ValueError("CASO 10: También quedó monoclase usando number_of_reviews. Elige otro target.")

# Keep only the labelled rows and store the target under its name
data = labeled.assign(**{target_used: y})
X = data[['price','accommodates']]

# Split + scale (both models share the same partitions)
X_train, X_test, y_train, y_test = split_scale(X, y)

# WITHOUT adjustment
m10_sin = LogisticRegression(max_iter=1000)
m10_sin.fit(X_train, y_train)
y10_sin = m10_sin.predict(X_test)
met10_sin = get_metrics_dict(y_test, y10_sin)

# WITH adjustment (balanced)
m10_con = LogisticRegression(max_iter=1000, class_weight='balanced')
m10_con.fit(X_train, y_train)
y10_con = m10_con.predict(X_test)
met10_con = get_metrics_dict(y_test, y10_con)

# One row per metric, one column per variant, rounded to 3 decimals
df_caso10 = pd.DataFrame({
    "Métrica": list(met10_sin.keys()),
    "SIN AJUSTE": list(met10_sin.values()),
    "CON AJUSTE": list(met10_con.values())
}).round(3)

print(f"\n📊 CASO 10: {target_used} ~ price + accommodates")
display(df_caso10)



📊 CASO 10: high_demand_ltm ~ price + accommodates


Unnamed: 0,Métrica,SIN AJUSTE,CON AJUSTE
0,Precisión,0.584,0.623
1,Exactitud,0.584,0.529
2,Sensibilidad,0.998,0.487
3,F1,0.737,0.546


In [65]:


# ============================
# FINAL SUMMARY (for the report)
# ============================
# Tag each per-case table with its case label, stack them with a fresh
# 0..n-1 index, and put the label column first.
labelled_tables = (
    ('1. instant_bookable', df_caso1),
    ('2. host_is_superhost', df_caso2),
    ('10. high_demand', df_caso10),
)
tabla_final = pd.concat(
    [table.assign(Caso=label) for label, table in labelled_tables],
    ignore_index=True,
)[['Caso', 'Métrica', 'SIN AJUSTE', 'CON AJUSTE']]

print("\n📈 TABLA FINAL COMPARATIVA (3 ajustes: SIN vs CON)")
display(tabla_final)



📈 TABLA FINAL COMPARATIVA (3 ajustes: SIN vs CON)


Unnamed: 0,Caso,Métrica,SIN AJUSTE,CON AJUSTE
0,1. instant_bookable,Precisión,0.364,0.259
1,1. instant_bookable,Exactitud,0.848,0.746
2,1. instant_bookable,Sensibilidad,0.017,0.372
3,1. instant_bookable,F1,0.032,0.305
4,2. host_is_superhost,Precisión,0.0,0.365
5,2. host_is_superhost,Exactitud,0.759,0.607
6,2. host_is_superhost,Sensibilidad,0.0,0.852
7,2. host_is_superhost,F1,0.0,0.511
8,10. high_demand,Precisión,0.584,0.623
9,10. high_demand,Exactitud,0.584,0.529
