In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, r2_score
from scipy.optimize import curve_fit

st.set_page_config(page_title="Airbnb Dashboard (Blue)", layout="wide")

PRIMARY = "#1E88E5"
PALETTE = ["#1565C0", "#1E88E5", "#42A5F5", "#90CAF9", "#BBDEFB"]
HEATMAP_SCALE = [[0.0, "#E3F2FD"], [1.0, "#1565C0"]]
px.defaults.template = "plotly_white"

st.markdown(
    f"""
    <style>
      .stTabs [data-baseweb="tab"] {{ font-weight: 600; }}
      h1, h2, h3 {{ color: {PRIMARY}; }}
      .css-18e3th9 {{ padding-top: 1rem; }}
    </style>
    """,
    unsafe_allow_html=True
)

def guess_col(df, preferred, regex=None):
    import re
    for p in preferred:
        if p in df.columns:
            return p
    if regex:
        pat = re.compile(regex, re.I)
        for c in df.columns:
            if pat.search(c):
                return c
    return None

def detect_binary_cols(df):
    bins = []
    for c in df.columns:
        vals = pd.unique(df[c].dropna())
        if len(vals) == 2:
            bins.append(c)
    return bins

def sanitize_numeric(df):
    return df.replace([np.inf, -np.inf], np.nan)

st.sidebar.title("Configuración de datos")

@st.cache_data(show_spinner=False)
def ingest_csv(uploaded):
    if uploaded is not None:
        return pd.read_csv(uploaded)
    try:
        return pd.read_csv("valencia_trabajo.csv")
    except Exception:
        return pd.DataFrame()

csv_file = st.sidebar.file_uploader("Sube tu CSV (opcional)", type=["csv"])
df = ingest_csv(csv_file)
if df.empty:
    st.error("No pude cargar datos. Sube un CSV o coloca 'valencia_trabajo.csv' junto a este archivo.")
    st.stop()

df = sanitize_numeric(df)
df_num = df.select_dtypes(include=[np.number])
df_cat = df.select_dtypes(exclude=[np.number])
bin_cols = detect_binary_cols(df)

col_price = guess_col(df, ["price"], r"\bprice|precio\b")
col_lat   = guess_col(df, ["latitude", "lat"], r"lat")
col_lon   = guess_col(df, ["longitude", "lon", "lng", "longitud"], r"lon|lng|long")
col_room  = guess_col(df, ["room_type"], r"room|habita|tipo.*hab") or guess_col(df, ["property_type"], r"propiedad|property")
col_beds  = guess_col(df, ["beds", "bedrooms"], r"bed")
col_acc   = guess_col(df, ["accommodates"], r"accom|capacidad")
col_av365 = guess_col(df, ["availability_365"], r"avail|dispon")
col_rev   = guess_col(df, ["review_scores_value", "review_scores_rating"], r"review.*(value|rating)")
col_super = guess_col(df, ["host_is_superhost_bin", "host_is_superhost"], r"superhost")

if col_super:
    if df[col_super].dtype == object:
        if set(df[col_super].dropna().astype(str).str.lower().unique()) <= set(["t","f","true","false"]):
            df[col_super + "_bin"] = df[col_super].astype(str).str.lower().isin(["t","true"]).astype(int)
            col_super = col_super + "_bin"
    elif df[col_super].dtype == bool:
        df[col_super + "_bin"] = df[col_super].astype(int)
        col_super = col_super + "_bin"

view = st.sidebar.selectbox(
    "Selecciona vista",
    ["Resumen", "Exploración", "Regresión Lineal", "Regresión No Lineal", "Regresión Logística"]
)

if view == "Resumen":
    st.title("Radiografía del dataset")
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Filas", f"{len(df):,}")
    c2.metric("Columnas", df.shape[1])
    c3.metric("Numéricas", df_num.shape[1])
    c4.metric("Categóricas", df_cat.shape[1])
    if col_price and col_price in df_num.columns:
        st.subheader("Distribución de precios")
        fig = px.histogram(df, x=col_price, nbins=50, color_discrete_sequence=[PALETTE[1]])
        fig.update_layout(margin=dict(l=0,r=0,t=30,b=0))
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No se encontró una columna de precio (price). Usa la vista 'Exploración'.")
    if col_lat and col_lon and col_lat in df.columns and col_lon in df.columns:
        st.subheader("Ubicación de alojamientos")
        df_map = df.dropna(subset=[col_lat, col_lon])
        color_col = col_price if col_price in df.columns else None
        size_col  = col_rev if col_rev in df.columns else None
        fig_map = px.scatter_mapbox(
            df_map,
            lat=col_lat, lon=col_lon,
            color=color_col,
            size=size_col,
            color_continuous_scale="Blues",
            hover_data=[c for c in [col_room, col_beds, col_acc] if c],
            zoom=10, height=500
        )
        fig_map.update_layout(mapbox_style="open-street-map", margin=dict(l=0,r=0,t=0,b=0))
        st.plotly_chart(fig_map, use_container_width=True)
    else:
        st.info("No hay columnas de latitud/longitud para el mapa.")
    if col_room and col_rev and col_room in df.columns and col_rev in df.columns:
        st.subheader("Evaluación promedio por tipo")
        prom = df.groupby(col_room)[col_rev].mean(numeric_only=True).sort_values(ascending=False).reset_index()
        fig = px.bar(prom, x=col_room, y=col_rev, color_discrete_sequence=[PALETTE[0]])
        st.plotly_chart(fig, use_container_width=True)
    if col_room and col_av365 and col_room in df.columns and col_av365 in df.columns:
        st.subheader("Disponibilidad (días/año) por tipo")
        disp = df.groupby(col_room)[col_av365].mean(numeric_only=True).sort_values(ascending=False).reset_index()
        fig = px.bar(disp, x=col_room, y=col_av365, color_discrete_sequence=[PALETTE[2]])
        st.plotly_chart(fig, use_container_width=True)
    if col_room and col_super and col_room in df.columns and col_super in df.columns:
        st.subheader("Proporción de superhosts por tipo")
        ratio = df.groupby(col_room)[col_super].mean(numeric_only=True).reset_index()
        fig = px.bar(ratio, x=col_room, y=col_super, color_discrete_sequence=[PALETTE[3]])
        st.plotly_chart(fig, use_container_width=True)
    metric = col_acc if col_acc in df.columns else (col_beds if col_beds in df.columns else None)
    if col_room and metric:
        st.subheader(f"Promedio de {metric} por tipo")
        cap = df.groupby(col_room)[metric].mean(numeric_only=True).reset_index()
        fig = px.bar(cap, x=col_room, y=metric, color_discrete_sequence=[PALETTE[1]])
        st.plotly_chart(fig, use_container_width=True)

elif view == "Exploración":
    st.title("Exploración rápida")
    tipo = st.sidebar.radio("Tipo de variable", ["Categórica", "Numérica"])
    if tipo == "Categórica" and len(df_cat.columns) > 0:
        var = st.sidebar.selectbox("Columna", options=sorted(df_cat.columns))
        chart = st.sidebar.radio("Gráfico", ["Barras", "Pastel", "Dona"])
        tabla = df[var].value_counts(dropna=False).nlargest(15).reset_index()
        tabla.columns = ["categoría", "frecuencia"]
        if chart == "Barras":
            fig = px.bar(tabla, x="categoría", y="frecuencia", color_discrete_sequence=[PALETTE[0]])
        elif chart == "Pastel":
            fig = px.pie(tabla, names="categoría", values="frecuencia", color_discrete_sequence=PALETTE)
        else:
            fig = px.pie(tabla, names="categoría", values="frecuencia", hole=0.45, color_discrete_sequence=PALETTE)
        st.plotly_chart(fig, use_container_width=True)
    elif tipo == "Numérica" and len(df_num.columns) > 0:
        var = st.sidebar.selectbox("Columna", options=sorted(df_num.columns))
        chart = st.sidebar.radio("Gráfico", ["Histograma", "Boxplot"])
        if chart == "Histograma":
            fig = px.histogram(df, x=var, nbins=50, color_discrete_sequence=[PALETTE[1]])
        else:
            fig = px.box(df, y=var, color_discrete_sequence=[PALETTE[2]])
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No hay variables del tipo seleccionado.")

elif view == "Regresión Lineal":
    st.title("Modelado lineal")
    if len(df_num.columns) < 2:
        st.warning("Se requieren al menos dos columnas numéricas.")
        st.stop()
    y_var = st.sidebar.selectbox("Objetivo (Y)", options=sorted(df_num.columns))
    x_var = st.sidebar.selectbox("Predictor (X)", options=[c for c in sorted(df_num.columns) if c != y_var])
    st.subheader("Simple")
    X = df[[x_var]].dropna()
    y = df.loc[X.index, y_var]
    lr = LinearRegression().fit(X, y)
    y_hat = lr.predict(X)
    r2 = r2_score(y, y_hat)
    st.write(f"R²: {r2:.4f}")
    fig = px.scatter(df, x=x_var, y=y_var, color_discrete_sequence=[PALETTE[0]])
    order = np.argsort(X[x_var].values)
    fig.add_traces(px.line(x=X[x_var].values[order], y=y_hat[order], color_discrete_sequence=[PALETTE[1]]).data)
    st.plotly_chart(fig, use_container_width=True)
    st.subheader("Múltiple")
    x_vars = st.sidebar.multiselect("Predictores (X)", options=[c for c in sorted(df_num.columns) if c != y_var], default=[x_var])
    if x_vars:
        Xm = df[x_vars].dropna()
        ym = df.loc[Xm.index, y_var]
        lr2 = LinearRegression().fit(Xm, ym)
        ypm = lr2.predict(Xm)
        r2m = r2_score(ym, ypm)
        st.write(f"R² (múltiple): {r2m:.4f}")
    else:
        st.info("Selecciona al menos un predictor para el modelo múltiple.")

elif view == "Regresión No Lineal":
    st.title("Ajustes no lineales")
    if len(df_num.columns) < 2:
        st.warning("Se requieren al menos dos columnas numéricas.")
        st.stop()
    y_var = st.sidebar.selectbox("Objetivo (Y)", options=sorted(df_num.columns), key="nly")
    x_var = st.sidebar.selectbox("Predictor (X)", options=[c for c in sorted(df_num.columns) if c != y_var], key="nlx")
    model = st.sidebar.selectbox("Modelo", ["Cuadrática", "Exponencial"])
    sub = df[[x_var, y_var]].dropna()
    if len(sub) < 5:
        st.warning("Muy pocos datos válidos para ajustar.")
        st.stop()
    x = sub[x_var].values
    y = sub[y_var].values
    if model == "Cuadrática":
        def f(z, a, b, c): return a*z**2 + b*z + c
    else:
        def f(z, a, b, c): return a*np.exp(-b*z) + c
    try:
        params, _ = curve_fit(f, x, y, maxfev=10000)
        y_fit = f(x, *params)
        r2 = r2_score(y, y_fit)
        st.write(f"R²: {r2:.4f}")
        fig = px.scatter(sub, x=x_var, y=y_var, color_discrete_sequence=[PALETTE[0]])
        order = np.argsort(x)
        fig.add_traces(px.line(x=x[order], y=y_fit[order], color_discrete_sequence=[PALETTE[1]]).data)
        st.plotly_chart(fig, use_container_width=True)
    except Exception as e:
        st.error(f"No se pudo ajustar el modelo: {e}")

elif view == "Regresión Logística":
    st.title("Clasificación logística (Y binaria)")
    if not bin_cols:
        st.info("No se detectaron columnas binarias automáticamente (exactamente 2 valores).")
        st.stop()
    y_var = st.sidebar.selectbox("Dependiente (binaria)", options=sorted(bin_cols))
    default_x = [col_price] if (col_price and col_price in df_num.columns) else list(df_num.columns[:2])
    x_vars = st.sidebar.multiselect("Independientes (numéricas)", options=sorted(df_num.columns), default=default_x)
    if not x_vars:
        st.warning("Selecciona al menos una X.")
        st.stop()
    y_raw = df[y_var]
    if y_raw.dtype == bool:
        y_enc = y_raw.astype(int)
    elif y_raw.dtype == object:
        uniq = sorted([str(v) for v in pd.unique(y_raw.dropna())])
        if len(uniq) == 2:
            mapping = {uniq[0]: 0, uniq[1]: 1}
            y_enc = y_raw.astype(str).map(mapping)
        else:
            st.error("La variable Y seleccionada no parece binaria.")
            st.stop()
    else:
        y_enc = y_raw
    data = pd.concat([df[x_vars], y_enc], axis=1).dropna()
    X = data[x_vars].values
    y = data[y_var].values.astype(int)
    if len(data) < 20:
        st.warning("Muy pocos datos para entrenar.")
        st.stop()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test  = scaler.transform(X_test)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm  = confusion_matrix(y_test, y_pred)
    st.write(f"Exactitud: {acc:.4f}")
    labels = sorted(pd.unique(y))
    heat = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=[f"Pred {l}" for l in labels],
            y=[f"Real {l}" for l in labels],
            colorscale=HEATMAP_SCALE,
            hoverinfo="z",
            zmin=0
        )
    )
    annotations = []
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            val = cm[i, j]
            annotations.append(
                dict(
                    x=f"Pred {labels[j]}",
                    y=f"Real {labels[i]}",
                    text=str(val),
                    showarrow=False,
                    font=dict(color="white" if val > cm.max()/2 else "black", size=12)
                )
            )
    heat.update_layout(title="Matriz de confusión", annotations=annotations, width=520, height=520)
    st.plotly_chart(heat, use_container_width=False)
    try:
        p0 = precision_score(y_test, y_pred, pos_label=labels[0])
        p1 = precision_score(y_test, y_pred, pos_label=labels[1])
        st.write(f"Precisión clase {labels[0]}: {p0:.4f}")
        st.write(f"Precisión clase {labels[1]}: {p1:.4f}")
    except Exception:
        pass


2025-10-26 20:03:37.317 
  command:

    streamlit run C:\Users\hecto\AppData\Roaming\Python\Python313\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-10-26 20:03:37.324 No runtime found, using MemoryCacheStorageManager
2025-10-26 20:03:37.329 No runtime found, using MemoryCacheStorageManager
2025-10-26 20:03:38.120 Session state does not function when running a script without `streamlit run`

*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

