# Projet Immobilier — Notebook Unifié

Ce notebook centralise **préparation des données**, **widgets d’exploration**, **analyse des loyers en IDF**, et **accessibilité aux gares IDF** dans **une seule interface**.

> Conseil : exécutez les cellules dans l’ordre. La première section ci‑dessous fournit une *UI unifiée* pour piloter toutes les vues. Les sections originales des 4 notebooks sont conservées plus bas, telles quelles, pour assurer la compatibilité et faciliter le debug.


### Dépendances
Assurez‑vous d'avoir ces bibliothèques installées dans l'environnement du notebook :
- `pandas`, `numpy`
- `ipywidgets` (et activer l'extension Jupyter classique au besoin), `IPython.display`
- `matplotlib`, `plotly` (si utilisé par vos notebooks d'origine)
- `pyarrow` (si vous lisez/écrivez des Parquet)


In [12]:
# ==================== APP NOTEBOOK UNIFIÉE (COLLER/EXÉCUTER) ====================
import pandas as pd, numpy as np, matplotlib.pyplot as plt, ipywidgets as W
from IPython.display import display, clear_output
from pathlib import Path

# -------- 1) CHARGEMENT DATA (adapte si tu as déjà un df propre) --------
def _find_raw():
    for d in [Path("./data"), Path("../data"), Path("/mnt/data"), Path(".")]:
        if d.exists():
            for p in d.rglob("*"):
                n = p.name.lower()
                if p.is_file() and (("dvf" in n or "valeurs_foncieres" in n) and p.suffix.lower() in [".csv",".txt"]):
                    return p
    return None

def _guess_sep(fp: Path):
    with open(fp, "r", encoding="utf-8", errors="ignore") as f:
        h=f.readline()
    return ";" if h.count(";")>=h.count(",") else ","

def load_data():
    # Priorité à un parquet déjà prêt
    clean = Path("./data_clean/dvf_clean.parquet")
    if clean.exists():
        return pd.read_parquet(clean)
    alt = Path("/mnt/data/dvf_clean.parquet")
    if alt.exists():
        return pd.read_parquet(alt)

    raw = _find_raw()
    if raw is None:
        return pd.DataFrame()
    sep = _guess_sep(raw)
    dtype_hint = {
        "valeur_fonciere":"float64","surface_reelle_bati":"float64","nombre_pieces_principales":"float64",
        "code_postal":"string","code_commune":"string","nom_commune":"string","type_local":"string","annee":"Int64"
    }
    df = pd.read_csv(raw, sep=sep, dtype=dtype_hint, low_memory=False, encoding="utf-8", na_values=["","NA","NaN"])
    if "valeur_fonciere" in df and "surface_reelle_bati" in df:
        vf = pd.to_numeric(df["valeur_fonciere"], errors="coerce")
        sr = pd.to_numeric(df["surface_reelle_bati"], errors="coerce")
        df["prix_m2"] = vf/sr
    return df

STATE = {"df": load_data(), "df_f": None}

# -------- 2) FILTRES GLOBAUX --------
df0 = STATE["df"]
years = sorted([int(x) for x in df0["annee"].dropna().unique().tolist()]) if "annee" in df0.columns else []
yr_min, yr_max = (years[0], years[-1]) if years else (2000, 2025)

flt_year    = W.IntRangeSlider(description="Années", min=yr_min, max=yr_max, value=(yr_min, yr_max), step=1, continuous_update=False, layout=W.Layout(width="360px"))
flt_type    = W.SelectMultiple(options=sorted(df0["type_local"].dropna().unique().tolist()) if "type_local" in df0.columns else [], description="Type", rows=4, layout=W.Layout(width="360px"))
flt_commune = W.Combobox(options=sorted(df0["nom_commune"].dropna().unique().tolist())[:500] if "nom_commune" in df0.columns else [], placeholder="Commune (optionnel)", layout=W.Layout(width="360px"))
btn_apply   = W.Button(description="Appliquer")
btn_reset   = W.Button(description="Réinit")

# -------- 3) PANNEAUX D’AFFICHAGE --------
out_data = W.Output(); out_w56 = W.Output(); out_loyers = W.Output(); out_gares = W.Output()

# ====== REMPLACE UNIQUEMENT LE CONTENU DE CES 3 FONCTIONS AVEC TES VISU ======
def panel_widgets_5_6(df: pd.DataFrame, out: W.Output):
    """Colle ici ton code des widgets 5 & 6."""
    with out:
        clear_output()
        if df is None or df.empty:
            print("Aucune donnée filtrée.")
            return
        if {"valeur_fonciere","surface_reelle_bati"} <= set(df.columns):
            plt.figure()
            sam = df[["valeur_fonciere","surface_reelle_bati"]].dropna().sample(min(3000, len(df)), random_state=42)
            plt.scatter(sam["surface_reelle_bati"], sam["valeur_fonciere"], s=6)
            plt.title("Valeur foncière vs Surface (échantillon)")
            plt.xlabel("Surface (m²)"); plt.ylabel("Valeur (€)")
            plt.show()
        else:
            print("Ajoute ici tes graphes existants (colonnes manquantes pour l’exemple).")

def panel_loyers_idf(df: pd.DataFrame, out: W.Output):
    """Colle ici ton code ‘analyse loyers IDF’."""
    with out:
        clear_output()
        if df is None or df.empty:
            print("Aucune donnée filtrée.")
            return
        if "code_postal" in df.columns and "prix_m2" in df.columns:
            t = df.copy()
            t["dept"] = t["code_postal"].astype(str).str[:2]
            idf = {"75","77","78","91","92","93","94","95"}
            t = t[t["dept"].isin(idf)]
            if t.empty:
                print("Pas de lignes IDF après filtres.")
                return
            g = t.groupby("dept")["prix_m2"].median().sort_index()
            display(g.to_frame("prix_m2_median"))
            plt.figure(); g.plot(kind="bar"); plt.title("Prix/m² médian (IDF)"); plt.xlabel("Département"); plt.ylabel("€/m²"); plt.show()
        else:
            print("Ajoute ici ton analyse loyers (code_postal / prix_m2 requis pour l’exemple).")

def panel_access_gares(df: pd.DataFrame, out: W.Output):
    """Colle ici ton code ‘accessibilité gares IDF’ (jointure sur table gares)."""
    with out:
        clear_output()
        print("Place ici ta jointure DF <-> Gares + tes visus (carte, histogrammes, etc.).")
# ==============================================================================

# -------- 4) PANEL DONNÉES & KPI (garde si utile) --------
def panel_data(df: pd.DataFrame, out: W.Output):
    with out:
        clear_output()
        if df is None or df.empty:
            print("Aucune donnée filtrée.")
            return
        n = len(df); cols = len(df.columns)
        print(f"Lignes: {n:,} | Colonnes: {cols}")
        if "prix_m2" in df:  print("Médiane prix/m²:", f"{np.nanmedian(df['prix_m2']):,.0f}")
        if "surface_reelle_bati" in df: print("Médiane surface:", f"{np.nanmedian(df['surface_reelle_bati']):,.1f} m²")
        display(df.head(10))
        if "prix_m2" in df:
            plt.figure()
            df["prix_m2"].dropna().clip(upper=df["prix_m2"].quantile(0.99)).hist(bins=40)
            plt.title("Distribution prix/m²"); plt.xlabel("€/m²"); plt.ylabel("Fréquence"); plt.show()

# -------- 5) FILTRAGE GLOBAL + REFRESH --------
def _apply(_=None):
    df = STATE["df"]
    if not isinstance(df, pd.DataFrame) or df.empty:
        STATE["df_f"] = None
    else:
        m = pd.Series(True, index=df.index)
        if "annee" in df:
            y0, y1 = flt_year.value
            m &= df["annee"].fillna(-1).astype("Int64").between(y0, y1)
        if flt_type.value and "type_local" in df:
            m &= df["type_local"].isin(list(flt_type.value))
        if flt_commune.value and "nom_commune" in df:
            m &= df["nom_commune"].fillna("").str.lower().eq(str(flt_commune.value).strip().lower())
        STATE["df_f"] = df[m].copy()

    panel_data(STATE["df_f"], out_data)
    panel_widgets_5_6(STATE["df_f"], out_w56)
    panel_loyers_idf(STATE["df_f"], out_loyers)
    panel_access_gares(STATE["df_f"], out_gares)

def _reset(_=None):
    if "annee" in STATE["df"].columns:
        ys = sorted([int(x) for x in STATE["df"]["annee"].dropna().unique().tolist()])
        if ys:
            flt_year.min, flt_year.max = ys[0], ys[-1]
            flt_year.value = (ys[0], ys[-1])
    flt_type.value = tuple()
    flt_commune.value = ""
    _apply()

btn_apply.on_click(_apply); btn_reset.on_click(_reset)

# -------- 6) LAYOUT --------
sidebar = W.VBox([W.HTML("<h3>Filtres</h3>"), flt_year, flt_type, flt_commune, W.HBox([btn_apply, btn_reset])],
                 layout=W.Layout(width="380px"))
tabs = W.Tab(children=[out_data, out_w56, out_loyers, out_gares])
for i, t in enumerate(["Données & KPI", "Widgets 5&6", "Loyers IDF", "Accès gares IDF"]):
    tabs.set_title(i, t)
display(W.HBox([sidebar, tabs]))
_apply()
# ===============================================================================


HBox(children=(VBox(children=(HTML(value='<h3>Filtres</h3>'), IntRangeSlider(value=(2025, 2025), continuous_up…

---

## Sections originales

---

### Import automatique de `01_prepare_data.ipynb`

## Imports & chemins

In [13]:
# ===== Projet Immobilier — Chargement, Nettoyage, Codes Postaux (robuste) + Dashboard =====
import os, pandas as pd, numpy as np
from pathlib import Path

RAW_DIR   = os.path.abspath(os.path.join("..", "data", "raw"))
CLEAN_DIR = os.path.abspath(os.path.join("..", "data", "clean"))
Path(CLEAN_DIR).mkdir(parents=True, exist_ok=True)

def to_num_fr(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.replace("\u00A0","", regex=False)
    s = s.str.replace(" ", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(s, errors="coerce")

def parse_dates_robust(s: pd.Series) -> pd.Series:
    x = (s.astype(str).str.replace("\u00A0"," ", regex=False)
                 .str.strip().str.replace(r"[^0-9/\-]", "", regex=True))
    out = pd.to_datetime(x, errors="coerce", dayfirst=True)
    mask = out.isna()
    if mask.any():
        out.loc[mask] = pd.to_datetime(x.loc[mask], errors="coerce", format="%Y-%m-%d")
    return out

def load_dvf_idf(dvf_path: str, out_fp: str) -> pd.DataFrame:
    if os.path.exists(out_fp):
        return pd.read_parquet(out_fp)

    usecols = ["Date mutation","Nature mutation","Valeur fonciere",
               "Code postal","Commune","Code departement","Code commune",
               "Type local","Surface reelle bati","Nombre pieces principales"]
    idf = ("75","77","78","91","92","93","94","95")

    chunks, tried = [], []
    for enc in ("cp1252","latin1","utf-8","utf-8-sig"):
        try:
            for chunk in pd.read_csv(dvf_path, sep="|", usecols=usecols,
                                     encoding=enc, dtype=str, chunksize=200_000):
                # Renommage
                chunk = chunk.rename(columns={
                    "Date mutation":"date_mutation",
                    "Valeur fonciere":"valeur_fonciere",
                    "Surface reelle bati":"surface_reelle_bati",
                    "Commune":"nom_commune",
                    "Code postal":"code_postal",
                    "Type local":"type_local",
                    "Code departement":"code_departement",
                    "Code commune":"code_commune_3",
                })

                # Île-de-France
                m_idf = chunk["code_departement"].astype(str).str.strip().str[:2].isin(idf)
                chunk = chunk[m_idf]

                # Types et conversions
                chunk["date_mutation"]       = parse_dates_robust(chunk["date_mutation"])
                chunk["annee"]               = chunk["date_mutation"].dt.year
                chunk["valeur_fonciere"]     = to_num_fr(chunk["valeur_fonciere"])
                chunk["surface_reelle_bati"] = to_num_fr(chunk["surface_reelle_bati"])

                # Codes & prix/m²
                chunk["code_departement"]     = chunk["code_departement"].astype(str).str.strip().str.zfill(2)
                chunk["code_commune_3"]       = chunk["code_commune_3"].astype(str).str.strip().str.zfill(3)
                chunk["code_commune_insee"]   = chunk["code_departement"] + chunk["code_commune_3"]
                chunk["prix_m2"]              = chunk["valeur_fonciere"] / chunk["surface_reelle_bati"]

                # Sélection minimale
                keep = chunk[[
                    "date_mutation","annee","valeur_fonciere","surface_reelle_bati","prix_m2",
                    "nom_commune","code_commune_insee","code_postal"
                ]]

                chunks.append(keep)
            break
        except Exception as e:
            tried.append((enc, str(e)))
            continue

    if not chunks:
        raise RuntimeError(f"Lecture DVF vide. Encodings testés: {tried}")

    df = pd.concat(chunks, ignore_index=True)

    # Filtres de cohérence
    df = df.dropna(subset=["date_mutation","annee","valeur_fonciere","surface_reelle_bati","prix_m2","nom_commune"])
    df = df[(df["surface_reelle_bati"] > 8) & (df["valeur_fonciere"] > 1000) & (df["prix_m2"].between(100, 30000))]
    df.to_parquet(out_fp, index=False)
    return df

# --- Chargement DVF ---
dvf_path = os.path.join(RAW_DIR, "DVF_2025_S1.txt")
clean_fp = os.path.join(CLEAN_DIR, "dvf_clean.parquet")
dvf = load_dvf_idf(dvf_path, clean_fp)

# --- Jointure Codes Postaux (hyper-robuste encodage + noms/structures variés) ---
import re

codes_csv = os.path.join(RAW_DIR, "Base_codes_postaux.csv")

def _norm_colname(c: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", c.lower()).strip("_")

def read_codes_postaux(path: str) -> pd.DataFrame:
    # 1) Lecture robuste (encodage + séparateur auto)
    encodings = ["cp1252", "latin1", "utf-8", "utf-8-sig"]
    last_err = None
    for enc in encodings:
        try:
            dfc = pd.read_csv(path, sep=None, engine="python", encoding=enc, dtype=str)
            break
        except Exception as e:
            last_err = e
    else:
        raise last_err

    # 2) Normalisation des noms de colonnes
    orig_cols = list(dfc.columns)
    norm_cols = [_norm_colname(c) for c in dfc.columns]
    dfc.columns = norm_cols

    # 3) Alias possibles
    insee_aliases   = {
        "code_commune_insee","code_insee","insee","insee_code",
        "codecommuneinsee","codgeo","code_commune_insee_2020","code_commune_insee_2024"
    }
    postal_aliases  = {
        "code_postal","cp","postal","postal_code","codepostal","code_postal_commune"
    }
    dept_aliases    = {"code_departement","departement","dep","code_dept","dept","code_dep"}
    com3_aliases    = {"code_commune","codecom","codcom","code_comm","code_commune_3"}

    # 4) Détection colonnes
    cols_set = set(dfc.columns)

    def pick_any(candidates:set[str]) -> str|None:
        for c in candidates:
            if c in cols_set:
                return c
        return None

    col_insee  = pick_any(insee_aliases)
    col_cp     = pick_any(postal_aliases)
    col_dept   = pick_any(dept_aliases)
    col_com3   = pick_any(com3_aliases)

    # 5) Si INSEE manquant, mais dept + com3 dispos -> construire
    if col_insee is None and (col_dept is not None and col_com3 is not None):
        # sécuriser formats
        dfc[col_dept] = dfc[col_dept].astype(str).str.strip().str.zfill(2)
        dfc[col_com3] = dfc[col_com3].astype(str).str.strip().str.zfill(3)
        dfc["code_commune_insee"] = dfc[col_dept] + dfc[col_com3]
        col_insee = "code_commune_insee"

    # 6) Si CP introuvable mais colonne nom proche "code_postal" existe avec typos courantes
    if col_cp is None:
        # Heuristique: chercher une colonne qui contient "cp" ou "postal"
        candidates = [c for c in dfc.columns if ("postal" in c or c == "cp" or c.endswith("_cp"))]
        col_cp = candidates[0] if candidates else None

    # 7) Validation
    if col_insee is None or col_cp is None:
        # Aide au debug: montrer colonnes disponibles
        raise ValueError(
            "Le fichier codes postaux doit contenir INSEE et CP.\n"
            f"Colonnes disponibles (normalisées): {dfc.columns.tolist()}\n"
            "Essayé: INSEE in {insee_aliases} ou (dept in {dept_aliases} + com3 in {com3_aliases}); "
            f"CP in {postal_aliases}."
        )

    # 8) Sélection + nettoyage minimal
    out = dfc[[col_insee, col_cp]].rename(columns={col_insee: "code_commune_insee", col_cp: "code_postal"}).copy()
    out["code_commune_insee"] = out["code_commune_insee"].astype(str).str.strip()
    out["code_postal"]        = out["code_postal"].astype(str).str.strip()
    out = out.dropna(subset=["code_commune_insee","code_postal"])
    # garder CP à 5 chiffres si applicable
    out["code_postal"] = out["code_postal"].str.extract(r"(\d{5})", expand=False).fillna(out["code_postal"])
    return out

codes = read_codes_postaux(codes_csv)

# Choix d’un CP de référence par INSEE (mode sinon 1er)
cp_ref = (codes.groupby("code_commune_insee")["code_postal"]
               .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0])
               .rename("code_postal_ref")
               .reset_index())

# Merge sécurisé
dvf = dvf.drop(columns=["code_postal"], errors="ignore").merge(cp_ref, on="code_commune_insee", how="left")
dvf = dvf.rename(columns={"code_postal_ref":"code_postal"})
dvf.to_parquet(clean_fp, index=False)


# ================================== Dashboard interactif ==================================
import ipywidgets as W
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

df = pd.read_parquet(clean_fp).copy()

def iqr_bounds(s, k=2.0):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k*iqr, q3 + k*iqr

def apply_outliers(d, use_iqr=True):
    if (not use_iqr) or (len(d) < 50):
        return d
    lo, hi = iqr_bounds(d["prix_m2"], k=2.0)
    return d[d["prix_m2"].between(lo, hi)]

def filter_by_surface(d, s_range):
    smin, smax = s_range
    return d[d["surface_reelle_bati"].between(smin, smax)]

def compute_yield(d, loyer_m2):
    out = d.copy()
    out["revenu_annuel"] = loyer_m2 * out["surface_reelle_bati"] * 12.0
    out["yield_brut"]    = out["revenu_annuel"] / out["valeur_fonciere"]
    return out

# Widgets
w_surface = W.IntRangeSlider(value=(20, 80), min=10, max=200, step=1,
                             description="Surface (m²)", continuous_update=False)
w_loyer   = W.FloatSlider(value=22.0, min=5.0, max=45.0, step=0.5,
                          readout_format=".1f", description="Loyer €/m²", continuous_update=False)
w_topn    = W.IntSlider(value=15, min=5, max=50, step=1, description="Top N", continuous_update=False)
w_out     = W.Checkbox(value=True, description="Filtrer outliers (IQR)")
w_show_hist = W.Checkbox(value=True, description="Histogramme prix/m²")

out = W.Output()

def render(_=None):
    with out:
        clear_output(wait=True)
        d = filter_by_surface(df, w_surface.value)
        d = apply_outliers(d, w_out.value)
        d = compute_yield(d, w_loyer.value)

        nb     = len(d)
        p50    = float(d["prix_m2"].median()) if nb else np.nan
        y_med  = float(d["yield_brut"].median()) if nb else np.nan
        html = f"""
        <div>
          <h3>KPIs</h3>
          <ul>
            <li>Transactions : {nb:,}</li>
            <li>Prix/m² (médiane) : {p50:,.0f} €</li>
            <li>Rendement brut (médiane) : {y_med*100:,.1f} %</li>
          </ul>
        </div>
        """
        display(W.HTML(html))

        if nb:
            # Top N communes par rendement médian
            top = (d.groupby("nom_commune", as_index=False)["yield_brut"]
                     .median().sort_values("yield_brut", ascending=False).head(w_topn.value))
            display(top)

            # Scatter prix/m² vs surface
            plt.figure(figsize=(6,4))
            d.sample(min(2000, nb), random_state=0).plot(kind="scatter", x="surface_reelle_bati", y="prix_m2")
            plt.title("Prix/m² vs Surface (échantillon)")
            plt.xlabel("Surface (m²)"); plt.ylabel("Prix/m² (€)")
            plt.show()

            # Histogramme prix/m² (optionnel)
            if w_show_hist.value:
                plt.figure(figsize=(6,4))
                d["prix_m2"].dropna().clip(0, d["prix_m2"].quantile(0.99)).plot(kind="hist", bins=40)
                plt.title("Distribution du prix/m² (troncature au 99e centile)")
                plt.xlabel("Prix/m² (€)")
                plt.ylabel("Fréquence")
                plt.show()

controls = W.VBox([
    W.HTML("<h3>Paramètres d’analyse</h3>"),
    W.HBox([w_surface, w_loyer]),
    W.HBox([w_topn, w_out, w_show_hist])
])

ui = W.HBox([controls, out], layout=W.Layout(align_items="flex-start"))
display(ui)

for w in (w_surface, w_loyer, w_topn, w_out, w_show_hist):
    w.observe(render, "value")

render()


HBox(children=(VBox(children=(HTML(value='<h3>Paramètres d’analyse</h3>'), HBox(children=(IntRangeSlider(value…

In [14]:
# ============================= Dashboard esthétique (widgets + UI) =============================
import ipywidgets as W
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, clear_output, HTML

# -- Thème & helpers UI -------------------------------------------------------------------------
CSS = """
<style>
:root{
  --bg:#0f172a;         /* slate-900 */
  --panel:#111827;      /* gray-900 */
  --card:#131a2a;       /* custom */
  --muted:#9ca3af;      /* gray-400 */
  --text:#e5e7eb;       /* gray-200 */
  --accent:#22d3ee;     /* cyan-400 */
  --accent2:#60a5fa;    /* blue-400 */
}
div.app-wrap{
  background:linear-gradient(135deg,#0f172a 0%,#0b1220 100%);
  color:var(--text);
  padding:16px 16px 24px 16px; border-radius:16px;
  box-shadow:0 10px 30px rgba(0,0,0,.35);
  font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, Helvetica Neue, Arial;
}
.app-title{
  font-size:20px; font-weight:600; letter-spacing:.2px; margin:0 0 8px 0;
}
.app-subtitle{
  color:var(--muted); font-size:13px; margin:0 0 14px 0;
}
.grid-2{
  display:grid; grid-template-columns: 1.1fr 1fr; gap:16px; align-items:start;
}
.card{
  background:var(--panel); border:1px solid rgba(255,255,255,.06);
  border-radius:14px; padding:12px 12px;
}
.controls-grid{
  display:grid; grid-template-columns:1fr 1fr; gap:10px;
}
.kpis{ display:grid; grid-template-columns: repeat(3, 1fr); gap:10px; }
.kpi{
  background:var(--card); border:1px solid rgba(255,255,255,.06);
  border-radius:12px; padding:12px;
}
.kpi .label{ color:var(--muted); font-size:12px; }
.kpi .value{ font-size:22px; font-weight:600; color:var(--accent); }
.tbl-title{ font-size:14px; font-weight:600; margin:12px 0 6px 0; }
.footer-note{ color:var(--muted); font-size:12px; margin-top:8px; }
</style>
"""

def fmt_int(x):
    return f"{int(x):,}".replace(",", " ")

def kpi_card(label, value_html):
    return f"""
    <div class="kpi">
      <div class="label">{label}</div>
      <div class="value">{value_html}</div>
    </div>
    """

display(HTML(CSS + '<div class="app-wrap"><div class="app-title">Analyse DVF — Île-de-France</div><div class="app-subtitle">Nettoyage, filtres interactifs & exploration</div></div>'))

# -- Données -------------------------------------------------------------------------------------
df = pd.read_parquet(clean_fp).copy()
# sécurité colonnes
for c in ["prix_m2","surface_reelle_bati","valeur_fonciere", "annee", "nom_commune", "code_postal"]:
    if c not in df.columns:
        df[c] = np.nan

# -- Widgets -------------------------------------------------------------------------------------
w_surface = W.IntRangeSlider(value=(20, 80), min=10, max=200, step=1,
                             description="Surface (m²)", continuous_update=False, readout=True,
                             style={'description_width':'110px'}, layout=W.Layout(width='100%'))
w_loyer   = W.FloatSlider(value=22.0, min=5.0, max=45.0, step=0.5,
                          readout_format=".1f", description="Loyer €/m²", continuous_update=False,
                          style={'description_width':'110px'}, layout=W.Layout(width='100%'))
w_topn    = W.IntSlider(value=15, min=5, max=50, step=1,
                        description="Top N", continuous_update=False,
                        style={'description_width':'110px'}, layout=W.Layout(width='100%'))
w_iqr     = W.ToggleButtons(options=[('IQR on', True), ('IQR off', False)],
                            value=True, description="Outliers",
                            style={'description_width':'110px'})
w_hist    = W.Checkbox(value=True, description="Histogramme prix/m²")
w_year    = W.SelectionRangeSlider(
              options=sorted(df["annee"].dropna().astype(int).unique()),
              index=(0, max(0, len(df["annee"].dropna().unique())-1)),
              description="Années", continuous_update=False,
              layout=W.Layout(width='100%'),
              style={'description_width':'110px'}
           ) if df["annee"].notna().any() else W.Label("Années : n/a")

communes = ["(Toutes)"] + sorted(df["nom_commune"].dropna().astype(str).unique().tolist())
w_commune = W.Dropdown(options=communes, value="(Toutes)", description="Commune",
                       layout=W.Layout(width='100%'), style={'description_width':'110px'})

w_btn_export = W.Button(description="Exporter (CSV)", button_style='',
                        tooltip="Exporte le dataset filtré en CSV dans le dossier clean/",
                        layout=W.Layout(width='160px'))
w_msg = W.HTML("")

# -- Sorties -------------------------------------------------------------------------------------
out_overview = W.Output()
out_table    = W.Output()
out_plot1    = W.Output()
out_plot2    = W.Output()

# -- Filtres & calculs ---------------------------------------------------------------------------
def iqr_bounds(s, k=2.0):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k*iqr, q3 + k*iqr

def apply_filters(d):
    # surface
    smin, smax = w_surface.value
    d = d[d["surface_reelle_bati"].between(smin, smax)]
    # années
    if isinstance(w_year, W.SelectionRangeSlider):
        y0, y1 = w_year.value
        d = d[(d["annee"]>=y0) & (d["annee"]<=y1)]
    # commune
    if w_commune.value != "(Toutes)":
        d = d[d["nom_commune"] == w_commune.value]
    # outliers
    if w_iqr.value and len(d) >= 50 and d["prix_m2"].notna().any():
        lo, hi = iqr_bounds(d["prix_m2"], k=2.0)
        d = d[d["prix_m2"].between(lo, hi)]
    # yield
    loyer_m2 = w_loyer.value
    d = d.assign(revenu_annuel = loyer_m2 * d["surface_reelle_bati"] * 12.0)
    d = d.assign(yield_brut = d["revenu_annuel"] / d["valeur_fonciere"])
    return d

# -- Rendu ---------------------------------------------------------------------------------------
def render(_=None):
    with out_overview:
        clear_output(wait=True)
        d = apply_filters(df)

        nb = len(d)
        p50 = float(d["prix_m2"].median()) if d["prix_m2"].notna().any() else np.nan
        ymd = float(d["yield_brut"].median()) if d["yield_brut"].notna().any() else np.nan
        s50 = float(d["surface_reelle_bati"].median()) if d["surface_reelle_bati"].notna().any() else np.nan

        kpis_html = '<div class="kpis">' + \
            kpi_card("Transactions", fmt_int(nb) if nb==nb else "–") + \
            kpi_card("Prix/m² médiane", f"{p50:,.0f} €".replace(",", " ") if p50==p50 else "–") + \
            kpi_card("Rendement brut médian", f"{ymd*100:,.1f} %".replace(",", " ") if ymd==ymd else "–") + \
        '</div>'

        display(HTML(f'<div class="card">{kpis_html}<div class="footer-note">Filtrage interactif à gauche. Export CSV disponible.</div></div>'))

    with out_table:
        clear_output(wait=True)
        d = apply_filters(df)
        # Top N communes par rendement médian
        if len(d):
            top = (d.groupby(["nom_commune","code_postal"], as_index=False)["yield_brut"]
                     .median().sort_values("yield_brut", ascending=False).head(w_topn.value))
            top["yield_brut_%"] = (top["yield_brut"]*100).round(2)
            to_show = top[["nom_commune","code_postal","yield_brut_%"]].rename(
                columns={"nom_commune":"Commune","code_postal":"CP","yield_brut_%":"Yield brut (%)"}
            )
            html = to_show.style.hide(axis="index") \
                               .set_table_styles([{'selector':'th','props':'text-align:left; padding:6px 8px;'},
                                                  {'selector':'td','props':'padding:6px 8px;'}]) \
                               .bar(subset=["Yield brut (%)"], vmin=to_show["Yield brut (%)"].min(),
                                    vmax=to_show["Yield brut (%)"].max()) \
                               .to_html()
            display(HTML('<div class="tbl-title">Top communes (médiane du rendement)</div>'))
            display(HTML(f'<div class="card">{html}</div>'))
        else:
            display(HTML('<div class="card">Aucune donnée avec ces filtres.</div>'))

    # Graphes: Plotly si possible, sinon Matplotlib
    try:
        import plotly.express as px
        with out_plot1:
            clear_output(wait=True)
            d = apply_filters(df)
            if len(d):
                ds = d.sample(min(4000, len(d)), random_state=0)
                fig = px.scatter(ds, x="surface_reelle_bati", y="prix_m2",
                                 hover_data=["nom_commune","code_postal","annee"],
                                 trendline="lowess", height=420)
                fig.update_layout(margin=dict(l=10,r=10,t=10,b=10), template="plotly_dark")
                fig.show()
            else:
                display(HTML('<div class="card">—</div>'))

        with out_plot2:
            clear_output(wait=True)
            d = apply_filters(df)
            if len(d) and w_hist.value and d["prix_m2"].notna().any():
                cut = d["prix_m2"].clip(upper=d["prix_m2"].quantile(0.99))
                fig = px.histogram(cut, x=cut, nbins=45, height=380)
                fig.update_layout(margin=dict(l=10,r=10,t=10,b=10), template="plotly_dark")
                fig.update_xaxes(title="Prix/m² (€)")
                fig.update_yaxes(title="Fréquence")
                fig.show()
            else:
                display(HTML('<div class="card">—</div>'))

    except Exception:
        with out_plot1:
            clear_output(wait=True)
            d = apply_filters(df)
            if len(d):
                ds = d.sample(min(4000, len(d)), random_state=0)
                plt.figure(figsize=(6.8,4.2))
                plt.scatter(ds["surface_reelle_bati"], ds["prix_m2"], s=12, alpha=.6)
                plt.title("Prix/m² vs Surface (échantillon)")
                plt.xlabel("Surface (m²)"); plt.ylabel("Prix/m² (€)")
                plt.grid(alpha=.2); plt.tight_layout()
                plt.show()
            else:
                display(HTML('<div class="card">—</div>'))

        with out_plot2:
            clear_output(wait=True)
            d = apply_filters(df)
            if len(d) and w_hist.value and d["prix_m2"].notna().any():
                cut = d["prix_m2"].clip(upper=d["prix_m2"].quantile(0.99))
                plt.figure(figsize=(6.8,3.8))
                plt.hist(cut, bins=45)
                plt.title("Distribution du prix/m² (troncature 99e centile)")
                plt.xlabel("Prix/m² (€)"); plt.ylabel("Fréquence")
                plt.grid(alpha=.2); plt.tight_layout()
                plt.show()
            else:
                display(HTML('<div class="card">—</div>'))

# -- Export CSV ----------------------------------------------------------------------------------
def on_export_clicked(_):
    d = apply_filters(df)
    out_fp = os.path.join(CLEAN_DIR, "dvf_filtre_export.csv")
    d.to_csv(out_fp, index=False)
    w_msg.value = f'<span style="color:#22d3ee">Exporté : {out_fp}</span>'

w_btn_export.on_click(on_export_clicked)

# -- Mise en page (onglets) ----------------------------------------------------------------------
controls_left = W.VBox([
    W.HTML('<div class="card"><div class="controls-grid"></div></div>'),  # placeholder css card
    W.VBox([
        W.HTML('<div class="card"><div class="app-subtitle" style="margin:0 0 10px 0;">Paramètres</div>'),
        W.VBox([
            w_surface, w_loyer, w_topn, w_iqr, w_hist, w_year, w_commune,
            W.HBox([w_btn_export, w_msg])
        ], layout=W.Layout(padding="0 6px 8px 6px"))
    ])
])

tab = W.Tab(children=[
    W.VBox([out_overview]),
    W.VBox([out_table]),
    W.VBox([out_plot1]),
    W.VBox([out_plot2]),
])
tab.set_title(0, "Vue d’ensemble")
tab.set_title(1, "Communes")
tab.set_title(2, "Dispersion")
tab.set_title(3, "Distribution")

layout = W.HBox([
    W.HTML('<div class="app-wrap" style="width:16px;visibility:hidden"></div>'),
    W.VBox([controls_left], layout=W.Layout(min_width="360px", width="380px")),
    W.VBox([tab], layout=W.Layout(flex="1 1 auto", width="100%"))
])
display(layout)

# -- Observers -----------------------------------------------------------------------------------
for w in (w_surface, w_loyer, w_topn, w_iqr, w_hist, w_commune):
    w.observe(render, "value")
if isinstance(w_year, W.SelectionRangeSlider):
    w_year.observe(render, "value")

render()


HBox(children=(HTML(value='<div class="app-wrap" style="width:16px;visibility:hidden"></div>'), VBox(children=…

## Chargement + typage + IDF + prix/m²

In [15]:
import os, re, unicodedata as ud
import pandas as pd

CLEAN_DIR = os.path.abspath(os.path.join("..", "data", "clean"))
os.makedirs(CLEAN_DIR, exist_ok=True)
codes_parquet = os.path.join(CLEAN_DIR, "codes_postaux.parquet")

def _norm_name(c: str) -> str:
    # normalise: strip, retire '#', enlève accents, lower, remplace non-alnum par '_', compresse '__'
    c = (c or "").strip().lstrip("#")
    c = ud.normalize("NFKD", c).encode("ascii","ignore").decode("ascii")
    c = re.sub(r"[^A-Za-z0-9]+", "_", c).strip("_").lower()
    c = re.sub(r"_+", "_", c)
    return c

def _find_col(cols, candidates):
    """retourne le premier nom présent parmi les 'candidates' (déjà normalisés)"""
    for cand in candidates:
        if cand in cols:
            return cand
    return None

def read_codes_postaux_robust(path, cache_fp=codes_parquet, force_reload=False):
    # 0) Cache parquet ultra-rapide
    if os.path.exists(cache_fp) and not force_reload:
        dfp = pd.read_parquet(cache_fp)
        # vérifie qu'on a bien l'INSEE
        if "code_commune_insee" in dfp.columns:
            return dfp

    # 1) Lecture rapide (moteur C) – encodage cp1252 (classique), sinon latin-1
    try:
        raw = pd.read_csv(path, sep=";", dtype=str, encoding="cp1252", engine="c", low_memory=True)
    except UnicodeDecodeError:
        raw = pd.read_csv(path, sep=";", dtype=str, encoding="latin-1", engine="c", low_memory=True)

    # 2) Normalise tous les noms de colonnes
    norm_cols = [_norm_name(c) for c in raw.columns]
    raw.columns = norm_cols
    cols = set(norm_cols)

    # 3) Détection robuste des colonnes utiles
    insee_col = _find_col(cols, ["code_commune_insee", "code_insee_commune", "codeinsee", "insee", "code_commune"])
    cp_col    = _find_col(cols, ["code_postal", "cp", "codepostal"])
    nom_col   = _find_col(cols, ["nom_de_la_commune", "nom_commune", "commune", "nom"])
    lib_col   = _find_col(cols, ["libelle_d_acheminement", "libelle_acheminement", "libelle_d_acheminement"])
    l5_col    = _find_col(cols, ["ligne_5", "ligne5"])

    if insee_col is None:
        # aide au debug : affiche les colonnes rencontrées
        raise KeyError(
            "Impossible de trouver la colonne INSEE. Colonnes normalisées lues : "
            + ", ".join(sorted(cols))
        )

    # 4) Sous-sélection & renommage stable
    keep_map = {}
    keep_map[insee_col] = "code_commune_insee"
    if cp_col:  keep_map[cp_col]  = "code_postal"
    if nom_col: keep_map[nom_col] = "nom_commune"
    if lib_col: keep_map[lib_col] = "libelle_acheminement"
    if l5_col:  keep_map[l5_col]  = "ligne_5"

    df = raw[list(keep_map.keys())].rename(columns=keep_map)

    # 5) Formats
    df["code_commune_insee"] = df["code_commune_insee"].astype(str).str.strip().str.zfill(5)
    if "code_postal" in df.columns:
        df["code_postal"] = df["code_postal"].astype(str).str.strip().str.zfill(5)

    # 6) Réduction à 1 ligne par INSEE (code postal “référence” choisi par ordre)
    # 6) Réduction à 1 ligne par INSEE (choix d’un code postal “référence” par ordre)
    agg_dict = {}
    if "code_postal" in df.columns:
        agg_dict["code_postal"] = "first"
    if "nom_commune" in df.columns:
        agg_dict["nom_commune"] = "first"
    if "libelle_acheminement" in df.columns:
        agg_dict["libelle_acheminement"] = "first"

    # Si aucune colonne à agréger n'est dispo, on garde juste la liste unique des INSEE
    if not agg_dict:
        df_ref = df[["code_commune_insee"]].drop_duplicates().copy()
    else:
        sort_cols = ["code_commune_insee"] + (["code_postal"] if "code_postal" in df.columns else [])
        df_ref = (
            df.sort_values(sort_cols)
            .groupby("code_commune_insee", as_index=False)
            .agg(agg_dict)                # <-- dict, pas **kwargs
        )

    df_ref.to_parquet(cache_fp, index=False)
    return df_ref
codes = read_codes_postaux_robust(codes_csv, force_reload=True)
print("CODES ->", codes.shape, list(codes.columns))
codes.head()



CODES -> (35007, 4) ['code_commune_insee', 'code_postal', 'nom_commune', 'libelle_acheminement']


Unnamed: 0,code_commune_insee,code_postal,nom_commune,libelle_acheminement
0,1001,1400,L ABERGEMENT CLEMENCIAT,L ABERGEMENT CLEMENCIAT
1,1002,1640,L ABERGEMENT DE VAREY,L ABERGEMENT DE VAREY
2,1004,1500,AMBERIEU EN BUGEY,AMBERIEU EN BUGEY
3,1005,1330,AMBERIEUX EN DOMBES,AMBERIEUX EN DOMBES
4,1006,1300,AMBLEON,AMBLEON


## Lecture de la base codes postaux

In [16]:
def load_dvf_chunked(dvf_path, clean_fp):
    # Si un parquet propre existe déjà : le lire directement (plus rapide)
    if os.path.exists(clean_fp):
        df = pd.read_parquet(clean_fp)
        return df

    usecols = [
        "Date mutation","Nature mutation","Valeur fonciere",
        "Code postal","Commune","Code departement","Code commune",
        "Type local","Surface reelle bati","Nombre pieces principales"
    ]
    idf_prefix = ("75","77","78","91","92","93","94","95")

    keep = []
    chunksize = 200_000
    # encodage DVF souvent cp1252
    for i, chunk in enumerate(pd.read_csv(
        dvf_path, sep="|", dtype=str, usecols=usecols,
        chunksize=chunksize, low_memory=True, engine="c", encoding="cp1252"
    )):
        # vente + IDF
        chunk = chunk[chunk["Nature mutation"].fillna("").str.contains("Vente", case=False, na=False)]
        chunk = chunk[chunk["Code departement"].astype(str).str.startswith(idf_prefix)]
        if chunk.empty:
            continue

        # num + filtres
        chunk["Valeur fonciere"] = to_num_fr(chunk["Valeur fonciere"])
        chunk["Surface reelle bati"] = pd.to_numeric(chunk["Surface reelle bati"], errors="coerce")

        chunk = chunk[(chunk["Surface reelle bati"] > 8) & (chunk["Valeur fonciere"] > 1000)]
        chunk["prix_m2"] = chunk["Valeur fonciere"] / chunk["Surface reelle bati"]
        chunk = chunk[chunk["prix_m2"].between(100, 30000)]

        # dates + renommage
        chunk["Date mutation"] = pd.to_datetime(chunk["Date mutation"], errors="coerce")
        chunk = chunk.rename(columns={
            "Date mutation":"date_mutation",
            "Valeur fonciere":"valeur_fonciere",
            "Surface reelle bati":"surface_reelle_bati",
            "Commune":"nom_commune",
            "Code postal":"code_postal",
            "Type local":"type_local",
            "Code departement":"code_departement",
            "Code commune":"code_commune_3"
        })

        # construire code INSEE (dept 2 + commune 3) — valable en IDF
        chunk["code_departement"] = chunk["code_departement"].astype(str).str.strip().str.zfill(2)
        chunk["code_commune_3"]  = chunk["code_commune_3"].astype(str).str.strip().str.zfill(3)
        chunk["code_commune_insee"] = chunk["code_departement"] + chunk["code_commune_3"]

        keep.append(chunk[[
            "date_mutation","valeur_fonciere","surface_reelle_bati","prix_m2",
            "nom_commune","code_postal","type_local","code_commune_insee"
        ]])

        if (i+1) % 5 == 0:
            print(f"[Progression] {i+1} chunks traités...")

    if not keep:
        raise RuntimeError("Aucune ligne DVF retenue après filtrage.")

    df = pd.concat(keep, ignore_index=True)
    df["annee"] = pd.to_datetime(df["date_mutation"], errors="coerce").dt.year

    # Merge codes postaux via INSEE → 1 ligne par INSEE côté référentiel
    codes_ref = (codes
                 .dropna(subset=["code_commune_insee"])
                 .sort_values(["code_commune_insee","code_postal"])
                 .groupby("code_commune_insee", as_index=False)
                 .agg(code_postal_ref=("code_postal","first"),
                      nom_commune_ref=("nom_commune","first"),
                      libelle_acheminement_ref=("libelle_acheminement","first")))

    df = df.merge(codes_ref, on="code_commune_insee", how="left")
    # harmoniser nom_commune si manquants
    if "nom_commune" not in df.columns or df["nom_commune"].isna().mean() > 0.5:
        df["nom_commune"] = df["nom_commune_ref"].fillna(df.get("nom_commune", pd.Series(index=df.index)))

    # colonnes finales minimales
    df_min = df[[
        "date_mutation","annee","valeur_fonciere","surface_reelle_bati","prix_m2",
        "nom_commune","code_commune_insee","code_postal_ref"
    ]].rename(columns={"code_postal_ref":"code_postal"})

    df_min.to_parquet(clean_fp, index=False)
    print(f"[OK] Sauvegardé → {clean_fp} ({len(df_min):,} lignes)")
    return df_min

clean_fp = os.path.join(CLEAN_DIR, "dvf_clean.parquet")
dvf = load_dvf_chunked(dvf_txt, clean_fp)
print("DVF ->", dvf.shape, list(dvf.columns))


NameError: name 'dvf_txt' is not defined

In [None]:
import os
clean_fp = os.path.join("..", "data", "clean", "dvf_clean.parquet")
if os.path.exists(clean_fp):
    os.remove(clean_fp)
    print("Deleted:", clean_fp)
import os, pandas as pd

RAW_DIR   = os.path.abspath(os.path.join("..", "data", "raw"))
CLEAN_DIR = os.path.abspath(os.path.join("..", "data", "clean"))
os.makedirs(CLEAN_DIR, exist_ok=True)

raw_txt  = os.path.join(RAW_DIR, "DVF_2025_S1.txt")
clean_fp = os.path.join(CLEAN_DIR, "dvf_clean.parquet")

def to_num_fr(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.replace("\u00A0","", regex=False)  # NBSP
    s = s.str.replace(" ", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(s, errors="coerce")

def load_dvf_chunked(dvf_path=raw_txt, out_fp=clean_fp):
    if os.path.exists(out_fp):
        return pd.read_parquet(out_fp)

    usecols = [
        "Date mutation","Nature mutation","Valeur fonciere",
        "Code postal","Commune","Code departement","Code commune",
        "Type local","Surface reelle bati","Nombre pieces principales"
    ]
    idf_prefix = ("75","77","78","91","92","93","94","95")
    chunksize = 200_000
    keep = []

    # Try UTF-8 first (file shows UTF-8 symptoms), then fallback CP1252
    tried = []
    for enc in ("utf-8", "cp1252"):
        try:
            reader = pd.read_csv(
                dvf_path, sep="|", dtype=str, usecols=usecols,
                chunksize=chunksize, low_memory=True, engine="c", encoding=enc
            )
            tried.append(enc)
            for i, chunk in enumerate(reader):
                # Filter Vente + IDF early
                chunk = chunk[chunk["Nature mutation"].fillna("").str.contains("Vente", case=False, na=False)]
                chunk = chunk[chunk["Code departement"].astype(str).str.startswith(idf_prefix)]
                if chunk.empty:
                    continue

                # Numeric conversions
                chunk["Valeur fonciere"]     = to_num_fr(chunk["Valeur fonciere"])
                chunk["Surface reelle bati"] = pd.to_numeric(chunk["Surface reelle bati"], errors="coerce")

                # Date parse — explicit dd/mm/yyyy
                dm = chunk["Date mutation"].astype(str).str.strip()
                dt = pd.to_datetime(dm, format="%d/%m/%Y", errors="coerce")

                # Small fallback for rare ISO rows (yyyy-mm-dd)
                m = dt.isna()
                if m.any():
                    dt.loc[m] = pd.to_datetime(dm.loc[m], errors="coerce", dayfirst=True)
                chunk["Date mutation"] = dt

                # Keep only plausible values
                chunk = chunk[(chunk["Surface reelle bati"] > 8) & (chunk["Valeur fonciere"] > 1000)]
                chunk["prix_m2"] = chunk["Valeur fonciere"] / chunk["Surface reelle bati"]
                chunk = chunk[chunk["prix_m2"].between(100, 30000)]

                # Build INSEE key (IDF => dept 2 + commune 3)
                chunk = chunk.rename(columns={
                    "Date mutation":"date_mutation",
                    "Valeur fonciere":"valeur_fonciere",
                    "Surface reelle bati":"surface_reelle_bati",
                    "Commune":"nom_commune",
                    "Code postal":"code_postal",
                    "Type local":"type_local",
                    "Code departement":"code_departement",
                    "Code commune":"code_commune_3"
                })
                chunk["code_departement"]   = chunk["code_departement"].astype(str).str.strip().str.zfill(2)
                chunk["code_commune_3"]     = chunk["code_commune_3"].astype(str).str.strip().str.zfill(3)
                chunk["code_commune_insee"] = chunk["code_departement"] + chunk["code_commune_3"]

                keep.append(chunk[[
                    "date_mutation","valeur_fonciere","surface_reelle_bati","prix_m2",
                    "nom_commune","code_postal","type_local","code_commune_insee"
                ]])

                if (i+1) % 5 == 0:
                    print(f"[{enc}] chunks: {i+1}  rows kept: ~{sum(len(k) for k in keep):,}")
            break
        except Exception as e:
            tried.append(f"{enc} (failed: {e.__class__.__name__})")
            continue

    if not keep:
        raise RuntimeError(f"DVF read yielded no rows. Tried encodings: {tried}")

    df = pd.concat(keep, ignore_index=True)
    df["annee"] = pd.to_datetime(df["date_mutation"], errors="coerce").dt.year
    df.to_parquet(out_fp, index=False)
    print(f"[OK] Saved → {out_fp}  rows={len(df):,}")
    return df

dvf = load_dvf_chunked()
print(dvf.shape, dvf["date_mutation"].isna().mean(), dvf["annee"].isna().mean())



In [None]:
import re
import numpy as np
import pandas as pd

def parse_dates_robust(s: pd.Series) -> pd.Series:
    """Nettoyage + parsing multi-formats pour DVF date_mutation."""
    # 1) Normalise la chaîne (retire NBSP, trim, garde chiffres et /-)
    x = (s.astype(str)
           .str.replace("\u00A0"," ", regex=False)   # NBSP
           .str.strip()
           .str.replace(r"[^0-9/\-]", "", regex=True))

    out = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")

    # 2) yyyy-mm-dd
    m1 = x.str.match(r"^\d{4}-\d{2}-\d{2}$", na=False)
    if m1.any():
        out.loc[m1] = pd.to_datetime(x.loc[m1], format="%Y-%m-%d", errors="coerce")

    # 3) dd/mm/yyyy
    m2 = x.str.match(r"^\d{1,2}/\d{1,2}/\d{4}$", na=False)
    if m2.any():
        out.loc[m2] = pd.to_datetime(x.loc[m2], format="%d/%m/%Y", errors="coerce")

    # 4) dd/mm/yy → suppose 00–69 => 2000–2069, 70–99 => 1970–1999
    m3 = x.str.match(r"^\d{1,2}/\d{1,2}/\d{2}$", na=False)
    if m3.any():
        tmp = x.loc[m3].copy()
        # ajoute le siècle
        yy = tmp.str.extract(r"(\d{2})$")[0].astype(int)
        century = np.where(yy >= 70, "19", "20")
        # reconstruit en dd/mm/yyyy
        tmp_full = tmp.str.replace(r"/(\d{2})$", lambda m: f"/{century[tmp.index.get_loc(m.start())]}{m.group(1)}", regex=True)
        out.loc[m3] = pd.to_datetime(tmp_full, format="%d/%m/%Y", errors="coerce")

    # 5) Dernière passe permissive sur le reste (dayfirst=True)
    rem = out.isna()
    if rem.any():
        out.loc[rem] = pd.to_datetime(x.loc[rem], errors="coerce", dayfirst=True)

    return out

# --- applique sur ton df déjà chargé (depuis dvf_clean.parquet) ---
before = dvf.get("annee")
dvf["date_mutation"] = parse_dates_robust(dvf["date_mutation"])
dvf["annee"] = dvf["date_mutation"].dt.year

print("Taux de dates invalides après parse robuste :",
      dvf["date_mutation"].isna().mean().round(4))

# Diagnostique rapide si encore élevé
if dvf["date_mutation"].isna().mean() > 0.1:
    print("Exemples de valeurs non parsées :")
    print(dvf.loc[dvf["date_mutation"].isna(), "date_mutation"].astype(str).value_counts().head(10))

# Sauvegarde propre
dvf.to_parquet(clean_fp, index=False)
print("✅ Parquet regénéré :", clean_fp)


In [None]:
import re
import pandas as pd

# 1) Retrouver la colonne date quel que soit son nom
name_map = {c.lower().strip(): c for c in dvf.columns}
date_candidates = ["date_mutation", "date mutation", "date_mut", "date"]
date_col = next((name_map[k] for k in date_candidates if k in name_map), None)

if date_col is None:
    # tentative heuristique: première colonne contenant 'date'
    date_col = next((c for c in dvf.columns if re.search(r"\bdate\b", c, flags=re.I)), None)

if date_col is None:
    raise KeyError(f"Aucune colonne date trouvée dans dvf. Colonnes: {list(dvf.columns)}")

# 2) (Re)calculer l'année de façon sûre
# Recalcule proprement l'année avec format français (jour en premier)
dvf["annee"] = pd.to_datetime(dvf["date_mutation"], errors="coerce", dayfirst=True).dt.year

# Vérifie la validité
nan_rate = dvf["annee"].isna().mean()
print(f"Taux de dates invalides après correction : {nan_rate:.1%}")

# Sauvegarde propre
dvf.to_parquet(clean_fp, index=False)
print("✅ Parquet mis à jour avec dates corrigées :", clean_fp)

# 3) Optionnel: si beaucoup de NaN d'année, signale le problème de parsing
nan_rate = dvf["annee"].isna().mean()
if nan_rate > 0.5:
    print(f"⚠️ Plus de 50% des dates sont invalides ({nan_rate:.0%}). Vérifie le format de '{date_col}'.")

# 4) Sauvegarder à nouveau le parquet avec 'annee' pour accélérer les prochains runs
from pathlib import Path
CLEAN_DIR = Path(CLEAN_DIR)
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
clean_fp = CLEAN_DIR / "dvf_clean.parquet"
dvf.to_parquet(clean_fp, index=False)
print("✅ 'annee' ajoutée et parquet mis à jour →", clean_fp)



In [None]:
display(dvf.head(5))
print(dvf["annee"].value_counts().sort_index().tail(10))
print(dvf["nom_commune"].value_counts().head(10))


popo


In [None]:
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import IntRangeSlider, FloatSlider, IntSlider, Checkbox, HBox, VBox, Output, HTML
from IPython.display import display, clear_output

warnings.filterwarnings("ignore")

CLEAN_DIR = os.path.abspath(os.path.join("..", "data", "clean"))
df = pd.read_parquet(os.path.join(CLEAN_DIR, "dvf_clean.parquet"))

# colonnes minimales attendues
needed = {"date_mutation","annee","valeur_fonciere","surface_reelle_bati","prix_m2","nom_commune"}
missing = needed - set(df.columns)
assert not missing, f"Colonnes manquantes: {missing}"

# filtres de base (par sécurité)
df = df.dropna(subset=["surface_reelle_bati","valeur_fonciere","prix_m2","nom_commune"]).copy()
df = df[(df["surface_reelle_bati"] > 8) & (df["valeur_fonciere"] > 1000) & (df["prix_m2"].between(100, 30000))]

len(df), df["annee"].min(), df["annee"].max()


In [None]:
w_surface = IntRangeSlider(value=(20, 80), min=10, max=200, step=1,
                           description="Surface (m²)", continuous_update=False)

w_loyer = FloatSlider(value=22.0, min=5.0, max=45.0, step=0.5,
                      readout_format=".1f", description="Loyer €/m²", continuous_update=False)

w_topn = IntSlider(value=15, min=5, max=50, step=1, description="Top N", continuous_update=False)
w_out  = Checkbox(value=True, description="Filtrer outliers (IQR)")

out = Output()
display(VBox([HBox([w_surface, w_loyer]), HBox([w_topn, w_out]), out]))


In [None]:
def iqr_bounds(s, k=2.0):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k*iqr, q3 + k*iqr

def apply_outliers(d, use_iqr=True):
    if not use_iqr or len(d) < 50:
        return d
    lo, hi = iqr_bounds(d["prix_m2"], k=2.0)
    return d[d["prix_m2"].between(lo, hi)]

def filter_by_surface(d: pd.DataFrame, s_range):
    smin, smax = s_range
    return d[d["surface_reelle_bati"].between(smin, smax)]

def compute_yield(d: pd.DataFrame, loyer_m2: float) -> pd.DataFrame:
    res = d.copy()
    res["revenu_annuel"] = loyer_m2 * res["surface_reelle_bati"] * 12.0
    res["yield_brut"]    = res["revenu_annuel"] / res["valeur_fonciere"]
    return res

def render(_=None):
    with out:
        clear_output(wait=True)

        d = filter_by_surface(df, w_surface.value)
        d = apply_outliers(d, w_out.value)
        d = compute_yield(d, w_loyer.value)

        # Classement communes par rendement médian
        rank = (d.groupby("nom_commune", dropna=False)
                  .agg(nb=("prix_m2","count"),
                       prix_m2_med=("prix_m2","median"),
                       surf_med=("surface_reelle_bati","median"),
                       yield_brut_med=("yield_brut","median"),
                       yield_p90=("yield_brut", lambda s: s.quantile(0.90)))
                  .sort_values("yield_brut_med", ascending=False)
                  .head(w_topn.value)
                  .reset_index())

        display(HTML("<h4>Top communes (rendement brut médian)</h4>"))
        display(rank)

        # Histogramme (cap à 25% pour lisibilité)
        plt.figure()
        d["yield_brut"].dropna().clip(upper=0.25).plot(kind="hist", bins=40)
        plt.xlabel("Rendement brut")
        plt.ylabel("Nombre de ventes")
        plt.title("Distribution des rendements (Surface + Loyer €/m²)")
        plt.show()

w_surface.observe(render, "value")
w_loyer.observe(render, "value")
w_topn.observe(render, "value")
w_out.observe(render, "value")
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import IntRangeSlider, FloatSlider, Checkbox, IntSlider, Output, VBox, HBox, HTML
from IPython.display import display, clear_output

# --- Widgets ---
w_surface = IntRangeSlider(
    value=[15, 60],
    min=10, max=200, step=5,
    description='Surface (m²)',
    continuous_update=False,
    layout={'width': '400px'}
)

w_loyer = FloatSlider(
    value=25.0, min=10, max=60, step=0.5,
    description='Loyer €/m²/mois',
    continuous_update=False,
    layout={'width': '400px'}
)

w_topn = IntSlider(
    value=10, min=5, max=50, step=5,
    description='Top N',
    continuous_update=False,
    layout={'width': '400px'}
)

w_out = Checkbox(
    value=True,
    description='Exclure outliers (IQR)',
)

out = Output()

# --- Assemblage des contrôles ---
controls = VBox([
    HTML("<h3>Paramètres d’analyse</h3>"),
    HBox([w_surface, w_loyer]),
    HBox([w_topn, w_out])
])

display(controls, out)

render()


In [None]:
# ==== Dashboard Surface & Loyer (Widgets 5 et 6) ====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import ipywidgets as W
from IPython.display import display, clear_output, HTML

# --------- Helpers métriques ---------
def iqr_bounds(s, k=2.0):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k*iqr, q3 + k*iqr

def apply_outliers(d, use_iqr=True):
    if not use_iqr or len(d) < 50:
        return d
    lo, hi = iqr_bounds(d["prix_m2"], k=2.0)
    return d[d["prix_m2"].between(lo, hi)]

def filter_by_surface(d: pd.DataFrame, s_range):
    smin, smax = s_range
    return d[d["surface_reelle_bati"].between(smin, smax)]

def compute_yield(d: pd.DataFrame, loyer_m2: float) -> pd.DataFrame:
    res = d.copy()
    res["revenu_annuel"] = loyer_m2 * res["surface_reelle_bati"] * 12.0
    res["yield_brut"]    = res["revenu_annuel"] / res["valeur_fonciere"]
    return res

def kpis_html(d):
    nb = f"{len(d):,}".replace(",", " ")
    p50 = (d["prix_m2"].median() if len(d) else np.nan)
    y50 = (d["yield_brut"].median()*100 if len(d) else np.nan)
    return HTML(f"""
    <div style='display:flex;gap:28px;margin:8px 0 12px 0;font-family:system-ui,Segoe UI,Roboto,Arial'>
      <div><div style="opacity:.7">Nb ventes</div><div style="font-size:18px">{nb}</div></div>
      <div><div style="opacity:.7">Prix/m² médian</div><div style="font-size:18px">{p50:.0f} €</div></div>
      <div><div style="opacity:.7">Rendement médian</div><div style="font-size:18px">{y50:.2f} %</div></div>
    </div>""")

# --------- Widgets (5 & 6 + options utiles) ---------
w_surface = W.IntRangeSlider(value=[20, 80], min=10, max=200, step=1,
                             description='Surface (m²)', continuous_update=False, layout=W.Layout(width='360px'))
w_loyer   = W.FloatSlider(value=22.0, min=5.0, max=45.0, step=0.5,
                          readout_format=".1f", description='Loyer €/m²', continuous_update=False, layout=W.Layout(width='360px'))
w_topn    = W.IntSlider(value=15, min=5, max=50, step=1, description='Top N', continuous_update=False, layout=W.Layout(width='360px'))
w_out     = W.Checkbox(value=True, description='Exclure outliers (IQR)')

# (option) filtre par type_local si présent
type_opts = sorted(df["type_local"].dropna().unique()) if "type_local" in df.columns else []
w_type    = W.SelectMultiple(options=type_opts, value=tuple(type_opts) if type_opts else (),
                             description='Type', rows=min(6, len(type_opts))) if type_opts else None

# Actions
btn_reset = W.Button(description="Réinitialiser filtres")
btn_export = W.Button(description="Exporter le classement (CSV)")

# Zones d’affichage
out_table = W.Output()
out_plot1 = W.Output()
out_plot2 = W.Output()
out_kpi   = W.Output()

# --------- Rendu principal ---------
def render(_=None):
    with out_table:
        clear_output(wait=True)
    with out_plot1:
        clear_output(wait=True)
    with out_plot2:
        clear_output(wait=True)
    with out_kpi:
        clear_output(wait=True)

    # 1) filtrage surface + (option) type
    d = filter_by_surface(df, w_surface.value)
    if w_type and len(w_type.value) > 0:
        d = d[d["type_local"].isin(w_type.value)]

    # 2) outliers IQR
    d = apply_outliers(d, w_out.value)

    # 3) rendement
    d = compute_yield(d, w_loyer.value)

    # 4) KPIs
    with out_kpi:
        display(kpis_html(d))

    # 5) Classement communes
    rank = (d.groupby("nom_commune", dropna=False)
              .agg(nb=("prix_m2","count"),
                   prix_m2_med=("prix_m2","median"),
                   surf_med=("surface_reelle_bati","median"),
                   yield_brut_med=("yield_brut","median"),
                   yield_p90=("yield_brut", lambda s: s.quantile(0.90)))
              .sort_values("yield_brut_med", ascending=False)
              .head(w_topn.value)
              .reset_index())

    # 6) Tableau
    with out_table:
        display(HTML("<h4>Top communes (rendement brut médian)</h4>"))
        display(rank)

    # 7) Graphes (Matplotlib uniquement)
    # 7a) Histogramme des rendements
    with out_plot1:
        plt.figure(figsize=(6,4))
        d["yield_brut"].dropna().clip(upper=0.25).plot(kind="hist", bins=40)
        plt.xlabel("Rendement brut")
        plt.ylabel("Nombre de ventes")
        plt.title("Distribution des rendements (Surface & Loyer)")
        plt.show()

    # 7b) Barplot Top N par rendement médian
    with out_plot2:
        plt.figure(figsize=(7.5,4.5))
        x = rank["nom_commune"].astype(str)
        y = (rank["yield_brut_med"]*100).round(2)
        plt.bar(x, y)
        plt.xticks(rotation=60, ha="right")
        plt.ylabel("Rendement médian (%)")
        plt.title("Top communes par rendement médian")
        plt.tight_layout()
        plt.show()

    # 8) Stocker le dernier ranking pour export
    btn_export._last_rank = rank.copy()

# --------- Actions boutons ---------
def on_reset(_):
    w_surface.value = (20, 80)
    w_loyer.value   = 22.0
    w_topn.value    = 15
    w_out.value     = True
    if w_type:
        w_type.value = tuple(type_opts)
    render()

def on_export(_):
    rank = getattr(btn_export, "_last_rank", None)
    if rank is None or rank.empty:
        with out_table:
            display(HTML("<i>Aucun classement à exporter.</i>"))
        return
    # Export dans data/clean
    out_csv = os.path.join(CLEAN_DIR, "classement_communes_surface_loyer.csv")
    rank.to_csv(out_csv, index=False, encoding="utf-8")
    with out_table:
        display(HTML(f"<b>Exporté :</b> {out_csv}"))

btn_reset.on_click(on_reset)
btn_export.on_click(on_export)

# --------- Layout / Interface ---------
controls_left = [
    HTML("<h3 style='margin:0 0 6px 0'>Filtres</h3>"),
    w_surface, w_loyer, w_topn, w_out
]
if w_type:
    controls_left.insert(2, w_type)

box_left = W.VBox(controls_left + [W.HBox([btn_reset, btn_export])],
                  layout=W.Layout(width="400px"))

box_right = W.VBox([
    out_kpi,
    out_table,
    W.HBox([out_plot1, out_plot2])
])

ui = W.HBox([box_left, box_right], layout=W.Layout(align_items="flex-start"))
display(ui)

# Observers
w_surface.observe(render, "value")
w_loyer.observe(render, "value")
w_topn.observe(render, "value")
w_out.observe(render, "value")
if w_type:
    w_type.observe(render, "value")

# Premier rendu
render()


---

### Import automatique de `02_widget5_6.ipynb`

## Imports, chargement robuste et normalisation des colonnes

In [None]:
# --- Imports
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import IntRangeSlider, FloatSlider, HBox, Output, HTML
from IPython.display import display, clear_output

warnings.filterwarnings("ignore")

CLEAN_DIR = os.path.abspath(os.path.join("..", "data", "clean"))
RAW_DIR   = os.path.abspath(os.path.join("..", "data", "raw"))
clean_fp  = os.path.join(CLEAN_DIR, "dvf_clean.parquet")
raw_txt   = os.path.join(RAW_DIR, "DVF_2025_S1.txt")  # ton fichier texte DVF renommé

def _to_num(s):
    return pd.to_numeric(s.astype(str).str.replace(",", ".", regex=False), errors="coerce")

def load_dvf():
    # 1) Essaye d'abord le parquet propre
    if os.path.exists(clean_fp):
        df = pd.read_parquet(clean_fp)
    else:
        # 2) Sinon, fallback rapide sur le .txt DVF (séparateur |)
        usecols = [
            "Date mutation","Nature mutation","Valeur fonciere",
            "Code postal","Commune","Code departement","Code commune",
            "Type local","Surface reelle bati","Nombre pieces principales"
        ]
        df = pd.read_csv(raw_txt, sep="|", dtype=str, low_memory=False)
        df = df[[c for c in usecols if c in df.columns]].copy()

        # Typage & filtres minimum
        df["Valeur fonciere"] = _to_num(df["Valeur fonciere"])
        df["Surface reelle bati"] = _to_num(df["Surface reelle bati"])
        df["Date mutation"] = pd.to_datetime(df["Date mutation"], errors="coerce")

        # Ventes + Île-de-France
        df = df[df["Nature mutation"].fillna("").str.contains("Vente", case=False, na=False)]
        idf_prefix = ("75","77","78","91","92","93","94","95")
        df = df[df["Code departement"].astype(str).str.startswith(idf_prefix)]

        # prix/m²
        df["prix_m2"] = df["Valeur fonciere"] / df["Surface reelle bati"]
        df = df[(df["Surface reelle bati"] > 8) & (df["prix_m2"].between(100, 30000))]

        # Sauvegarde clean (optionnel)
        os.makedirs(CLEAN_DIR, exist_ok=True)
        df.rename(columns={
            "Date mutation":"date_mutation",
            "Valeur fonciere":"valeur_fonciere",
            "Surface reelle bati":"surface_reelle_bati",
            "Commune":"nom_commune",
            "Code postal":"code_postal",
            "Type local":"type_local"
        }, inplace=True)
        df.to_parquet(clean_fp, index=False)

    # Normalisation des noms attendus par le dashboard
    rename_map = {
        "Date mutation":"date_mutation",
        "Valeur fonciere":"valeur_fonciere",
        "Surface reelle bati":"surface_reelle_bati",
        "Commune":"nom_commune",
        "Code postal":"code_postal",
        "Type local":"type_local"
    }
    df = df.rename(columns=rename_map)

    # Colonnes minimales
    needed = ["date_mutation","valeur_fonciere","surface_reelle_bati","prix_m2","nom_commune"]
    # Si prix_m2 absent (cas parquet externe), on le recalcule
    if "prix_m2" not in df.columns and all(c in df.columns for c in ["valeur_fonciere","surface_reelle_bati"]):
        df["prix_m2"] = df["valeur_fonciere"] / df["surface_reelle_bati"]

    # Nettoyage final minimal
    df = df.dropna(subset=["surface_reelle_bati","valeur_fonciere","prix_m2","nom_commune"]).copy()
    df = df[(df["surface_reelle_bati"] > 8) & (df["valeur_fonciere"] > 1000) & (df["prix_m2"].between(100, 30000))]
    df["annee"] = pd.to_datetime(df["date_mutation"], errors="coerce").dt.year
    return df[needed + ["annee"]]

df = load_dvf()
len(df), df.columns.tolist()


## Widgets 5 (surface) & 6 (loyer €/m²)

In [None]:
w_surface = IntRangeSlider(
    value=(20, 80),
    min=10, max=200, step=1,
    description="Surface (m²)",
    continuous_update=False
)

# Valeur par défaut réaliste pour IDF (tu peux ajuster)
w_loyer = FloatSlider(
    value=22.0, min=5.0, max=45.0, step=0.5,
    readout_format=".1f",
    description="Loyer €/m²",
    continuous_update=False
)

out = Output()
display(HBox([w_surface, w_loyer]), out)


## Fonctions + rendu (tableau Top communes + histogramme)

In [None]:
def filter_by_surface(d: pd.DataFrame, s_range):
    smin, smax = s_range
    return d[d["surface_reelle_bati"].between(smin, smax)]

def compute_yield(d: pd.DataFrame, loyer_m2: float) -> pd.DataFrame:
    res = d.copy()
    # Revenu annuel estimé (brut)
    res["revenu_annuel"] = loyer_m2 * res["surface_reelle_bati"] * 12.0
    res["yield_brut"] = res["revenu_annuel"] / res["valeur_fonciere"]
    return res

def render(_=None):
    with out:
        clear_output(wait=True)
        d = filter_by_surface(df, w_surface.value)
        d = compute_yield(d, w_loyer.value)

        # Classement communes par rendement médian
        g = (d.groupby("nom_commune", dropna=False)
               .agg(nb=("prix_m2","count"),
                    prix_m2_med=("prix_m2","median"),
                    surf_med=("surface_reelle_bati","median"),
                    yield_brut_med=("yield_brut","median"))
               .sort_values("yield_brut_med", ascending=False)
               .head(15)
               .reset_index())

        display(HTML("<h4>Top 15 communes (rendement brut médian)</h4>"))
        display(g)

        # Histogramme des rendements (capé à 25% pour lisibilité)
        plt.figure()
        d["yield_brut"].dropna().clip(upper=0.25).plot(kind="hist", bins=40)
        plt.xlabel("Rendement brut")
        plt.ylabel("Nombre de ventes")
        plt.title("Distribution des rendements (Surface + Loyer €/m²)")
        plt.show()

w_surface.observe(render, "value")
w_loyer.observe(render, "value")
render()


---

### Import automatique de `03_analyse_loyers_IDF.ipynb`

# 03 — Analyse des loyers (Île-de-France)

Notebook interactif pour explorer le fichier de loyers `pred-app12-mef-dhup_2024.csv`.
**Objectifs** : charger, nettoyer, filtrer IDF, visualiser distribution et proposer un widget pour lister
communes dont le loyer prédit est sous la moyenne (ou selon un seuil choisi).


In [None]:

import os, warnings
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets, HBox, VBox, fixed
warnings.filterwarnings('ignore')

# Chemins — modifiables
default_paths = [
    os.path.join("data","raw","pred-app12-mef-dhup_2024.csv"),
    os.path.join("..","data","raw","pred-app12-mef-dhup_2024.csv"),
    r"C:/Users/Victor/DataScience/Projet-Data-science-Investissement-immobilier/pred-app12-mef-dhup_2024.csv"
]

csv_path = None
for p in default_paths:
    if os.path.exists(p):
        csv_path = p; break

print('Chemin utilisé pour le fichier loyers :', csv_path or "Aucun fichier trouvé automatiquement — modifiez csv_path manuellement")


## 1) Chargement et nettoyage initial
- Lecture robuste (latin-1 ou utf-8)
- Conversion du code INSEE en str
- Extraction numérique du champ `loypredm2` et création de `loy_num`


In [None]:

# Lecture flexible
if csv_path is None:
    raise FileNotFoundError("Aucun fichier de loyers trouvé automatiquement. Déposez pred-app12-mef-dhup_2024.csv dans data/raw/ ou modifiez csv_path.")

# Essayer plusieurs encodages/sep si nécessaire
try:
    df_raw = pd.read_csv(csv_path, sep=None, engine="python", dtype=str, encoding="utf-8")
except Exception as e:
    df_raw = pd.read_csv(csv_path, sep=None, engine="python", dtype=str, encoding="latin-1")

print("Taille initiale:", df_raw.shape)
display(df_raw.head(3))


In [None]:

# Standardisation colonnes (vérifie noms)
print("Colonnes disponibles:", list(df_raw.columns))

# On s'assure que la colonne INSEE_C existe
possible_insee = [c for c in df_raw.columns if c.lower().startswith("insee") or c.lower().startswith("commune")]
print("Colonnes candidates INSEE/commune:", possible_insee)


In [None]:

# Nettoyage / extraction numérique loypredm2 -> loy_num
col_loy = None
for c in df_raw.columns:
    if 'loy' in c.lower() and 'm2' in c.lower():
        col_loy = c; break
if col_loy is None:
    # heuristique: trouver colonnes contenant 'loypred' ou 'loy'
    for c in df_raw.columns:
        if 'loypred' in c.lower() or 'loy' in c.lower():
            col_loy = c; break

if col_loy is None:
    raise ValueError("Impossible de détecter une colonne de loyer. Vérifie le fichier source.")

print('Colonne loyer détectée :', col_loy)

df = df_raw.copy()
# Nettoyage du champ loyer : remplacer virgule, extraire float
df['loy_raw'] = df[col_loy].astype(str)
df['loy_num'] = df['loy_raw'].str.replace(',', '.', regex=False).str.extract(r'([0-9]+\.?[0-9]*)')[0].astype(float)
# colonne INSEE
insee_candidates = [c for c in df.columns if c.lower().startswith('insee')]
if len(insee_candidates)>0:
    df['INSEE_C'] = df[insee_candidates[0]].astype(str).str.strip()
else:
    # tenter 'CODGEO' ou 'COM' ou 'INSEE_C'
    for c in ['CODGEO','COM','code_commune','code_INSEE','INSEE_C']:
        if c in df.columns:
            df['INSEE_C'] = df[c].astype(str).str.strip(); break

print('Total lignes après extraction loy_num:', len(df))
df[['INSEE_C','loy_raw','loy_num']].head(5)


## 2) Filtrer Île-de-France
On considère comme préfixes IDF : 75,77,78,91,92,93,94,95 (code départemental en début du code INSEE / COM).


In [None]:

# Filtrage IDF
idf_prefix = ('75','77','78','91','92','93','94','95')

if 'INSEE_C' not in df.columns:
    raise ValueError("Colonne INSEE_C manquante. Vérifie le fichier source ou adapte le notebook.")

df['INSEE_C'] = df['INSEE_C'].astype(str).str.zfill(5)
df_idf = df[df['INSEE_C'].str.startswith(idf_prefix, na=False)].copy()
print(f"Lignes totales: {len(df)} | Lignes IDF: {len(df_idf)}")

# moyenne IDF
mean_val = df_idf['loy_num'].mean(skipna=True)
print(f"Moyenne loypredm2 IDF: {mean_val:.2f} €/m²")


## 3) Visualisations (Matplotlib uniquement)
- Distribution des loyers (histogramme)
- Boxplot par département
- Carte non incluse dans ce notebook (sera faite dans notebook gares si besoin)


In [None]:

# Distribution loyers
plt.figure()
plt.hist(df_idf['loy_num'].dropna(), bins=50)
plt.title('Distribution des loyers prédits (€/m²) — IDF')
plt.xlabel('€/m²'); plt.ylabel('Nombre de lignes')
plt.show()


In [None]:

# Moyenne par département (par code INSEE -> département = 2 premiers digits)
df_idf['dept'] = df_idf['INSEE_C'].str[:2]
dept_stats = df_idf.groupby('dept')['loy_num'].agg(['count','mean','median']).reset_index().sort_values('mean', ascending=False)
plt.figure()
plt.bar(dept_stats['dept'], dept_stats['mean'])
plt.title('Loyer moyen (€/m²) par département — IDF')
plt.xlabel('Département'); plt.ylabel('Loyer moyen €/m²')
plt.show()

display(dept_stats.head(20))


## 4) Widgets interactifs
- Sélecteur de département
- Slider de seuil (<= moyenne par défaut)
- Bouton pour exporter la shortlist


In [None]:

# Widgets
dept_options = ['(Tous)'] + dept_stats['dept'].tolist()
dept_dd = widgets.Dropdown(options=dept_options, description='Dépt:')
seuil = widgets.FloatSlider(value=mean_val, min=0, max=max(df_idf['loy_num'].dropna().max(), mean_val*2), step=0.5, description='Seuil €/m²:')
topn = widgets.IntSlider(value=20, min=5, max=200, step=5, description='Top N:')

def shortlist(dept, seuil_val, topn_val):
    d = df_idf.copy()
    if dept != '(Tous)':
        d = d[d['INSEE_C'].str.startswith(dept)]
    d = d[d['loy_num'] <= seuil_val].copy()
    # agrégation commune si colonne existante
    comm_col = None
    for c in ['commune', 'nom_commune', 'NOM_COM', 'LIBGEO', 'LIBELLE_COMMUNE']:
        if c in d.columns:
            comm_col = c; break
    if comm_col is None:
        # regroupe sur INSEE
        grp = d.groupby('INSEE_C').agg(nb=('loy_num','count'), loy_med=('loy_num','median')).reset_index().sort_values('loy_med')
    else:
        grp = d.groupby(comm_col).agg(nb=('loy_num','count'), loy_med=('loy_num','median')).reset_index().sort_values('loy_med')
    display(grp.head(topn_val))
    return grp.head(topn_val)

out = widgets.interactive_output(shortlist, {'dept':dept_dd, 'seuil_val':seuil, 'topn_val':topn})
display(HBox([dept_dd, seuil, topn]))
display(out)


## 5) Exporter les résultats filtrés
Tu peux sauvegarder la shortlist (CSV) sur disque.


In [None]:

# Fonction d'export
def export_shortlist(df_short, fname='shortlist_loyers_idf.csv'):
    outp = os.path.join('data','clean', fname)
    os.makedirs(os.path.dirname(outp), exist_ok=True)
    df_short.to_csv(outp, index=False, encoding='utf-8-sig')
    print('Exporté →', outp)

# Exemple d'usage :
# grp = shortlist('(Tous)', mean_val, 20)
# export_shortlist(grp)
print('Cellule prête pour export.')


---

### Import automatique de `04_accessibilite_gare_IDF.ipynb`

# 04 — Analyse Accessibilité des gares

Notebook pour explorer `accessibilite-en-gare.csv` et produire :
- filtrage par niveau d'accessibilité
- stats par département
- top gares/communes accessibles
- widgets interactifs


In [None]:

import os, warnings
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets, HBox, VBox
warnings.filterwarnings('ignore')

# chemins par défaut (modifiables)
default_paths = [
    os.path.join("data","raw","accessibilite-en-gare.csv"),
    os.path.join("..","data","raw","accessibilite-en-gare.csv"),
    r"C:/Users/Victor/DataScience/Projet-Data-science-Investissement-immobilier/accessibilite-en-gare.csv"
]

csv_path = None
for p in default_paths:
    if os.path.exists(p):
        csv_path = p; break

print('Chemin utilisé pour accessibilite:', csv_path or "Aucun fichier trouvé automatiquement — modifiez csv_path si besoin")


## Chargement et typage

In [None]:

if csv_path is None:
    raise FileNotFoundError("Aucun fichier accessibilite-en-gare.csv trouvé. Déposez le fichier dans data/raw/")

df_raw = pd.read_csv(csv_path, sep=';', engine='python', dtype=str)
print("Taille initiale:", df_raw.shape)
display(df_raw.head(3))
print("Colonnes:", list(df_raw.columns))


In [None]:

# conversion accessibility_level_id
col_acc = None
for c in df_raw.columns:
    if 'accessibility_level' in c.lower() or 'accessibilit' in c.lower():
        col_acc = c; break
if col_acc is None:
    raise ValueError("Impossible de trouver la colonne d'accessibilité dans le fichier.")

df_raw[col_acc] = pd.to_numeric(df_raw[col_acc], errors='coerce')
print('Colonne accessibilité détectée :', col_acc)
df = df_raw.copy()


## Filtrage — niveau minimal d'accessibilité

In [None]:

# distribution
print("Distribution niveaux accessibilité:")
display(df[col_acc].value_counts().sort_index())

# Par défaut, on garde niveaux >=3
df_filtered = df[df[col_acc] >= 3].copy()
print("Gares avec niveau >=3 :", len(df_filtered))


## Visualisations (Matplotlib)
- Histogramme des niveaux
- Top communes par nombre de gares accessibles


In [None]:

# histogramme des niveaux (toutes gares)
plt.figure()
plt.hist(df[col_acc].dropna().astype(int), bins=range(int(df[col_acc].min()), int(df[col_acc].max())+2))
plt.title('Histogramme des niveaux d accessibilité')
plt.xlabel('Niveau accessibilité'); plt.ylabel('Nombre de gares')
plt.show()


In [None]:

# Top communes avec le plus de gares accessibles (colonne commune possible)
comm_col = None
for c in ['stop_name','commune','nom_commune','town','locality']:
    if c in df.columns:
        comm_col = c; break
if comm_col is None:
    # fallback sur stop_point_id ou autre
    comm_col = 'stop_point_id'

top_comm = df_filtered.groupby(comm_col).size().reset_index(name='count').sort_values('count', ascending=False).head(20)
display(top_comm)
plt.figure()
plt.bar(top_comm[comm_col].astype(str), top_comm['count'])
plt.title('Top gares / commune (niveau >=3)')
plt.xlabel('Commune/Gare'); plt.ylabel('Nombre de gares')
plt.xticks(rotation=45, ha='right')
plt.show()


## Widgets interactifs
- Slider niveau minimal (1–5)
- Filtre texte sur nom de gare / commune
- Export CSV des gares filtrées


In [None]:

level_slider = widgets.IntSlider(value=3, min=int(df[col_acc].min()), max=int(df[col_acc].max()), step=1, description='Niveau >=')
text_filter = widgets.Text(value='', description='Filtre nom:')
topn = widgets.IntSlider(value=20, min=5, max=200, step=5, description='Top N:')

def explore(level_min, filt_text, topn_val):
    d = df.copy()
    d = d[d[col_acc] >= level_min]
    if filt_text.strip():
        mask = d.apply(lambda row: filt_text.lower() in ' '.join([str(row.get(c,'')) for c in d.columns if isinstance(row.get(c,''), str)]).lower(), axis=1)
        d = d[mask]
    # agrégation par commune/gare
    grp = d.groupby(comm_col).size().reset_index(name='count').sort_values('count', ascending=False)
    display(grp.head(topn_val))
    print(f"Total gares correspondant: {len(d)}")
    return d, grp

out = widgets.interactive_output(explore, {'level_min': level_slider, 'filt_text': text_filter, 'topn_val': topn})
display(HBox([level_slider, text_filter, topn]))
display(out)


## Export des gares filtrées

In [None]:

def export_gare(df_out, fname='gares_accessibles_filtered.csv'):
    outp = os.path.join('data','clean', fname)
    os.makedirs(os.path.dirname(outp), exist_ok=True)
    df_out.to_csv(outp, index=False, encoding='utf-8-sig')
    print('Exporté →', outp)

# Exemple:
# d, grp = explore(3, '', 20)
# export_gare(d)
print('Cellule prête pour export.')
