# Dashboard Investissement — Notebook (version v2)

Notebook unifié inspiré de `dashboard_app_pro_v2.py` avec :
- **Sidebar scrollable** fiable (molette fonctionnelle)
- **Sélecteur Top N** dans l’onglet *Top Communes*
- **Info-bulles au survol** sur les graphiques via **mplcursors**
- Filtres : surface, budget (k€), prix/m², zone, département, années, loyer €/m², charges %, rendement min %, outliers

⚙️ Pré-requis (exécuter une fois si besoin) :
```bash
python -m pip install ipywidgets mplcursors
jupyter nbextension enable --py widgetsnbextension
```
Et active l’affichage interactif Matplotlib :
```python
%matplotlib widget
```
> **Note** : Ce notebook cherche `data_cleaner_advanced.py` dans le même dossier.

In [2]:

import warnings, os, sys, io
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as W
from IPython.display import display, HTML

warnings.filterwarnings('ignore')

# Optional dependency: hover tooltips are only wired when mplcursors imports.
# The broad catch is deliberate (best effort): any import-time failure of this
# optional package must not prevent the dashboard from loading.
try:
    import mplcursors
except Exception as e:
    # The rendering code tests `if mplcursors:` before attaching any cursor.
    mplcursors = None
    print('⚠️ mplcursors non disponible (hover tooltips désactivés):', e)


⚠️ mplcursors non disponible (hover tooltips désactivés): No module named 'mplcursors'


In [3]:
# Load the datasets through the advanced cleaning module.
try:
    import data_cleaner_advanced as dca
except Exception as e:
    # Chain explicitly so the underlying import failure is reported as the
    # direct cause of this error.
    raise ImportError('data_cleaner_advanced.py introuvable: ' + str(e)) from e

# Raw and cleaned data directories, given as relative paths.
RAW_DIR = '../Projet-Data-science-Investissement-immobilier/data/raw'
CLEAN_DIR = '../Projet-Data-science-Investissement-immobilier/data/clean'

df_unifie, df_loyers, df_gares = dca.quick_load_advanced(
    raw_dir=RAW_DIR,
    clean_dir=CLEAN_DIR,
    force_refresh=False
)
# Cell result: number of rows and the first 12 column names.
len(df_unifie), df_unifie.columns.tolist()[:12]

🧹 NETTOYAGE AVANCÉ DES DONNÉES



FileNotFoundError: ❌ Aucun fichier DVF trouvé dans c:\Users\KenziLali\OneDrive\iCloudDrive\Etudes\ING\ING4\S7\Data science\Projet-Data-science-Investissement-immobilier\Projet-Data-science-Investissement-immobilier\data\raw

In [None]:
# Investor persona: default parameters reused by the widgets and the charts.
# Amounts are presumably in euros and surfaces in m² -- TODO confirm.
PERSONA = dict(
    nom='Manager IT',
    budget_max=200000,
    apport=50000,
    surface_min=15,
    surface_max=65,
    objectif_rendement_net=4.5,
)

def clip_q(series, qlow=0.01, qhigh=0.99):
    """Clip *series* to the interval spanned by two of its own quantiles.

    Values are first coerced to numeric (unparseable entries become NaN) and
    NaN entries are dropped; what remains is clipped to the
    [qlow, qhigh] quantile bounds.

    Args:
        series: Values to clip.
        qlow: Lower quantile (0.01 by default).
        qhigh: Upper quantile (0.99 by default).

    Returns:
        The cleaned series with values clipped to the quantile bounds, or the
        empty cleaned series when no numeric value is left.
    """
    numeric = pd.to_numeric(series, errors='coerce').dropna()
    if numeric.empty:
        return numeric
    bounds = numeric.quantile([qlow, qhigh])
    return numeric.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

def ensure_geo(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the derived geographic columns filled in.

    - ``code_departement`` is built from the first two characters of
      ``code_postal`` (as text) when the former is absent and the latter
      present.
    - ``zone_geo`` is mapped from ``code_departement`` when absent; codes
      outside the eight listed departments map to NaN.

    Columns that already exist are left untouched and the input frame is
    never modified.
    """
    out = df.copy()

    needs_dept = 'code_departement' not in out.columns
    if needs_dept and 'code_postal' in out.columns:
        postal_text = out['code_postal'].astype(str)
        out['code_departement'] = postal_text.str[:2]

    needs_zone = 'zone_geo' not in out.columns
    if needs_zone and 'code_departement' in out.columns:
        zone_by_dept = {'75': 'Paris'}
        zone_by_dept.update(dict.fromkeys(('92', '93', '94'), 'Petite Couronne'))
        zone_by_dept.update(dict.fromkeys(('77', '78', '91', '95'), 'Grande Couronne'))
        out['zone_geo'] = out['code_departement'].map(zone_by_dept)

    return out

def compute_yields(d: pd.DataFrame, loyer_m2: float, charges_pct: float) -> pd.DataFrame:
    """Add gross and net rental yield columns to a copy of *d*.

    The yields are percentages of the purchase price:
    ``rendement_brut = loyer_m2 * surface * 12 / valeur_fonciere * 100`` and
    ``rendement_net`` is the same with the rent reduced by ``charges_pct``.

    Args:
        d: Transactions. ``valeur_fonciere``, ``surface_reelle_bati`` and
            ``prix_m2`` are coerced to numeric when present.
        loyer_m2: Monthly rent per m².
        charges_pct: Share of the rent lost to charges, as a fraction
            (0.25 for 25 %).

    Returns:
        A new DataFrame; the input is never modified. When either
        ``valeur_fonciere`` or ``surface_reelle_bati`` is missing, the copy is
        returned without yield columns (callers already test for
        ``rendement_net``) instead of raising ``KeyError``.
    """
    d = d.copy()
    for c in ['valeur_fonciere', 'surface_reelle_bati', 'prix_m2']:
        if c in d.columns:
            d[c] = pd.to_numeric(d[c], errors='coerce')

    required = ('valeur_fonciere', 'surface_reelle_bati')
    if any(col not in d.columns for col in required):
        return d

    loyer_annuel = loyer_m2 * d['surface_reelle_bati'] * 12
    loyer_net = loyer_annuel * (1 - charges_pct)
    d['rendement_brut'] = (loyer_annuel / d['valeur_fonciere']) * 100
    d['rendement_net'] = (loyer_net / d['valeur_fonciere']) * 100
    # A zero purchase price yields +/-inf; treat those as missing values.
    d.replace([np.inf, -np.inf], np.nan, inplace=True)
    return d

def apply_filters(df_base: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """Apply the dashboard filters to *df_base* and attach rental yields.

    Steps, in order (each column-based step is skipped when its column is
    absent):

    1. derive the geographic columns with ``ensure_geo``;
    2. inclusive range filters on surface, total price (``cfg['p_total']``
       is multiplied by 1000), price per m² and year;
    3. optional zone / department equality filters (the ``'(Toutes)'`` and
       ``'(Tous)'`` values disable them);
    4. when ``cfg['outliers']`` is set and more than 50 rows remain, keep
       ``prix_m2`` within ``[Q1 - 2*IQR, Q3 + 2*IQR]``;
    5. compute yields with ``compute_yields`` (``cfg['charges']`` is a
       percentage) and keep rows with ``rendement_net >= cfg['rdt_min']``;
    6. when more than 50 rows remain, keep ``prix_m2`` within its own 1 %-99 %
       quantiles.

    Args:
        df_base: Unfiltered transactions.
        cfg: Configuration dict with the keys ``surf``, ``p_total``, ``p_m2``,
            ``annees``, ``zone``, ``dept``, ``outliers``, ``loyer``,
            ``charges`` and ``rdt_min``.

    Returns:
        The filtered DataFrame.
    """
    frame = ensure_geo(df_base)

    # --- inclusive range filters ---
    if 'surface_reelle_bati' in frame:
        surf_ok = frame['surface_reelle_bati'].between(cfg['surf'][0], cfg['surf'][1])
        frame = frame[surf_ok]
    if 'valeur_fonciere' in frame:
        budget_ok = frame['valeur_fonciere'].between(
            cfg['p_total'][0] * 1000, cfg['p_total'][1] * 1000
        )
        frame = frame[budget_ok]
    if 'prix_m2' in frame:
        pm2_ok = frame['prix_m2'].between(cfg['p_m2'][0], cfg['p_m2'][1])
        frame = frame[pm2_ok]
    if 'annee' in frame:
        year_ok = frame['annee'].between(cfg['annees'][0], cfg['annees'][1])
        frame = frame[year_ok]

    # --- categorical filters ---
    if cfg['zone'] != '(Toutes)' and 'zone_geo' in frame:
        frame = frame[frame['zone_geo'] == cfg['zone']]
    if cfg['dept'] != '(Tous)' and 'code_departement' in frame:
        frame = frame[frame['code_departement'] == cfg['dept']]

    # --- optional IQR-based outlier removal on price per m² ---
    if cfg['outliers'] and 'prix_m2' in frame and len(frame) > 50:
        q_low = frame['prix_m2'].quantile(0.25)
        q_high = frame['prix_m2'].quantile(0.75)
        spread = q_high - q_low
        frame = frame[frame['prix_m2'].between(q_low - 2 * spread, q_high + 2 * spread)]

    # --- yields and minimum net yield ---
    frame = compute_yields(frame, cfg['loyer'], cfg['charges'] / 100)
    if 'rendement_net' in frame:
        frame = frame[frame['rendement_net'] >= cfg['rdt_min']]

    # --- final 1 %-99 % trim on price per m² ---
    if 'prix_m2' in frame and len(frame) > 50:
        trim_lo, trim_hi = frame['prix_m2'].quantile([0.01, 0.99])
        frame = frame[frame['prix_m2'].between(trim_lo, trim_hi)]

    return frame


In [None]:
# Widgets (sidebar + sections). The sidebar is made scrollable by giving the
# VBox a maximum height together with overflow='auto'.
# The surface default comes from PERSONA so it always matches what on_reset()
# restores.
w_surface  = W.IntRangeSlider(description='Surface (m²)', min=10, max=200, value=[PERSONA['surface_min'], PERSONA['surface_max']], step=1, layout=W.Layout(width='95%'))
w_total    = W.IntRangeSlider(description='Budget (k€)', min=30, max=500, value=[50,200], step=1, layout=W.Layout(width='95%'))
w_pm2      = W.IntRangeSlider(description='Prix/m² (€)', min=1500, max=20000, value=[3000,12000], step=100, layout=W.Layout(width='95%'))
w_zone     = W.Dropdown(description='Zone', options=['(Toutes)','Paris','Petite Couronne','Grande Couronne'], value='(Toutes)')
w_dept     = W.Dropdown(description='Dépt', options=['(Tous)','75','77','78','91','92','93','94','95'], value='(Tous)')
w_loyer    = W.FloatSlider(description='Loyer €/m²', min=10, max=50, step=0.5, value=22)
w_charges  = W.FloatSlider(description='Charges %', min=0, max=40, step=1, value=25)
w_rdt_min  = W.FloatSlider(description='Rdt min %', min=0, max=10, step=0.1, value=PERSONA['objectif_rendement_net'])
w_years    = W.IntRangeSlider(description='Années', min=2019, max=2025, value=[2019,2025])
w_outliers = W.Checkbox(description='Supprimer outliers (IQRx2)', value=True)

# Top N control; it is displayed next to the "Top Communes" heading rather
# than in the sidebar.
w_topn     = W.IntSlider(description='Top N', min=5, max=100, step=1, value=15)

btn_apply  = W.Button(description='Appliquer', button_style='success')
btn_reset  = W.Button(description='Reset', button_style='warning')
btn_export = W.Button(description='Exporter CSV')

sidebar = W.VBox([
    W.HTML('<h3>🎛️ Filtres</h3>'),
    w_surface, w_total, w_pm2,
    w_zone, w_dept,
    w_loyer, w_charges, w_rdt_min,
    w_years, w_outliers,
    W.HBox([btn_apply, btn_reset, btn_export])
], layout=W.Layout(width='28%', overflow='auto', max_height='600px'))


In [None]:
# One Output area per dashboard section; each render_* function clears and
# refills its own area.
out_kpi   = W.Output()
out_top   = W.Output()
out_price = W.Output()
out_rdt   = W.Output()
out_dept  = W.Output()

# Two-column layout: filter sidebar on the left, stacked sections on the right.
display(W.HBox([sidebar, W.VBox([
    out_kpi,
    W.HBox([W.HTML('<h3>🏆 Top Communes</h3>'), W.Box([w_topn], layout=W.Layout(margin='0 0 0 20px'))]), out_top,
    W.HTML('<h3>💰 Analyse Prix</h3>'), out_price,
    W.HTML('<h3>🎯 Rendement</h3>'), out_rdt,
    W.HTML('<h3>🗺️ Carte</h3>'), out_dept
], layout=W.Layout(width='72%'))]))


In [None]:
def current_cfg():
    """Snapshot the current widget values into a plain configuration dict.

    Range sliders are converted to ``(low, high)`` tuples; every other
    widget contributes its raw ``value``.
    """
    return dict(
        surf=tuple(w_surface.value),
        p_total=tuple(w_total.value),
        p_m2=tuple(w_pm2.value),
        zone=w_zone.value,
        dept=w_dept.value,
        loyer=w_loyer.value,
        charges=w_charges.value,
        rdt_min=w_rdt_min.value,
        annees=tuple(w_years.value),
        outliers=w_outliers.value,
        topn=w_topn.value,
    )

def render_overview(d):
    """Render the KPI cards and the price-per-m² histogram into ``out_kpi``.

    Args:
        d: Filtered transactions. ``prix_m2``, ``rendement_net`` and
            ``surface_reelle_bati`` are optional: a missing column gives NaN
            in its KPI card, and the histogram is skipped when ``prix_m2``
            is absent.
    """
    out_kpi.clear_output(wait=True)
    with out_kpi:
        n_rows = len(d)
        prix_m2_med = d['prix_m2'].median() if 'prix_m2' in d.columns else np.nan
        rdt_med = d['rendement_net'].median() if 'rendement_net' in d.columns else np.nan
        surf_med = d['surface_reelle_bati'].median() if 'surface_reelle_bati' in d.columns else np.nan
        # Space as thousands separator, applied to the numbers only so the
        # surrounding markup can never be altered by the replacement.
        n_txt = f"{n_rows:,}".replace(',', ' ')
        prix_txt = f"{prix_m2_med:,.0f}".replace(',', ' ')
        display(HTML(f"""
        <div style='display:flex;gap:12px'>
          <div style='flex:1;background:#1e293b;color:white;padding:12px;border-radius:10px'><div>Transactions</div><div style='font-size:26px;font-weight:700'>{n_txt}</div></div>
          <div style='flex:1;background:#0ea5e9;color:white;padding:12px;border-radius:10px'><div>Prix/m² médian</div><div style='font-size:26px;font-weight:700'>{prix_txt} €</div></div>
          <div style='flex:1;background:#f59e0b;color:white;padding:12px;border-radius:10px'><div>Rendement net médian</div><div style='font-size:26px;font-weight:700'>{rdt_med:.2f} %</div></div>
          <div style='flex:1;background:#ef4444;color:white;padding:12px;border-radius:10px'><div>Surface médiane</div><div style='font-size:26px;font-weight:700'>{surf_med:.0f} m²</div></div>
        </div>
        """))

        # Histogram of price per m² (clipped to 1-99 %) with hover tooltips.
        if 'prix_m2' in d.columns:
            p = clip_q(d['prix_m2'])
            # Close the figure left by the previous render: with an
            # interactive backend figures stay alive until closed, so each
            # refresh would otherwise leak one. No-op when none exists.
            fig_name = 'dashboard-overview'
            plt.close(fig_name)
            fig, ax = plt.subplots(figsize=(8, 3), num=fig_name)
            counts, bins, patches = ax.hist(p, bins=40)
            if len(p):
                med = p.median()
                ax.axvline(med, linestyle='--', color='r', lw=2)
            ax.set_xlabel('Prix/m² (€)')
            ax.set_ylabel('Fréquence')
            ax.set_title('Distribution prix/m² (1–99%)')
            if mplcursors:
                cur = mplcursors.cursor(patches, hover=True)

                @cur.connect('add')
                def _on_add(sel):
                    # Map the hovered x position back to its bin index.
                    ind = np.searchsorted(bins, sel.target[0]) - 1
                    ind = max(0, min(ind, len(counts) - 1))
                    sel.annotation.set_text(f"Bin: {bins[ind]:.0f}–{bins[ind+1]:.0f} €\nCount: {int(counts[ind])}")
            plt.show()

def render_top_communes(d, topn):
    """Render the table of the *topn* communes by median net yield.

    Rows are grouped by ``(nom_commune, code_postal)``; each group reports
    its transaction count and the medians of price per m², surface, total
    price and net yield, sorted by median net yield (descending).

    Args:
        d: Filtered transactions.
        topn: Number of communes to display.
    """
    out_top.clear_output(wait=True)
    with out_top:
        needed = ('nom_commune', 'code_postal', 'prix_m2',
                  'surface_reelle_bati', 'valeur_fonciere', 'rendement_net')
        # A missing grouping/aggregation column is reported like missing
        # yield data instead of raising KeyError inside the output area.
        if d.empty or any(col not in d.columns for col in needed):
            display(HTML('<em>Données de rendement non disponibles</em>'))
            return
        top = (d.groupby(['nom_commune', 'code_postal'], as_index=False)
               .agg(nb=('prix_m2', 'count'), prix_m2_med=('prix_m2', 'median'),
                    surf_med=('surface_reelle_bati', 'median'), prix_med=('valeur_fonciere', 'median'),
                    rdt_net=('rendement_net', 'median'))
               .sort_values('rdt_net', ascending=False)
               .head(topn))
        display(top.style.format({'prix_m2_med': '{:.0f}', 'surf_med': '{:.0f}', 'prix_med': '{:.0f}', 'rdt_net': '{:.2f}'}))

def render_prix(d):
    """Render the price analysis figure into ``out_price``.

    Left panel: scatter of price per m² against surface, limited to 3000
    points sampled with a fixed seed. Right panel: histogram of ``prix_m2``
    clipped by ``clip_q``, with the median marked. Hover tooltips are
    attached when mplcursors is available.

    Args:
        d: Filtered transactions. ``surface_reelle_bati`` and ``prix_m2``
            are required to draw anything; ``nom_commune``, ``code_postal``
            and ``rendement_net`` are optional and only feed the tooltips.
    """
    out_price.clear_output(wait=True)
    with out_price:
        required = ('surface_reelle_bati', 'prix_m2')
        # Missing columns are handled like an empty frame; otherwise
        # dropna(subset=...) below would raise KeyError.
        if d.empty or any(col not in d.columns for col in required):
            display(HTML('<em>Aucune donnée</em>'))
            return
        ds = d.dropna(subset=list(required)).copy()
        if len(ds) > 3000:
            ds = ds.sample(3000, random_state=42)

        # Close the figure left by the previous render so refreshes do not
        # accumulate open figures (no-op when none exists).
        fig_name = 'dashboard-prix'
        plt.close(fig_name)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), num=fig_name)
        sc = ax1.scatter(ds['surface_reelle_bati'], ds['prix_m2'], s=12, alpha=0.6)
        ax1.set_xlabel('Surface (m²)')
        ax1.set_ylabel('Prix/m² (€)')
        ax1.set_title('Prix/m² vs Surface')

        if mplcursors:
            # Positional arrays aligned with the scatter points.
            xs = ds['surface_reelle_bati'].to_numpy()
            ys = ds['prix_m2'].to_numpy()
            communes = ds.get('nom_commune', pd.Series(['?'] * len(ds))).astype(str).to_numpy()
            cps = ds.get('code_postal', pd.Series([''] * len(ds))).astype(str).to_numpy()
            rdt = ds.get('rendement_net', pd.Series([np.nan] * len(ds))).to_numpy()
            cur = mplcursors.cursor(sc, hover=True)

            @cur.connect('add')
            def _on_add(sel):
                i = sel.index
                parts = [f"{communes[i]} ({cps[i]})", f"Surface: {xs[i]:.0f} m²", f"Prix/m²: {ys[i]:,.0f} €".replace(',', ' ')]
                if not np.isnan(rdt[i]):
                    parts.append(f"Rdt net: {rdt[i]:.2f}%")
                sel.annotation.set_text('\n'.join(parts))

        p = clip_q(d['prix_m2'])
        if len(p):
            counts, bins, patches = ax2.hist(p, bins=40)
            ax2.set_title('Distribution des prix/m² (1–99%)')
            ax2.set_xlabel('Prix/m² (€)')
            ax2.set_ylabel('Fréquence')
            med = p.median()
            ax2.axvline(med, linestyle='--', color='r', lw=2)
            if mplcursors:
                cur2 = mplcursors.cursor(patches, hover=True)

                @cur2.connect('add')
                def _h(sel):
                    # Map the hovered x position back to its bin index.
                    ind = np.searchsorted(bins, sel.target[0]) - 1
                    ind = max(0, min(ind, len(counts) - 1))
                    sel.annotation.set_text(f"Bin: {bins[ind]:.0f}–{bins[ind+1]:.0f} €\nCount: {int(counts[ind])}")
        plt.show()

def render_rdt(d):
    """Render the net-yield figure into ``out_rdt``.

    Left panel: histogram of ``rendement_net`` (values capped at 12) with
    the median (dashed red) and the persona target (dotted orange).
    Right panel: median net yield per ``zone_geo`` as horizontal bars, drawn
    only when that column exists.

    Args:
        d: Filtered transactions; a message is shown instead of the figure
            when ``rendement_net`` is absent or entirely NaN.
    """
    out_rdt.clear_output(wait=True)
    with out_rdt:
        if 'rendement_net' not in d.columns or not d['rendement_net'].notna().any():
            display(HTML('<em>Données rendement indisponibles</em>'))
            return
        r = d['rendement_net'].clip(upper=12).dropna()

        # Close the figure left by the previous render so refreshes do not
        # accumulate open figures (no-op when none exists).
        fig_name = 'dashboard-rendement'
        plt.close(fig_name)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), num=fig_name)
        counts, bins, patches = ax1.hist(r, bins=40)
        ax1.axvline(r.median(), linestyle='--', color='r', lw=2)
        ax1.axvline(PERSONA['objectif_rendement_net'], linestyle=':', color='orange', lw=2)
        ax1.set_title('Distribution rendements nets')
        ax1.set_xlabel('%')
        ax1.set_ylabel('Fréquence')
        if mplcursors:
            cur = mplcursors.cursor(patches, hover=True)

            @cur.connect('add')
            def _h(sel):
                # Map the hovered x position back to its bin index.
                ind = np.searchsorted(bins, sel.target[0]) - 1
                ind = max(0, min(ind, len(counts) - 1))
                sel.annotation.set_text(f"Bin: {bins[ind]:.2f}–{bins[ind+1]:.2f}%\nCount: {int(counts[ind])}")

        if 'zone_geo' in d.columns:
            # Order small-to-large for nice horizontal bars.
            z = d.groupby('zone_geo')['rendement_net'].median().sort_values(ascending=True)
            bars = ax2.barh(z.index, z.values)
            ax2.set_title('Rendement par zone')
            ax2.set_xlabel('%')
            if mplcursors:
                cur2 = mplcursors.cursor(bars, hover=True)

                @cur2.connect('add')
                def _b(sel):
                    i = sel.index
                    sel.annotation.set_text(f"{z.index[i]}: {z.values[i]:.2f}%")
        plt.show()

def render_dept(d):
    """Render the per-department comparison into ``out_dept``.

    Departments are ranked by median net yield (or by transaction count when
    ``rendement_net`` is absent) and the first 8 are kept. Left panel: median
    price per m². Right panel: median net yield, or transaction volume when
    no yield is available.

    Args:
        d: Filtered transactions; a message is shown instead of the figure
            when ``code_departement`` is absent or entirely NaN.
    """
    out_dept.clear_output(wait=True)
    with out_dept:
        if 'code_departement' not in d.columns or d['code_departement'].isna().all():
            display(HTML('<em>Code département indisponible</em>'))
            return
        g = d.groupby('code_departement')
        prix_med = g['prix_m2'].median()
        rdt_med = g['rendement_net'].median() if 'rendement_net' in d.columns else None
        tab = pd.DataFrame({'prix_med': prix_med, 'nb': g['prix_m2'].count()})
        if rdt_med is not None:
            tab['rdt_med'] = rdt_med
        if tab.empty:
            display(HTML('<em>Aucune donnée</em>'))
            return
        tab = tab.sort_values('rdt_med' if 'rdt_med' in tab.columns else 'nb', ascending=False).head(8)

        # Close the figure left by the previous render so refreshes do not
        # accumulate open figures (no-op when none exists).
        fig_name = 'dashboard-departements'
        plt.close(fig_name)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), num=fig_name)

        # Each tooltip callback reads its own array. Closures look names up
        # when they run, so sharing one rebindable `vals` name between the
        # panels made the price tooltips show the other panel's values.
        dept_labels = tab.index.astype(str).values

        b1 = ax1.barh(dept_labels, tab['prix_med'])
        ax1.set_title('Prix médian par département')
        ax1.set_xlabel('€/m²')
        ax1.invert_yaxis()
        if mplcursors:
            prix_vals = tab['prix_med'].values
            cur1 = mplcursors.cursor(b1, hover=True)

            @cur1.connect('add')
            def _c1(sel):
                i = sel.index
                sel.annotation.set_text(f"Dpt {dept_labels[i]} : {prix_vals[i]:,.0f} €".replace(',', ' '))

        if 'rdt_med' in tab.columns:
            b2 = ax2.barh(dept_labels, tab['rdt_med'])
            ax2.set_title('Rendement par département')
            ax2.set_xlabel('%')
            ax2.invert_yaxis()
            if mplcursors:
                rdt_vals = tab['rdt_med'].values
                cur2 = mplcursors.cursor(b2, hover=True)

                @cur2.connect('add')
                def _c2(sel):
                    i = sel.index
                    sel.annotation.set_text(f"Dpt {dept_labels[i]} : {rdt_vals[i]:.2f}%")
        else:
            b2 = ax2.barh(dept_labels, tab['nb'])
            ax2.set_title('Volume par département')
            ax2.set_xlabel('Nb')
            ax2.invert_yaxis()
            if mplcursors:
                nb_vals = tab['nb'].values
                cur3 = mplcursors.cursor(b2, hover=True)

                @cur3.connect('add')
                def _c3(sel):
                    i = sel.index
                    sel.annotation.set_text(f"Dpt {dept_labels[i]} : {int(nb_vals[i])} transactions")
        plt.show()

def render_all():
    """Re-filter the dataset with the current widget values and redraw every section."""
    settings = current_cfg()
    filtered = apply_filters(df_unifie, settings)
    render_overview(filtered)
    render_top_communes(filtered, settings['topn'])
    # The remaining sections only need the filtered frame.
    for draw_section in (render_prix, render_rdt, render_dept):
        draw_section(filtered)

def on_apply(_):
    """Button callback: redraw the dashboard with the current filters."""
    render_all()
def on_reset(_):
    """Button callback: restore every filter to its default, then redraw."""
    defaults = (
        (w_surface, (PERSONA['surface_min'], PERSONA['surface_max'])),
        (w_total, (50, 200)),
        (w_pm2, (3000, 12000)),
        (w_zone, '(Toutes)'),
        (w_dept, '(Tous)'),
        (w_loyer, 22),
        (w_charges, 25),
        (w_rdt_min, PERSONA['objectif_rendement_net']),
        (w_years, (2019, 2025)),
        (w_outliers, True),
        (w_topn, 15),
    )
    # Assign in the listed order, then refresh once at the end.
    for widget, default in defaults:
        widget.value = default
    render_all()

def on_export(_):
    """Button callback: write the currently filtered rows to a CSV file.

    The file ``investissement_persona.csv`` is written to the current
    working directory, encoded as UTF-8 with BOM, without the index column.
    A confirmation showing the resolved path and the row count is displayed.
    """
    cfg = current_cfg()
    d = apply_filters(df_unifie, cfg)
    out_fp = Path('investissement_persona.csv')
    d.to_csv(out_fp, index=False, encoding='utf-8-sig')
    # Swap the thousands separator on the count only: doing it on the whole
    # message would also rewrite any comma contained in the resolved path.
    n_rows = f"{len(d):,}".replace(',', ' ')
    display(HTML(f"✅ Exporté: <b>{out_fp.resolve()}</b> ({n_rows} lignes)"))

# Wire each sidebar button to its callback.
btn_apply.on_click(on_apply)
btn_reset.on_click(on_reset)
btn_export.on_click(on_export)

# Initial render using the widgets' current (default) values.
render_all()