In [2]:
import numpy as np
import pandas as pd
from typing import List


In [3]:
# Load the source extract. low_memory=False makes pandas read the file in a
# single pass instead of chunk-by-chunk dtype inference.
chemin_source = "../data/df_vf_idf.csv"
df = pd.read_csv(chemin_source, low_memory=False)
# Last expression of the cell: preview of the first rows.
df.head()

Unnamed: 0,id_mutation,date_mutation,nature_mutation,valeur_fonciere,adresse_numero,adresse_nom_voie,code_postal,code_commune,nom_commune,code_departement,...,annee,mois,trimestre,annee_trimestre,prix_m2,quartier,arrondissement,quartier_detaille,prix_median_m2,ecart_prix_median_pct
0,2020-621368,2020-07-01,Vente,246600.0,40.0,SEN DES LONGUES RAIES,77670.0,77419,Saint-Mammès,77,...,2020,7,3,2020-T3,2348.571429,Saint-Mammès (77670.0),,Saint-Mammès (77670.0),2502.013809,-6.132755
1,2020-783846,2020-07-01,Vente,184520.0,98.0,AV HENRI BARBUSSE,94240.0,94038,L'Haÿ-les-Roses,94,...,2020,7,3,2020-T3,3417.037037,L'Haÿ-les-Roses (94240.0),,L'Haÿ-les-Roses (94240.0),4109.589041,-16.852099
2,2020-783826,2020-07-01,Vente,164000.0,101.0,AV ROUGET DE LISLE,94400.0,94081,Vitry-sur-Seine,94,...,2020,7,3,2020-T3,4315.789474,Vitry-sur-Seine (94400.0),,Vitry-sur-Seine (94400.0),4396.296296,-1.831242
3,2020-638280,2020-07-01,Vente,2000000.0,42.0,RUE SAINT LOUIS,78000.0,78646,Versailles,78,...,2020,7,3,2020-T3,10928.961749,Versailles (78000.0),,Versailles (78000.0),7068.965517,54.604825
4,2020-638279,2020-07-01,Vente en l'état futur d'achèvement,585000.0,,LE MANET,78180.0,78423,Montigny-le-Bretonneux,78,...,2020,7,3,2020-T3,4875.0,Montigny-le-Bretonneux (78180.0),,Montigny-le-Bretonneux (78180.0),4166.666667,17.0


In [4]:
# Baseline diagnostics on the raw frame: dimensions, then the number of
# missing values per column.
valeurs_manquantes_initiales = df.isna().sum()

print(f"Shape initiale: {df.shape}")
print("\n=== VALEURS MANQUANTES INITIALES ===")
print(valeurs_manquantes_initiales)

Shape initiale: (815674, 41)

=== VALEURS MANQUANTES INITIALES ===
id_mutation                       0
date_mutation                     0
nature_mutation                   0
valeur_fonciere                   0
adresse_numero                15643
adresse_nom_voie                  3
code_postal                       8
code_commune                      0
nom_commune                       0
code_departement                  0
id_parcelle                       0
lot1_numero                  254141
lot1_surface_carrez          499059
lot2_numero                  547700
lot2_surface_carrez          734068
lot3_numero                  784535
lot3_surface_carrez          809744
lot4_numero                  806111
lot4_surface_carrez          814399
lot5_numero                  812204
lot5_surface_carrez          815310
nombre_lots                       0
code_type_local                   0
type_local                        0
surface_reelle_bati               0
nombre_pieces_principales        

In [5]:
# Columns removed before modelling. Names that are absent from `df` are
# skipped silently because of errors='ignore'.
colonnes_a_supprimer = [
    'id_mutation',
    'adresse_numero',
    'id_parcelle',
    'lot1_numero',
    'lot2_numero',
    'lot3_numero',
    'lot4_numero',
    'lot5_numero',
    'lot1_surface_carrez',
    'lot2_surface_carrez',
    'lot3_surface_carrez',
    'lot4_surface_carrez',
    'lot5_surface_carrez',
    'arrondissement',
    'nature_culture',
    'nature_culture_speciale',
    'annee_trimestre',
]

# `drop` returns a new frame, so the raw `df` is left untouched.
df_clean = df.drop(colonnes_a_supprimer, axis='columns', errors='ignore')

print("\n=== VALEURS MANQUANTES APRÈS SUPPRESSION ===")
print(df_clean.isna().sum())


=== VALEURS MANQUANTES APRÈS SUPPRESSION ===
date_mutation                     0
nature_mutation                   0
valeur_fonciere                   0
adresse_nom_voie                  3
code_postal                       8
code_commune                      0
nom_commune                       0
code_departement                  0
nombre_lots                       0
code_type_local                   0
type_local                        0
surface_reelle_bati               0
nombre_pieces_principales         0
surface_terrain              562227
longitude                     11751
latitude                      11751
annee                             0
mois                              0
trimestre                         0
prix_m2                           0
quartier                          0
quartier_detaille                 0
prix_median_m2                17479
ecart_prix_median_pct         17479
dtype: int64


In [6]:
# Lookup table {quartier_detaille -> prix_median_m2}, built only from rows
# where the median is known; for each quartier the first such value is kept.
lignes_avec_mediane = df_clean.dropna(subset=['prix_median_m2'])
quartier_prix_median = (
    lignes_avec_mediane
    .groupby('quartier_detaille')['prix_median_m2']
    .first()
    .to_dict()
)


In [7]:
# Impute the missing `prix_median_m2` values (vectorised; replaces a per-row
# Python loop that did one scalar .loc write per missing row).
#   1. use the median already known for the same `quartier_detaille`
#      (looked up in `quartier_prix_median`);
#   2. when the quartier has no known median, fall back to the row's own
#      `prix_m2`.
mask_manquant = df_clean['prix_median_m2'].isna()
lignes_a_completer = df_clean.loc[mask_manquant, ['quartier_detaille', 'prix_m2']]

valeurs_imputees = (
    lignes_a_completer['quartier_detaille']
    .map(quartier_prix_median)   # quartiers absent from the dict map to NaN
    .astype('float64')           # keep a float result even if the key column is categorical or empty
    .fillna(lignes_a_completer['prix_m2'])
)
# Both sides share the index of the masked rows, so the assignment is aligned.
df_clean.loc[mask_manquant, 'prix_median_m2'] = valeurs_imputees

print(df_clean.isnull().sum())

date_mutation                     0
nature_mutation                   0
valeur_fonciere                   0
adresse_nom_voie                  3
code_postal                       8
code_commune                      0
nom_commune                       0
code_departement                  0
nombre_lots                       0
code_type_local                   0
type_local                        0
surface_reelle_bati               0
nombre_pieces_principales         0
surface_terrain              562227
longitude                     11751
latitude                      11751
annee                             0
mois                              0
trimestre                         0
prix_m2                           0
quartier                          0
quartier_detaille                 0
prix_median_m2                    0
ecart_prix_median_pct         17479
dtype: int64


In [8]:
# Relative gap, in percent, between a row's price per m² and the median
# price per m² stored on the same row.
prix_m2_observe = df_clean['prix_m2']
mediane_reference = df_clean['prix_median_m2']
df_clean['ecart_prix_median_pct'] = (
    (prix_m2_observe - mediane_reference) / mediane_reference * 100
)

In [9]:
# Type conversion.
# `ecart_prix_median_pct` is not recomputed here: the cell just above already
# derives it from `prix_m2` and `prix_median_m2`, and this cell used to open
# with a copy-pasted duplicate of that computation.
print("\n=== TYPES AVANT CONVERSION ===")
print(df_clean.dtypes)

df_clean['date_mutation'] = pd.to_datetime(df_clean['date_mutation'])

# Postal codes are read as floats (e.g. 77670.0). Convert them to 5-character
# zero-padded strings; missing values go through the sentinel 0 -> '00000'
# and are turned back into NaN on the next statement.
df_clean['code_postal'] = (
    df_clean['code_postal']
    .fillna(0)
    .astype(int)
    .astype(str)
    .str.zfill(5)
)
df_clean['code_postal'] = df_clean['code_postal'].replace('00000', np.nan)

# Nullable integer dtype for the property-type code; department as text.
df_clean['code_type_local'] = df_clean['code_type_local'].astype('Int64')
df_clean['code_departement'] = df_clean['code_departement'].astype(str)

# Text columns stored as pandas categoricals.
colonnes_categoriques = [
    'nature_mutation', 'type_local', 'adresse_nom_voie',
    'nom_commune', 'quartier', 'quartier_detaille'
]
for col in colonnes_categoriques:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')

# Month and quarter become ordered categoricals with a fixed set of levels.
df_clean['mois'] = pd.Categorical(
    df_clean['mois'],
    categories=range(1, 13),
    ordered=True
)
df_clean['trimestre'] = pd.Categorical(
    df_clean['trimestre'],
    categories=[1, 2, 3, 4],
    ordered=True
)

print("\n=== TYPES APRÈS CONVERSION ===")
print(df_clean.dtypes)


=== TYPES AVANT CONVERSION ===
date_mutation                 object
nature_mutation               object
valeur_fonciere              float64
adresse_nom_voie              object
code_postal                  float64
code_commune                   int64
nom_commune                   object
code_departement               int64
nombre_lots                    int64
code_type_local              float64
type_local                    object
surface_reelle_bati          float64
nombre_pieces_principales    float64
surface_terrain              float64
longitude                    float64
latitude                     float64
annee                          int64
mois                           int64
trimestre                      int64
prix_m2                      float64
quartier                      object
quartier_detaille             object
prix_median_m2               float64
ecart_prix_median_pct        float64
dtype: object

=== TYPES APRÈS CONVERSION ===
date_mutation                datet

In [10]:
# Temporal split on `annee`: the listed earlier years form the training set,
# the last year forms the test set. Rows from any other year are in neither.
annees_train = [2020, 2021, 2022, 2023, 2024]
annees_test = [2025]

# .copy() gives each split its own data, independent of df_clean.
train = df_clean.loc[df_clean['annee'].isin(annees_train)].copy()
test = df_clean.loc[df_clean['annee'].isin(annees_test)].copy()

In [11]:
# Separate the target (`prix_m2`) from the feature columns for both splits.
# NOTE(review): the features still include `prix_median_m2` and
# `ecart_prix_median_pct`, and the latter is computed from `prix_m2` itself,
# so the target can be reconstructed exactly from these two columns
# (presumably also from `valeur_fonciere` and the surface) — confirm that the
# modelling step drops them to avoid target leakage.
y_train = train['prix_m2']
y_test = test['prix_m2']
X_train = train.drop(columns=['prix_m2'])
X_test = test.drop(columns=['prix_m2'])


In [12]:
# Summary of the split: sizes, rows per year, X/y shapes and target stats.
n_total = len(df_clean)
n_train = len(train)
n_test = len(test)

print("=== TAILLES DES DATASETS ===")
print(f"Train: {n_train} lignes ({n_train/n_total*100:.1f}%)")
print(f"Test: {n_test} lignes ({n_test/n_total*100:.1f}%)")

# Number of rows per year in each split.
lignes_par_annee_train = train.groupby('annee').size()
lignes_par_annee_test = test.groupby('annee').size()

print("\n=== RÉPARTITION TEMPORELLE ===")
print("Train:")
print(lignes_par_annee_train)
print("\nTest:")
print(lignes_par_annee_test)

print("\n=== SHAPES X et y ===")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Mean and range of the target in each split.
print("\n=== TARGET (prix_m2) ===")
print(f"y_train - moyenne: {y_train.mean():.2f}€/m², min: {y_train.min():.2f}€, max: {y_train.max():.2f}€")
print(f"y_test - moyenne: {y_test.mean():.2f}€/m², min: {y_test.min():.2f}€, max: {y_test.max():.2f}€")


=== TAILLES DES DATASETS ===
Train: 764718 lignes (93.8%)
Test: 50956 lignes (6.2%)

=== RÉPARTITION TEMPORELLE ===
Train:
annee
2020     99419
2021    200469
2022    194002
2023    143996
2024    126832
dtype: int64

Test:
annee
2025    50956
dtype: int64

=== SHAPES X et y ===
X_train shape: (764718, 23)
y_train shape: (764718,)
X_test shape: (50956, 23)
y_test shape: (50956,)

=== TARGET (prix_m2) ===
y_train - moyenne: 5940.97€/m², min: 500.24€, max: 19998.89€
y_test - moyenne: 5817.65€/m², min: 501.35€, max: 19988.70€


In [13]:
# Write the splits and the cleaned frame to CSV, without the index column.
# Entries are written in the order they are listed.
exports_csv = {
    "../data/X_train.csv": X_train,
    "../data/X_test.csv": X_test,
    "../data/y_train.csv": y_train,
    "../data/y_test.csv": y_test,
    "../data/df_vf_idf_clean.csv": df_clean,
}
for chemin_sortie, objet in exports_csv.items():
    objet.to_csv(chemin_sortie, index=False)

