In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import os

# Base directory that contains the "Immo" folder. Leave empty to resolve the
# dataset relative to the current working directory.
path_to_file = ""

# os.path.join inserts the separator only when it is needed, so path_to_file
# works with or without a trailing slash (plain string concatenation built a
# wrong path for a value such as "data").
df = pd.read_excel(os.path.join(path_to_file, "Immo/Data/BD_resultat_tableau.xlsx"))



In [154]:
# Filter data: drop rows whose type_local is "Maison" and rows whose
# nature_mutation is "Vente en l'état futur d'achèvement".
is_not_house = df["type_local"] != "Maison"
is_not_future_sale = df["nature_mutation"] != "Vente en l'état futur d'achèvement"
df = df[is_not_house & is_not_future_sale]

In [155]:
# Discard every row that has at least one missing value in any column.
df = df.dropna(axis="index", how="any")

In [156]:
# Parse date_mutation as datetimes, then derive annee_mois as the string form
# of the corresponding monthly period. The second lambda sees the already
# converted date_mutation column because assign evaluates keywords in order.
df = df.assign(
    date_mutation=lambda frame: pd.to_datetime(frame["date_mutation"]),
    annee_mois=lambda frame: frame["date_mutation"].dt.to_period("M").astype(str),
)

In [157]:
# Average the numeric columns for each
# (code_postal, annee_mois, nature_mutation) combination.
group_keys = ['code_postal', 'annee_mois', 'nature_mutation']
averaged_columns = [
    'valeur_par_surface_bati',
    'nombre_pieces_principales',
    'surface_reelle_bati',
    'valeur_fonciere',
    'longitude',
    'latitude',
]

df_agg = (
    df.groupby(group_keys)
    .agg({column: 'mean' for column in averaged_columns})
    .reset_index()  # turn the group keys back into regular columns
)

In [158]:
# Target column to predict.
target = 'valeur_par_surface_bati'
y = df_agg[target]

# Features are every aggregated column except the target and valeur_fonciere
# (presumably excluded because it would leak the target -- confirm).
x = df_agg.drop(columns=[target, 'valeur_fonciere'])

In [172]:
# Hold out 25% of the aggregated rows as a test set, stratified on
# code_postal, with a fixed random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=df_agg["code_postal"],
)

In [173]:
# Column names split by dtype: non-numeric columns are treated as categorical,
# numeric columns are kept as numeric features.
categoric_features = x.select_dtypes(exclude=['number']).columns
numeric_features = x.select_dtypes(include=['number']).columns

In [174]:
# Preprocessing: standardise numeric columns and one-hot encode categorical
# ones (first level dropped, dense output, unseen categories ignored).
sc = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Transformer names ('cat', 'num') become prefixes of the output feature names.
column_steps = [
    ('cat', ohe, categoric_features),
    ('num', sc, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [175]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split. The tuple is evaluated left to right, so
# fit_transform runs before transform.
x_train_processed, x_test_processed = (
    preprocessor.fit_transform(x_train),
    preprocessor.transform(x_test),
)

In [None]:
# Hyper-parameter grid explored for every seed. It does not depend on the
# seed, so it is built once outside the loop.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [25, 30, 35],
    "min_samples_split": [15, 20, 25],
    "min_samples_leaf": [1, 2]
}

# Best result seen so far across all seeds. Scores are negated MAE, so higher
# is better and -inf is a safe starting point.
best_score = -float('inf')
best_seed = None
best_model = None
best_params = None

for seed in range(10, 20):
    rf = RandomForestRegressor(random_state=seed)

    # 5-fold cross-validated search over param_grid for this seed.
    grid_search = GridSearchCV(
        rf,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )

    grid_search.fit(x_train_processed, y_train)

    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_seed = seed
        best_model = grid_search.best_estimator_
        # Remember the parameters of the winning search. After the loop,
        # grid_search refers to the last seed tried, not the best one.
        best_params = grid_search.best_params_

print(f"Best random_state: {best_seed}, Best CV score: {best_score}, Best parameters: {best_params}")

# Author's earlier note on the best configuration found so far:
# random_state=2, max_depth=30, min_samples_leaf=1, min_samples_split=15, n_estimators=100


Best random_state: 11, Best CV score: -574.3892577754872, Best parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 25, 'n_estimators': 300}


In [176]:
# Train the final random forest with a fixed, hand-picked configuration.
rf_settings = dict(
    n_estimators=100,
    max_depth=30,
    min_samples_split=15,
    min_samples_leaf=1,
    random_state=2,
)
model = RandomForestRegressor(**rf_settings)
model.fit(x_train_processed, y_train)

In [177]:
# Predict on the training split first, then on the test split.
y_train_pred, y_test_pred = (
    model.predict(features)
    for features in (x_train_processed, x_test_processed)
)

In [178]:
# Evaluate: model.score is reported as R² on each split, and the mean absolute
# error is computed on the test predictions.
mae = mean_absolute_error(y_test, y_test_pred)
test_score = model.score(x_test_processed, y_test)
train_score = model.score(x_train_processed, y_train)

print(
    f"Train R² score: {train_score:.3f}",
    f"Test R² score: {test_score:.3f}",
    f"Mean Absolute Error: {mae:.2f} €/m²",
    sep="\n",
)


Train R² score: 0.944
Test R² score: 0.840
Mean Absolute Error: 558.32 €/m²


In [179]:
# Test-set targets next to their predictions, plus the identifying columns
# that are shown when hovering a point.
results_df = pd.DataFrame({
    'actual_prix_m2': y_test,
    'predicted_prix_m2': y_test_pred,
}).assign(
    code_postal=x_test['code_postal'].values,
    annee_mois=x_test['annee_mois'].values,
)

# Scatter of predicted against actual values.
axis_labels = {
    'predicted_prix_m2': 'Prix moyen €/m² prédit',
    'actual_prix_m2': 'Prix moyen €/m² réel',
}
fig = px.scatter(
    results_df,
    x='actual_prix_m2',
    y='predicted_prix_m2',
    hover_data=['code_postal', 'annee_mois'],
    title='Predicted vs Actual Price per m²',
    labels=axis_labels,
)

# Reference diagonal y = x spanning the observed range of the test target.
diagonal = [y_test.min(), y_test.max()]
fig.add_scatter(
    x=diagonal,
    y=diagonal,
    mode='lines',
    name='Perfect Prediction',
    line=dict(dash='dash', color='red'),
)
fig.show()

In [180]:
# Pair each transformed feature name with the forest's importance for it.
# The column is named "coefficients" but holds model.feature_importances_.
feature_importance = pd.DataFrame(
    dict(
        feature_names=preprocessor.get_feature_names_out(),
        coefficients=model.feature_importances_,
    )
)

In [181]:
# Bar chart of the five most important features.

# Display labels keyed by the names returned by
# preprocessor.get_feature_names_out(), which look like "num__code_postal"
# (transformer name, double underscore, column name).
feature_labels = {
    "num__code_postal": "Code Postal",
    "num__longitude": "Longitude",
    "num__latitude": "Latitude",
    "num__surface_reelle_bati": "Surface m²",
    "num__nombre_pieces_principales": "Nombre de pièces",
}

# Default head() keeps the first five rows of the descending sort.
top_features = feature_importance.sort_values("coefficients", ascending=False).head()

fig1 = px.bar(
    top_features,
    x="feature_names",
    y="coefficients",
    title="Random Forest Feature Importance for Price Prediction",
    labels={
           'coefficients': 'Coefficients',
           'feature_names': 'Features'
    }
)

# Take the tick positions from the plotted rows so they always match the bars.
# A feature without a display label falls back to its raw name.
plotted_names = top_features["feature_names"].tolist()
fig1.update_xaxes(
    tickvals=plotted_names,
    ticktext=[feature_labels.get(name, name) for name in plotted_names]
)


fig1.show()

In [182]:
# Importance table ordered from most to least important, with a fresh
# 0..n-1 index. The cell displays the top rows.
feature_importance_df = feature_importance.sort_values(
    by="coefficients", ascending=False, ignore_index=True
)

feature_importance_df.head()

Unnamed: 0,feature_names,coefficients
0,num__code_postal,0.738031
1,num__longitude,0.102572
2,num__latitude,0.053309
3,num__surface_reelle_bati,0.045577
4,num__nombre_pieces_principales,0.022205


In [183]:
# Persist the fitted model and the fitted preprocessor as two separate pickle
# files under ../src.
import pickle

artifacts = (
    ("../src/model.pkl", model),                # Save model
    ("../src/preprocessor.pkl", preprocessor),  # Save preprocessor
)

for destination, artifact in artifacts:
    with open(destination, "wb") as handle:
        pickle.dump(artifact, handle)

In [184]:
# Print the installed library versions on one line, separated by spaces.
import sklearn
import pandas
import numpy

print(" ".join(module.__version__ for module in (sklearn, pandas, numpy)))

1.6.1 2.2.3 2.1.3
