In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import os

# Locate the workbook relative to where the kernel is running.
# os.path.abspath("") resolves against the current working directory, and the
# project root is taken to be two directory levels above it.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder -- confirm before running from elsewhere.
working_dir = os.path.abspath("")
parent_dir = os.path.dirname(working_dir)
project_root = os.path.dirname(parent_dir)

data_path = os.path.join(project_root, "Data", "BD_resultat_tableau.xlsx")
df = pd.read_excel(data_path)


In [28]:
# Filter data: keep only rows whose type_local is not "Maison" AND whose
# nature_mutation is not "Vente en l'état futur d'achèvement".
is_not_house = df["type_local"] != "Maison"
is_not_off_plan = df["nature_mutation"] != "Vente en l'état futur d'achèvement"
df = df[is_not_house & is_not_off_plan]

In [29]:
# Drop missing values: discard every row that has at least one missing value
# in any column of the frame.
df = df.dropna(axis="index", how="any")

In [30]:
# Parse the transaction date once, then derive a monthly key from it.
# 'annee_mois' is the month period of the date rendered as a string; it is
# used further down as a grouping key.
mutation_dates = pd.to_datetime(df['date_mutation'])
df['date_mutation'] = mutation_dates
df['annee_mois'] = mutation_dates.dt.to_period("M").astype(str)

In [31]:
# Aggregate to one row per (postal code, month, transaction type) and take the
# mean of every numeric column listed below, including the price per m².
group_keys = ['code_postal', 'annee_mois', 'nature_mutation']
averaged_columns = [
    'valeur_par_surface_bati',
    'nombre_pieces_principales',
    'surface_reelle_bati',
    'valeur_fonciere',
    'longitude',
    'latitude',
]
mean_of_each = {column: 'mean' for column in averaged_columns}

# reset_index() turns the group keys back into ordinary columns.
df_agg = df.groupby(group_keys).agg(mean_of_each).reset_index()

In [32]:
# Split features and target.
# The target is the mean price per m²; 'valeur_fonciere' is removed from the
# features as well (presumably because it would leak the target -- confirm).
target = 'valeur_par_surface_bati'
excluded_from_features = [target, 'valeur_fonciere']

y = df_agg[target]
x = df_agg.drop(columns=excluded_from_features)

In [33]:
# Split train/test: 25% of the rows are held out; the seed is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [34]:
# Separate numeric and categorical features by dtype: numeric columns are
# selected explicitly, every remaining column is treated as categorical.
numeric_features = x.select_dtypes(include='number').columns
categoric_features = x.columns.drop(numeric_features)

In [35]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. The transformer names ('cat', 'num') become the prefixes of
# the output feature names, so they must stay stable.
ohe = OneHotEncoder(
    handle_unknown='ignore',  # do not raise on categories unseen during fit
    drop='first',             # drop the first category of each feature
    sparse_output=False,      # return a dense array
)
sc = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ohe, categoric_features),
        ('num', sc, numeric_features),
    ]
)

In [36]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits so the test split never influences the fit.
preprocessor.fit(x_train)
x_train_processed = preprocessor.transform(x_train)
x_test_processed = preprocessor.transform(x_test)

In [37]:
# Train model: random forest regressor with a fixed seed, fitted on the
# preprocessed training features.
model = RandomForestRegressor(random_state=42)
model.fit(X=x_train_processed, y=y_train)

In [38]:
# Make predictions on both splits (train first, then test).
y_train_pred, y_test_pred = (
    model.predict(features)
    for features in (x_train_processed, x_test_processed)
)

In [39]:
# Evaluate: mean absolute error on the held-out split, plus the estimator's
# own score() on each split (reported below as R²).
mae = mean_absolute_error(y_test, y_test_pred)
train_score = model.score(x_train_processed, y_train)
test_score = model.score(x_test_processed, y_test)

report_lines = (
    f"Train R² score: {train_score:.3f}",
    f"Test R² score: {test_score:.3f}",
    f"Mean Absolute Error: {mae:.2f} €/m²",
)
print("\n".join(report_lines))

Train R² score: 0.977
Test R² score: 0.853
Mean Absolute Error: 568.60 €/m²


In [40]:
# Visualisation: held-out predictions against observed values.
# y_test keeps its original index; the other columns are passed as plain
# arrays so they are placed positionally alongside it.
results_df = pd.DataFrame(
    dict(
        actual_prix_m2=y_test,
        predicted_prix_m2=y_test_pred,
        code_postal=x_test['code_postal'].values,
        annee_mois=x_test['annee_mois'].values,
    )
)

# Scatter of predicted vs actual, with postal code and month on hover.
axis_labels = {
    'predicted_prix_m2': 'Prix moyen €/m² prédit',
    'actual_prix_m2': 'Prix moyen €/m² réel',
}
fig = px.scatter(
    results_df,
    x='actual_prix_m2',
    y='predicted_prix_m2',
    hover_data=['code_postal', 'annee_mois'],
    title='Predicted vs Actual Price per m²',
    labels=axis_labels,
)

# Reference diagonal (y = x) spanning the observed range of the target:
# points on this line are predicted exactly.
observed_range = [y_test.min(), y_test.max()]
fig.add_scatter(
    x=observed_range,
    y=observed_range,
    mode='lines',
    name='Perfect Prediction',
    line={'dash': 'dash', 'color': 'red'},
)
fig.show()

In [41]:
# Pair each transformed feature name with the forest's importance for it.
# Note: the column is named "coefficients" but it holds the model's
# feature_importances_ values.
transformed_names = preprocessor.get_feature_names_out()
importances = model.feature_importances_

feature_importance = pd.DataFrame({
    "feature_names": transformed_names,
    "coefficients": importances,
})

In [42]:
# Bar chart of the five most important features.
#
# The ColumnTransformer names its outputs "<transformer>__<column>" (double
# underscore), so the display labels are keyed on that exact form. Tick values
# are taken from the rows actually plotted rather than hard-coded, and any
# feature without a friendly label (e.g. a one-hot column) falls back to its
# raw name instead of losing its tick.
feature_labels = {
    "num__code_postal": "Code Postal",
    "num__longitude": "Longitude",
    "num__latitude": "Latitude",
    "num__surface_reelle_bati": "Surface m²",
    "num__nombre_pieces_principales": "Nombre de pièces",
}

top_features = feature_importance.sort_values("coefficients", ascending=False).head(5)
top_feature_names = top_features["feature_names"].tolist()

fig1 = px.bar(
    top_features,
    x="feature_names",
    y="coefficients",
    title="Random Forest Feature Importance for Price Prediction",
    labels={
        # The values are feature importances, not regression coefficients.
        'coefficients': 'Importance',
        'feature_names': 'Features'
    }
)

# Relabel the categorical x axis: one tick per plotted bar, in plotted order.
fig1.update_xaxes(
    tickvals=top_feature_names,
    ticktext=[feature_labels.get(name, name) for name in top_feature_names]
)

fig1.update_layout(xaxis_tickangle=45)
fig1.show()

In [43]:
# Persist the fitted model and the fitted preprocessor with pickle.
# Both paths are relative to the current working directory.
import pickle

artifacts = (
    ("../src/model.pkl", model),                # Save model
    ("../src/preprocessor.pkl", preprocessor),  # Save preprocessor
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

In [47]:
import sklearn
import pandas
import numpy

# Print the library versions (sklearn, pandas, numpy, in that order) on one
# space-separated line, for reproducibility of the pickled artifacts.
print(*(library.__version__ for library in (sklearn, pandas, numpy)))

1.6.1 2.2.3 2.1.3
