<h1 style="color:#2192f1"> PARIS HOUSING PRICES PREDICTION</h1>


<h2> <span style="color:#12f095">GOAL : </span> Predict sales prices four each house using : <br><br> <span style="color:#7b55e1; font-style:oblique">Linear Regression<br>Decision Tree Regressor<br> Random Forest Regressor<br>XGBoost Regressor<br></span></h2>



<h2 style="color:#f0e912">IMPORTING LIBRARIES</h2>


In [154]:
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from src import evaluation, residual_distribution, run_all_metrics
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.ensemble import RandomForestRegressor
from src import run_all_metrics, register_param

warnings.filterwarnings("ignore", category=UserWarning)
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from itertools import product
from sklearn.inspection import PartialDependenceDisplay
from sklearn import preprocessing

print("import successful")


import successful


<h2 style="color:#f0e912"><br>DATA EXPLORATION</h2>


In [None]:
# Load data
data = pd.read_csv("../data/ParisHousing.csv")

In [None]:
# Quick overview
data.head()

In [None]:
# Give information about features ( Entries number, type, features number, etc.)
data.info()

In [None]:
# Divide data
X = data.drop('price', axis=1)
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h2 style="color:#f0e912"><br>DATA VISUALIZATION</h2>


In [None]:
# Combine X_train and y_train for visualization
df_plot = X_train.copy()
df_plot['price'] = y_train

In [None]:
# matrix correlation
matrix = df_plot.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(matrix, annot=True, fmt='.1f', vmin=-1, vmax=1)

In [None]:
spearman_corr = df_plot.corr(method='spearman')
print(spearman_corr['price'])

In [None]:
sns.pairplot(
    df_plot,
    x_vars=['squareMeters', 'numberOfRooms', 'basement', 'attic', 'made'],
    y_vars='price',
    height=4,
    aspect=1
)

<h2 style="color:#f0e912"><br>MODEL TRAINING</h2>


<h3 style="color:#0c99d1">LINEAR REGRESSION</h3>

In [134]:
# Model training and prediction
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

In [135]:
# Model evaluation
run_all_metrics(y_pred, y_test, "Linear regression")

📊  Regression model evaluation (Linear regression)
🔹 MAE (Mean Absolute Error) : 1510.03
🔹 MSE (Mean Squared Error)  : 3695708.44
🔹 R2  (Coefficient of Determination)  : 1.00


In [None]:
residual_distribution(y_test, y_pred)

<h3 style="color:#0c99d1">DECISION TREE REGRESSOR</h3>


<h4 style="color:#e8b215">SIMPLE SETTING</h4>


In [None]:
# Model definition
decision_tree_model = DecisionTreeRegressor(random_state=42)

In [None]:
# Cross-validation to test the robustness of the model and its ability to generalize to new data
cross_val_score(decision_tree_model, X_train, y_train, cv=10)

In [None]:
# Train regressor model
decision_tree_model.fit(X_train, y_train)

In [None]:
y_pred_decision_tree = decision_tree_model.predict(X_test)

In [None]:
run_all_metrics(y_pred_decision_tree, y_test, "Decision tree regression")

In [None]:
residual_distribution(y_test, y_pred_decision_tree)

<h4 style="color:#e8b215">GRID SEARCH</h4>


In [None]:
# Dictionary of parameters to test
param_grid = {
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 8, 10, 12, 17,  20],
    'min_samples_split': [2, 5, 10, 15, 30, 45, 50, 70],
    'min_samples_leaf': [1, 2, 5, 10, 15, 20],
    'max_features': [None,  'sqrt', 'log2'],
    'ccp_alpha': [0.0, 0.01, 0.1]
}

In [None]:
best_score = float('inf')
best_params = None

grid = ParameterGrid(param_grid)

for params in tqdm(grid, desc="Grid Search"):
    model = DecisionTreeRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred)

    if score < best_score:
        best_score = score
        best_params = params

In [None]:
print("\nBest MSE:", best_score)
print("Best params:", best_params)

In [None]:
dt_gridsearch_model = DecisionTreeRegressor(**best_params, random_state=42)
dt_gridsearch_model.fit(X_train, y_train)
y_pred_dt_gridsearch = dt_gridsearch_model.predict(X_test)

In [None]:
run_all_metrics(y_pred_dt_gridsearch, y_test, "Decision tree regression")

In [None]:
residual_distribution(y_test, y_pred_dt_gridsearch)

<h3 style="color:#0c99d1">RANDOM FOREST REGRESSOR</h3>


<h4 style="color:#e8b215">BAYESIAN OPTIMISATION (OPTUNA)</h4>


In [147]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'random_state': 42
    }

    model = RandomForestRegressor(**params)
    # CV score négatif car on minimise MSE
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(score)

In [None]:
# Lancer la recherche
warnings.filterwarnings("ignore", category=UserWarning, module="woodwork")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [151]:
best_params = study.best_params
final_model = RandomForestRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

y_pred_random_forest = final_model.predict(X_test)

In [152]:
run_all_metrics(y_pred_random_forest, y_test, "Decision tree regression")

📊  Regression model evaluation (Decision tree regression)
🔹 MAE (Mean Absolute Error) : 3382.26
🔹 MSE (Mean Squared Error)  : 20477173.04
🔹 R2  (Coefficient of Determination)  : 1.00


In [156]:
register_param("../outputs/models_parameters/models_params", "models_params", study.best_value, study.best_params)

Adding new model models_params with score 19583581.85256026.
Data saved to ../outputs/models_parameters/models_params


<h3 style="color:#0c99d1">XGBOOST</h3>
