<h1 style="color:#2192f1"> PARIS HOUSING PRICES PREDICTION</h1>


<h2> <span style="color:#12f095">GOAL : </span> Predict sales prices for each house using : <br><br> <span style="color:#7b55e1; font-style:oblique">Linear Regression<br>Decision Tree Regressor<br> Random Forest Regressor<br>XGBoost Regressor<br></span></h2>



<h2 style="color:#f0e912">IMPORTING LIBRARIES</h2>


In [23]:
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from src import evaluation, residual_distribution, run_all_metrics
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.ensemble import RandomForestRegressor
from src import run_all_metrics, register_param
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore", category=UserWarning)
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
import xgboost as xgb

print("import successful")



import successful


<h2 style="color:#f0e912"><br>DATA EXPLORATION</h2>


In [5]:
# Load the Paris housing CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
data = pd.read_csv("../data/ParisHousing.csv")

In [6]:
# Quick overview: display the first rows to eyeball column names and value ranges
data.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [7]:
# Summarise the frame: number of entries, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              1000

In [8]:
# Separate the target ("price") from the features, then hold out 20 % of the
# rows as a test split. The split is seeded so it is identical on every run.
y = data["price"]
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<h2 style="color:#f0e912"><br>DATA VISUALIZATION</h2>


In [None]:
# Build a plotting frame: the training features plus the matching training target.
# `assign` returns a new frame, so X_train itself is left unmodified.
df_plot = X_train.assign(price=y_train)

In [None]:
# Correlation matrix of the training features and the target (pandas' default
# method, i.e. Pearson), drawn as an annotated heatmap on a fixed [-1, 1] scale.
matrix = df_plot.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(matrix, annot=True, fmt='.1f', vmin=-1, vmax=1, ax=ax)
# A title lets the figure stand alone when the notebook is skimmed
ax.set_title("Correlation matrix (training set)")
# Render explicitly so the cell does not also echo the Axes repr
plt.show()

In [None]:
# Spearman (rank-based) correlation between all columns of the plotting frame;
# only the column relating each feature to the target is printed.
spearman_corr = df_plot.corr(method='spearman')
print(spearman_corr['price'])

In [None]:
# Scatter the target against a handful of features, one panel per feature
pairplot_features = ['squareMeters', 'numberOfRooms', 'basement', 'attic', 'made']
sns.pairplot(df_plot, x_vars=pairplot_features, y_vars='price', height=4, aspect=1)

<h2 style="color:#f0e912"><br>MODEL TRAINING</h2>


<h3 style="color:#0c99d1">LINEAR REGRESSION</h3>

In [None]:
# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the held-out test rows.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

In [None]:
# Model evaluation on the test split.
# `run_all_metrics` is a project helper imported from `src`; presumably it
# computes and reports the evaluation metrics under the given label — confirm.
run_all_metrics(y_pred, y_test, "Linear regression")

In [None]:
# `residual_distribution` is a project helper imported from `src`; presumably it
# plots the distribution of the residuals (y_test - y_pred) — confirm.
# NOTE: relies on `y_pred` still holding the linear-regression predictions.
residual_distribution(y_test, y_pred)

<h3 style="color:#0c99d1">DECISION TREE REGRESSOR</h3>


<h4 style="color:#e8b215">SIMPLE SETTING</h4>


In [None]:
# Baseline decision tree with default hyper-parameters; seeded for repeatable results
decision_tree_model = DecisionTreeRegressor(random_state=42)

In [None]:
# 10-fold cross-validation on the training split to check how well the baseline
# tree generalises. No `scoring` is given, so scikit-learn falls back to the
# estimator's own `score` method (R² for regressors); one value per fold is shown.
cross_val_score(decision_tree_model, X_train, y_train, cv=10)

In [None]:
# Train the baseline tree on the full training split
decision_tree_model.fit(X_train, y_train)

In [None]:
# Predict the held-out test rows with the baseline tree
y_pred_decision_tree = decision_tree_model.predict(X_test)

In [None]:
# Evaluate the baseline tree on the test split (project helper from `src`)
run_all_metrics(y_pred_decision_tree, y_test, "Decision tree regression")

In [None]:
# Residual view for the baseline tree (project helper from `src`)
residual_distribution(y_test, y_pred_decision_tree)

<h4 style="color:#e8b215">GRID SEARCH</h4>


In [None]:
# Candidate values for each DecisionTreeRegressor hyper-parameter; the grid
# search tries every combination of them.
# NOTE(review): the ccp_alpha values are tiny next to prices in the millions, so
# they may all prune identically — confirm this dimension is worth its cost.
param_grid = dict(
    splitter=['best', 'random'],
    max_depth=[None, 5, 8, 10, 12, 17, 20],
    min_samples_split=[2, 5, 10, 15, 30, 45, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    max_features=[None, 'sqrt', 'log2'],
    ccp_alpha=[0.0, 0.01, 0.1],
)

In [None]:
# Exhaustive grid search, scored with 5-fold cross-validation on the TRAINING
# split only. The test split must not take part in model selection: choosing
# hyper-parameters by test-set error would make every test metric reported
# afterwards optimistically biased.
best_score = float('inf')
best_params = None

grid = ParameterGrid(param_grid)

for params in tqdm(grid, desc="Grid Search"):
    # Seed every candidate: the 'random' splitter and feature sub-sampling are
    # stochastic, and the refit of the winner also uses random_state=42.
    model = DecisionTreeRegressor(**params, random_state=42)
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    # The scorer returns negative MSE; flip the sign to get a value to minimise
    score = -np.mean(cv_scores)

    if score < best_score:
        best_score = score
        best_params = params

In [None]:
# Report the lowest MSE found by the grid search and the parameters that produced it
print("\nBest MSE:", best_score)
print("Best params:", best_params)

In [None]:
# Refit a seeded tree with the winning grid-search parameters on the whole
# training split, then predict the held-out test rows.
dt_gridsearch_model = DecisionTreeRegressor(random_state=42, **best_params).fit(
    X_train, y_train
)
y_pred_dt_gridsearch = dt_gridsearch_model.predict(X_test)

In [None]:
# Evaluate the grid-searched tree on the test split (project helper from `src`).
# NOTE(review): the label is identical to the one used for the baseline tree;
# if the helper keys its output by label the two results may be
# indistinguishable — confirm.
run_all_metrics(y_pred_dt_gridsearch, y_test, "Decision tree regression")

In [None]:
# Residual view for the grid-searched tree (project helper from `src`)
residual_distribution(y_test, y_pred_dt_gridsearch)

<h3 style="color:#0c99d1">RANDOM FOREST REGRESSOR</h3>


<h4 style="color:#e8b215">BAYESIAN OPTIMISATION (OPTUNA)</h4>


In [None]:
def objective(trial):
    """Optuna objective for the random forest search.

    Samples one set of RandomForestRegressor hyper-parameters from `trial` and
    scores it with 5-fold cross-validation on the training split.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        Mean cross-validated MSE (positive, lower is better).
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }
    forest = RandomForestRegressor(random_state=42, **search_space)

    # The scorer yields negative MSE per fold; negate the mean so that the
    # study can minimise it.
    neg_mse_per_fold = cross_val_score(
        forest,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    return -neg_mse_per_fold.mean()

In [None]:
# Start the search: 50 trials minimising the cross-validated MSE returned by
# `objective`. The sampler is seeded so the sequence of trials, and therefore
# the selected hyper-parameters, is the same on every run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Train the final random forest with the best hyper-parameters found by the
# study, using the whole training split, and predict the held-out test rows.
# NOTE: this rebinds `best_params`, which previously held the grid-search result.
best_params = study.best_params
final_model = RandomForestRegressor(random_state=42, **best_params).fit(
    X_train, y_train
)

y_pred_random_forest = final_model.predict(X_test)

In [None]:
# Evaluate the tuned random forest on the test split (project helper from `src`).
# The label names the model actually being evaluated.
run_all_metrics(y_pred_random_forest, y_test, "Random forest regression")

In [None]:
# `register_param` is a project helper imported from `src`; presumably it
# persists the study's best value and best parameters under the given
# path/name — confirm against its implementation.
# NOTE: `study` is rebound by the XGBoost section further down, so this cell
# records the random-forest result only when run before that section.
register_param("../outputs/models_parameters/models_params", "models_params", study.best_value, study.best_params)

<h3 style="color:#0c99d1">XGBOOST</h3>


In [29]:
# KFold provides the shuffled, seeded splitter used by the XGBoost objective
from sklearn.model_selection import KFold
# NOTE(review): the stdlib `logging` module is imported but not referenced in this cell
import logging
# Hide Optuna's per-trial log lines: only warnings and above are emitted
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    """Optuna objective for the XGBoost search.

    Samples one set of XGBRegressor hyper-parameters from `trial` and scores it
    with shuffled, seeded 3-fold cross-validation on the training split.
    Note that this rebinds the name `objective` used by the random forest search.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        Mean cross-validated MSE (positive, lower is better).
    """
    # Hyper-parameters suggested for this trial
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 200, 1500),
        "min_child_weight": trial.suggest_int("min_child_weight", 8, 12),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.45),
        "subsample": trial.suggest_float("subsample", 0.5, 0.6),
        "reg_alpha": trial.suggest_float("reg_alpha", 2, 6, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 60, 100, log=True),
    }
    booster = xgb.XGBRegressor(random_state=42, **search_space)

    # Same three shuffled folds for every trial, so trials are comparable
    folds = KFold(n_splits=3, shuffle=True, random_state=42)

    # The scorer yields negative MSE per fold; negate the mean so that the
    # study can minimise it.
    neg_mse_per_fold = cross_val_score(
        booster,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=folds,
    )
    return -neg_mse_per_fold.mean()

In [30]:
# Create the Optuna study to minimize the objective. The sampler is seeded so
# the 100 trials, and therefore the best hyper-parameters, are reproducible
# from one run to the next.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

Best trial: 93. Best value: 1.87482e+09: 100%|██████████| 100/100 [13:44<00:00,  8.25s/it]


In [None]:
# Report the winning trial of the current `study`: its hyper-parameters and
# the objective value it reached (mean cross-validated MSE).
print("Best hyperparameters:", study.best_params)
print("Best MSE:", study.best_value)