# Modelisation

In [3]:
%load_ext autoreload
%autoreload 2

import warnings
import pandas as pd
from sklearn import set_config

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 5000)
pd.set_option("display.max_rows", 101)
pd.set_option('display.float_format', lambda x: '{:.2f}'.format(x))

set_config(transform_output="pandas")

In [4]:
# Load the data
from sklearn.model_selection import train_test_split

from utils import RANDOM_STATE, TARGET, TEST_SIZE


X = pd.read_csv("./data/kaggle_train_set.csv")
y = X.pop(TARGET)

print()
print(f"La shape de X est {X.shape}")
print(f"La shape de y est {y.shape}")

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
#     )

# print(f"Le train set contient: {X_train.shape} élements")
# print(f"Le test set contient: {X_test.shape} élements")


La shape de X est (1180, 15)
La shape de y est (1180,)


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler, OrdinalEncoder

neighboors_categories = ['MeadowV', 'IDOTRR', 'BrDale', 'Edwards', 'BrkSide', 'OldTown', 'Blueste', 'Sawyer', 'SWISU', 'NAmes', 'NPkVill', 'Mitchel', 'SawyerW', 'Gilbert', 'Blmngtn', 'NWAmes', 'CollgCr', 'ClearCr', 'Crawfor', 'Somerst', 'Timber', 'Veenker', 'StoneBr', 'NoRidge', 'NridgHt']
quality_order = ['Fa', 'TA', 'Gd', 'Ex']

first_stage = ColumnTransformer(
    transformers=[
        ('encoder', OrdinalEncoder(categories=[neighboors_categories, quality_order, quality_order]), ['Neighborhood', 'ExterQual', 'KitchenQual']),
        ('power_transformation', PowerTransformer(method='yeo-johnson'), ['OverallQual', 'GarageCars', 'GrLivArea', '1stFlrSF'])
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

preprocessing = Pipeline(steps=[
    ('first_stage', first_stage),
    ('scaler', RobustScaler())
])

preprocessing

0,1,2
,steps,"[('first_stage', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...), ('power_transformation', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['MeadowV', 'IDOTRR', ...], ['Fa', 'TA', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False


In [42]:
preprocessing.fit_transform(X)

Unnamed: 0,Neighborhood,ExterQual,KitchenQual,OverallQual,GarageCars,GrLivArea,1stFlrSF
0,0.56,1.00,1.00,0.49,0.00,0.37,-0.53
1,1.11,0.00,0.00,0.00,0.00,-0.29,0.32
2,0.56,1.00,1.00,0.49,0.00,0.46,-0.37
3,0.89,1.00,1.00,0.95,0.00,0.35,0.95
4,-0.67,0.00,0.00,0.49,0.00,0.45,-0.14
...,...,...,...,...,...,...,...
1175,-0.89,0.00,0.00,-0.51,0.00,-0.64,-0.03
1176,0.89,1.00,1.00,0.49,0.00,-0.36,0.25
1177,0.22,0.00,0.00,0.00,0.00,0.29,-0.29
1178,0.78,2.00,1.00,0.49,-1.00,1.04,0.19


In [7]:
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

models = {
    "Ridge": Ridge(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(loss='huber', random_state=RANDOM_STATE),
    "LGBM": LGBMRegressor(verbose=0, random_state=RANDOM_STATE),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor()
}

results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('regressor', model)
    ])
    
    # Validation croisée pour calculer la MAE
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    results[name] = -scores.mean()


print("Résultats des modèles :")
for name, score in results.items():
    print(f"{name}: MAE = {score:.2f}")

Résultats des modèles :
Ridge: MAE = 24740.97
Random Forest: MAE = 20241.60
Gradient Boosting: MAE = 19104.66
LGBM: MAE = 20173.65
SVR: MAE = 55801.26
KNN: MAE = 20835.00


Gradient Boosting performe mieux que les autres. Gardons le pour la suite.

In [56]:
import optuna


def objective(trial):
    # Définir les hyperparamètres à optimiser
    gb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        loss=trial.suggest_categorical("loss", ['squared_error', 'absolute_error', 'huber', 'quantile']),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 20),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    )
    
    # Créer le pipeline avec le modèle et le prétraitement
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('regressor', GradientBoostingRegressor(random_state=RANDOM_STATE, **gb_params))
    ])
    
    # Validation croisée pour calculer la MAE
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    return -scores.mean()

# Lancer l'optimisation avec Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Afficher les meilleurs paramètres
best_params = study.best_params
print("Meilleurs hyperparamètres :", best_params)

[I 2025-08-26 19:36:01,378] A new study created in memory with name: no-name-a9ab4d0b-e3a7-4aac-ac88-a6649e96136b


[I 2025-08-26 19:36:02,056] Trial 0 finished with value: 43022.336113319 and parameters: {'max_depth': 7, 'loss': 'squared_error', 'learning_rate': 0.001201132270097907, 'n_estimators': 309, 'subsample': 0.5454077004529858, 'min_samples_split': 4, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 43022.336113319.
[I 2025-08-26 19:36:03,753] Trial 1 finished with value: 49295.479462141186 and parameters: {'max_depth': 3, 'loss': 'huber', 'learning_rate': 0.0007066720043192144, 'n_estimators': 234, 'subsample': 0.9850700659927278, 'min_samples_split': 9, 'min_samples_leaf': 19, 'max_features': None}. Best is trial 0 with value: 43022.336113319.
[I 2025-08-26 19:36:12,958] Trial 2 finished with value: 18420.771975981555 and parameters: {'max_depth': 7, 'loss': 'absolute_error', 'learning_rate': 0.010017601543521574, 'n_estimators': 885, 'subsample': 0.666625182930656, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 2 with va

Meilleurs hyperparamètres : {'max_depth': 7, 'loss': 'absolute_error', 'learning_rate': 0.010017601543521574, 'n_estimators': 885, 'subsample': 0.666625182930656, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}


In [13]:
# Entrainement sur l´ensemble du dataset
from sklearn.metrics import make_scorer
from sklearn.model_selection import TunedThresholdClassifierCV

best_params = {'max_depth': 7, 'loss': 'absolute_error', 'learning_rate': 0.010017601543521574, 'n_estimators': 885, 'subsample': 0.666625182930656, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}

estimator = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('regressor', GradientBoostingRegressor(random_state=RANDOM_STATE, **best_params))
        ]
)

estimator.fit(X, y)

# Load the data
test = pd.read_csv("./data/kaggle_test_set.csv")

# Prédictions
result = estimator.predict(test)

# Créer le DataFrame de soumission avec "ID" comme index
submission_df = pd.DataFrame(result, columns=["SalePrice"], index=test.index)
submission_df.index.name = "ID"  # Renommer l'index en "ID"

# Sauvegarder le fichier CSV
submission_df.to_csv("./data/submission.csv")

# Vérification
check_df = pd.read_csv("./data/submission.csv")
check_df.head()

Unnamed: 0,ID,SalePrice
0,0,175073.77
1,1,370870.61
2,2,188438.71
3,3,154344.12
4,4,115713.81


In [None]:
from datetime import datetime
import io
import subprocess
import time

from utils import sauvegarder_model

competition = "regression-prediction-prix-immobilier"


def all_status_complete(df):
    return all(df['status'].isin(['SubmissionStatus.FAIL', 'SubmissionStatus.ERROR', 'SubmissionStatus.COMPLETE']))

# Fonction pour relancer le subprocess et récupérer les données
def relaunch_subprocess():
    result = subprocess.run(
        ["kaggle", "competitions", "submissions", "-v", "-c",  competition],
        capture_output=True,
        text=True
        )
    return result.stdout



now = str(datetime.now())
# Timeout de 1 minute
timeout = 60  # en secondes
start_time = time.time()

data = None

if SUBMIT:= True:
    file_path = "./data/submission.csv"
    message = f"timestamp: {now}, Utilisation des paramètres: {estimator.get_params()}"

    result = subprocess.run(
        ["kaggle", "competitions", "submit", "-c", competition, "-f", file_path, "-m", message],
        capture_output=True,
        text=True
    )

    print("⚒️", result.stdout)

    if result.stderr != '':
        print(result.stderr)
    data = None

    while True:
        # Récupérer les données depuis le subprocess
        stdout_data = relaunch_subprocess()
        
        # Convertir les données en DataFrame
        data = pd.read_csv(io.StringIO(stdout_data), parse_dates=['date'])
        
        # Vérifier si tous les statuts sont complets
        if all_status_complete(data):
            break
        
        # Vérification du timeout
        if time.time() - start_time > timeout:
            print("")
            raise RuntimeError("imeout atteint. Arrêt de la boucle.")
        
        # Pause avant la prochaine vérification
        time.sleep(20)  # Pause de 20 secondes

    if data is not None and all_status_complete(data):
        # Trouver la ligne avec la date la plus récente
        most_recent = data.sort_values('date', ascending=False).iloc[0]
        recent_score = most_recent['publicScore']
        best_score = data['publicScore'].max()

        if recent_score > best_score:
            print(f"🥳 Nouveau meilleur score : {recent_score:.5f}")
            sauvegarder_model(estimator, timestamp=now, only_latest=False)
        else:
            print(f"❌ Bien essayé mais c´est moins bon. Score: {recent_score:.5f}. Meilleur score : {best_score:.5f}")

if not SUBMIT:
    sauvegarder_model(estimator, timestamp=now, only_latest=True)


⚒️ Successfully submitted to Régression prédiction prix immobilier

  0%|          | 0.00/6.11k [00:00<?, ?B/s]
100%|██████████| 6.11k/6.11k [00:00<00:00, 15.0kB/s]



KeyboardInterrupt: 