# Modélisation

In [None]:
%load_ext autoreload
%autoreload 2

import warnings
import pandas as pd
from sklearn import set_config

# Silence every warning so library chatter does not clutter the cell outputs.
warnings.filterwarnings("ignore")

# pandas display options: very wide frames stay fully visible, printed rows are
# capped at 101 and floats are rendered with two decimals.
pd.set_option("display.max_columns", 5000)
pd.set_option("display.max_rows", 101)
pd.set_option("display.float_format", "{:.2f}".format)

# Ask scikit-learn transformers to output pandas DataFrames.
set_config(transform_output="pandas")

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from utils import RANDOM_STATE, TARGET


# Load the Kaggle training set; the target column is still part of X at this point.
X = pd.read_csv("./data/kaggle_train_set.csv")

# Keep only the training rows whose neighbourhood also appears in the test set.
# (The analogous filter on the construction year is currently disabled, see below.)
test = pd.read_csv("./data/kaggle_test_set.csv")

X = X[X.Neighborhood.isin(test.Neighborhood.unique())]
# X = X[X.YearBuilt.isin(test.YearBuilt)]

# Remove outliers: the detector only looks at these four surface-area columns.
keep_columns = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'GarageArea']

# contamination=0.03 asks LocalOutlierFactor to flag roughly 3% of the rows.
lof = LocalOutlierFactor(contamination=0.03)
outliers_prediction = lof.fit_predict(X.loc[:, keep_columns])

# Rows predicted as 1 are kept; everything else is treated as an outlier and dropped.
mask_non_outliers = outliers_prediction == 1
X = X[mask_non_outliers]
# Split the target off X. Neither filter resets the index, so X and y keep
# their original (non-contiguous) row labels.
y = X.pop(TARGET)

print(f"\nLa shape de X est {X.shape}")
print(f"La shape de y est {y.shape}")

In [None]:
def feature_engineering(df: pd.DataFrame, reference_year: int = 2011) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with age, area and presence features.

    The input frame is never modified. The following columns are added:

    - ``Age`` / ``AgeRemod``: ``reference_year`` minus ``YearBuilt`` /
      ``YearRemodAdd``.
    - ``TotalArea``: ``GrLivArea`` + ``TotalBsmtSF``.
    - ``HasGarage`` / ``HasFireplace`` / ``HasBsmt``: 1 when ``GarageArea`` /
      ``Fireplaces`` / ``TotalBsmtSF`` is strictly positive, else 0.

    Args:
        df: Frame containing the columns ``YearBuilt``, ``YearRemodAdd``,
            ``GrLivArea``, ``TotalBsmtSF``, ``GarageArea`` and ``Fireplaces``.
        reference_year: Year the ages are measured from (defaults to 2011,
            the value previously hard-coded).

    Returns:
        A new DataFrame with the original columns plus the engineered ones.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = df.copy()

    df['Age'] = reference_year - df['YearBuilt']
    df['AgeRemod'] = reference_year - df['YearRemodAdd']

    df["TotalArea"] = df["GrLivArea"] + df["TotalBsmtSF"]

    # The comparison must be parenthesised before calling .astype on it
    # (the opening parenthesis was missing for HasGarage and HasBsmt).
    df["HasGarage"] = (df["GarageArea"] > 0).astype(int)
    df["HasFireplace"] = (df["Fireplaces"] > 0).astype(int)
    df["HasBsmt"] = (df["TotalBsmtSF"] > 0).astype(int)

    return df
    


In [None]:
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Binarizer, PowerTransformer, OrdinalEncoder, RobustScaler


def get_age(x):
    """Return ``2011 - x``: the age, counted from 2011, of the year(s) in ``x``.

    Works element-wise on anything that supports subtraction from an integer
    (scalar, Series, DataFrame, array).
    """
    reference_year = 2011
    return reference_year - x

# Explicit category order handed to the ordinal encoder for Neighborhood.
# NOTE(review): the ordering looks like a ranking from cheapest to most expensive
# area — confirm how it was derived.
neighboors_categories = ['MeadowV', 'IDOTRR', 'BrDale', 'Edwards', 'BrkSide', 'OldTown', 'Blueste', 'Sawyer', 'SWISU', 'NAmes', 'NPkVill', 'Mitchel', 'SawyerW', 'Gilbert', 'Blmngtn', 'NWAmes', 'CollgCr', 'ClearCr', 'Crawfor', 'Somerst', 'Timber', 'Veenker', 'StoneBr', 'NoRidge', 'NridgHt']
# Category order shared by the two quality columns (ExterQual, KitchenQual).
# NOTE(review): presumably worst ('Fa') to best ('Ex'); a grade outside these four
# would likely make the encoder fail with its default settings — confirm the data.
quality_order = ['Fa', 'TA', 'Gd', 'Ex']

# Numeric branch: Yeo-Johnson power transform, standardised afterwards.
numerical_pipeline = Pipeline(
    steps=[
        ('power_transformation', PowerTransformer(method='yeo-johnson', standardize=True)),
    ]
    
)

# Categorical branch: ordinal codes following the explicit orders above
# (one list per encoded column, in the same order as the columns).
categorical_encoding_pipeline = Pipeline(
    steps=[
        ('encoder', OrdinalEncoder(categories=[neighboors_categories, quality_order, quality_order])),
    ]
    
)


# Full preprocessing: each branch receives its own column subset; any column
# not listed here is dropped (remainder='drop').
preprocessing = ColumnTransformer(
    transformers=[
        # Ordinal encoding of the three categorical columns.
        ('categorical', categorical_encoding_pipeline, ['Neighborhood', 'ExterQual', 'KitchenQual']),
        # Power-transformed numeric columns.
        ('numerical', numerical_pipeline, ['GarageCars', 'GrLivArea', 'TotalBsmtSF', 'LotArea']),
        # Presence flags, binarized with a threshold of 0.0. TotalBsmtSF is
        # deliberately used both here and in the numeric branch.
        ('has_categories', Binarizer(threshold=0.0), ['Fireplaces', 'GarageArea', 'TotalBsmtSF']),
        # Columns forwarded unchanged.
        ('selected_passthrough', 'passthrough', ['OverallQual', 'FullBath']),
        # Years converted to ages through get_age (2011 - year).
        ('aging', FunctionTransformer(func=get_age), ['YearRemodAdd', 'YearBuilt'])
    ],
    remainder='drop',
)


# Bare last expression: renders the transformer in the cell output.
preprocessing

In [None]:
# Sanity check: fit the preprocessing on the training data and display the transformed features.
preprocessing.fit_transform(X, y)

In [None]:
from lightgbm import LGBMRegressor
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Candidate regressors, all benchmarked with (mostly) default hyper-parameters.
models = dict([
    ("Ridge", Ridge(random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestRegressor(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingRegressor(loss='huber', random_state=RANDOM_STATE)),
    ("LGBM", LGBMRegressor(verbose=0, random_state=RANDOM_STATE)),
    ("SVR", SVR()),
    ("KNN", KNeighborsRegressor()),
])

# Mean 5-fold cross-validated MAE per model, keyed by model name.
results = {}
for name, model in models.items():
    # The target is learned on the log1p scale and mapped back with expm1.
    log_target_model = TransformedTargetRegressor(
        regressor=model,
        func=np.log1p,
        inverse_func=np.expm1,
    )
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('regressor', log_target_model),
    ])

    # The scorer returns the negated MAE, hence the sign flip below.
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    results[name] = -scores.mean()


print("Résultats des modèles :")
for name, score in results.items():
    print(f"{name}: MAE = {score:.2f}")

Gradient Boosting obtient de meilleurs résultats que les autres modèles. Gardons-le pour la suite.

In [None]:
import optuna


def objective(trial):
    """Optuna objective: cross-validated MAE of the Gradient Boosting pipeline.

    Samples one set of GradientBoostingRegressor hyper-parameters from ``trial``,
    wraps the regressor in the shared preprocessing + log-target pipeline and
    evaluates it with 5-fold cross-validation.

    Args:
        trial: The ``optuna`` trial used to draw the hyper-parameters.

    Returns:
        The mean absolute error averaged over the 5 folds (lower is better).
    """
    # Hyper-parameters to optimise. The loss is not searched: it is fixed to
    # 'huber' below so that tuning uses the same loss as the benchmark and the
    # final model.
    gb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 20),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        criterion=trial.suggest_categorical("criterion", ["friedman_mse", "squared_error"])
    )

    # Same pipeline shape as everywhere else: preprocessing, then a regressor
    # trained on log1p(target) and mapped back with expm1.
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('regressor', TransformedTargetRegressor(
            regressor=GradientBoostingRegressor(
                loss='huber', random_state=RANDOM_STATE, **gb_params
            ),
            func=np.log1p, inverse_func=np.expm1
        ))
    ])

    # The scorer returns the negated MAE, hence the sign flip.
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    return -scores.mean()

# Run the Optuna search. TPE is Optuna's default sampler; it is created
# explicitly only to seed it, so the search is reproducible like the rest of the
# notebook (assumes RANDOM_STATE is an int or None).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=20)

# Report the best hyper-parameters and the matching cross-validated MAE
best_params = study.best_params
print("Meilleurs hyperparamètres :", best_params)
print("Meilleurs scores: ", study.best_value)

In [None]:
# Entrainement sur l´ensemble du dataset
from sklearn.metrics import make_scorer
from sklearn.model_selection import TunedThresholdClassifierCV

# best_params = {'max_depth': 5, 'learning_rate': 0.009739721675870424, 'n_estimators': 700, 'subsample': 0.5619157765347228, 'min_samples_split': 8, 'min_samples_leaf': 14, 'max_features': 'sqrt', 'criterion': 'squared_error'}

# Final model: Gradient Boosting with the Huber loss and the tuned hyper-parameters.
final_regressor = GradientBoostingRegressor(
    random_state=RANDOM_STATE,
    loss='huber',
    **best_params,
)

# The target is learned on the log1p scale and mapped back with expm1.
log_target_regressor = TransformedTargetRegressor(
    regressor=final_regressor,
    func=np.log1p,
    inverse_func=np.expm1,
)

estimator = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('regressor', log_target_regressor),
    ]
)

# Override the tuned subsample ratio: the final fit uses every row for each tree.
estimator.set_params(regressor__regressor__subsample=1.0)

# Train on the whole training set (last expression: displays the fitted pipeline).
estimator.fit(X, y)


In [None]:
# In-sample predictions: the estimator was fitted on these very rows, so the
# errors below are optimistic.
y_pred = estimator.predict(X)

# Build the frame on X's own index. X kept its original (non-contiguous) row
# labels after the neighbourhood/outlier filtering, so a default RangeIndex would
# make join(X) pair each prediction with the wrong house (or with NaN).
analyse = pd.DataFrame(
    {'Prediction': np.asarray(y_pred), 'SalePrice': y.to_numpy()},
    index=X.index,
)
analyse['PriceDifference'] = np.absolute(analyse.Prediction - analyse.SalePrice)
analyse = analyse.join(X)
analyse.head(10)

In [None]:
# Summary statistics of the absolute prediction error on the training rows.
analyse.PriceDifference.describe()

In [None]:
# Keep the rows whose absolute error is strictly above the median absolute error.
upper_error = analyse.loc[
    lambda frame: frame.PriceDifference > frame.PriceDifference.median()
]
upper_error.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Every feature column, i.e. everything except the prediction/error bookkeeping.
analysis_columns = [name for name in analyse.columns if name not in ['Prediction', 'SalePrice', 'PriceDifference']]
# One histogram per feature, stacked vertically.
fig, axs = plt.subplots(len(analysis_columns), 1, figsize=(12, 40))
for i, name in enumerate(analysis_columns):
    # Plot the above-median-error rows only, as the title announces
    # (the full `analyse` frame was plotted before).
    sns.histplot(upper_error, x=name, ax=axs[i])
fig.suptitle("Distribution des cas > médiane de Absolute Error.")

In [None]:
# Reload the Kaggle test set
test = pd.read_csv("./data/kaggle_test_set.csv")

# Predict with the estimator fitted on the whole training set
result = estimator.predict(test)

# Build the submission frame: a single "SalePrice" column carrying the test frame's row index
# NOTE(review): the row position is used as the identifier — confirm this is the ID the competition expects.
submission_df = pd.DataFrame(result, columns=["SalePrice"], index=test.index)
submission_df.index.name = "ID"  # the index is written to the CSV as the "ID" column

# Write the submission CSV
submission_df.to_csv("./data/submission.csv")

# Read the file back to check what was actually written
check_df = pd.read_csv("./data/submission.csv", index_col='ID')
check_df.head()

In [None]:
from datetime import datetime
import io
import subprocess
import time

from utils import sauvegarder_model

# Kaggle competition identifier passed to the `kaggle` CLI via its `-c` option.
competition = "regression-prediction-prix-immobilier"


def all_status_complete(df):
    """Tell whether every row of ``df`` has reached a terminal status.

    A status is terminal when it equals 'SubmissionStatus.ERROR' or
    'SubmissionStatus.COMPLETE'. An empty frame counts as complete.

    Args:
        df: DataFrame with a ``status`` column.

    Returns:
        True when all statuses are terminal, False otherwise.
    """
    terminal_statuses = ['SubmissionStatus.ERROR', 'SubmissionStatus.COMPLETE']
    is_terminal = df['status'].isin(terminal_statuses)
    return bool(is_terminal.all())

# Query the Kaggle CLI again and hand back its raw output
def relaunch_subprocess():
    """Run ``kaggle competitions submissions -v`` for the configured competition.

    Returns:
        The standard output of the command as text; standard error and the
        return code are ignored.
    """
    command = ["kaggle", "competitions", "submissions", "-v", "-c",  competition]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout



now = str(datetime.now())
# Time budget for Kaggle to score the submission
timeout = 60  # seconds
start_time = time.time()

data = None

# Set to False to skip the Kaggle submission and only save the model locally
SUBMIT = True

if SUBMIT:
    file_path = "./data/submission.csv"
    message = f"timestamp: {now}, Utilisation des paramètres: {estimator.get_params()}"

    # Arguments are passed as a list (no shell), so the message needs no quoting
    result = subprocess.run(
        ["kaggle", "competitions", "submit", "-c", competition, "-f", file_path, "-m", message],
        capture_output=True,
        text=True
    )

    print("⚒️", result.stdout)

    if result.stderr != '':
        print(result.stderr)

    # Stop here when the upload itself failed: polling would otherwise judge an
    # older submission and could save the model as a "new best score".
    if result.returncode != 0:
        raise RuntimeError(
            f"La soumission Kaggle a échoué (code de retour {result.returncode})."
        )

    while True:
        # Fetch the submission list (CSV text) from the Kaggle CLI
        stdout_data = relaunch_subprocess()

        # Parse it into a DataFrame
        data = pd.read_csv(io.StringIO(stdout_data), parse_dates=['date'])

        # Done as soon as every submission is in a terminal state
        if all_status_complete(data):
            break

        # Give up once the time budget is exhausted
        if time.time() - start_time > timeout:
            raise RuntimeError("Timeout atteint. Arrêt de la boucle.")

        # Wait 20 seconds before polling again
        time.sleep(20)

    # The most recent submission is the one that was just sent
    most_recent = data.sort_values('date', ascending=False).iloc[0]
    recent_score = most_recent['publicScore']
    # Lower is better: the best score is the minimum public score
    best_score = data['publicScore'].min()

    if pd.isna(recent_score):
        # ERROR is a terminal status too, and such a submission has no score
        print("❌ La dernière soumission n´a pas de score public (statut en erreur ?).")
    elif recent_score == best_score:
        print(f"🥳 Nouveau meilleur score : {recent_score:.5f}")
        sauvegarder_model(estimator, timestamp=now, only_latest=False)
    else:
        print(f"❌ Bien essayé mais c´est moins bon. Score: {recent_score:.5f}. Meilleur score : {best_score:.5f}")

if not SUBMIT:
    sauvegarder_model(estimator, timestamp=now, only_latest=True)
