# üß† Analyse des Variables Importantes pour la Pr√©diction du Prix

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
import os
import re
from datetime import datetime
import pandas as pd


In [9]:
# üìÅ Dossier contenant les sauvegardes CSV
data_dir = "/Users/zecklimonsso/GitHub/centris-webscrapping-service/mongo_backup_csv/"

# üîç Pattern de nom de fichier
pattern = re.compile(r"properties_(\d{8}_\d{6})\.csv")

latest_file = None
latest_time = None

# üì¶ Parcours des fichiers dans le dossier
for file in os.listdir(data_dir):
    match = pattern.match(file)
    if match:
        timestamp_str = match.group(1)
        timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
        if not latest_time or timestamp > latest_time:
            latest_time = timestamp
            latest_file = file

# üìä Chargement du fichier trouv√©
if latest_file:
    file_path = os.path.join(data_dir, latest_file)
    print(f"‚úÖ Chargement du fichier le plus r√©cent : {latest_file}")
    df = pd.read_csv(file_path)
    display(df.head())
else:
    raise FileNotFoundError("‚ùå Aucun fichier properties_*.csv trouv√© dans le dossier.")


‚úÖ Chargement du fichier le plus r√©cent : properties_20250408_233437.csv


EmptyDataError: No columns to parse from file

In [8]:
df = df.copy()
seuil = 0.05
df = df.loc[:, df.notnull().mean() > seuil]

cols_to_drop = []
for col in df.columns:
    try:
        if 'id' in col.lower() or 'link' in col.lower():
            cols_to_drop.append(col)
        elif df[col].nunique() <= 1:
            cols_to_drop.append(col)
    except TypeError:
        cols_to_drop.append(col)
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

def safe_str(x):
    try:
        return str(x)
    except:
        return "unknown"

object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].apply(safe_str)
    df[col] = LabelEncoder().fit_transform(df[col])

imputer = SimpleImputer(strategy="mean")
df[df.columns] = imputer.fit_transform(df)


In [None]:
if 'price' not in df.columns:
    raise ValueError("La colonne 'price' est manquante")

X = df.drop(columns=['price'])
y = df['price']


In [None]:
lasso = LassoCV(cv=5, random_state=42).fit(X, y)
lasso_selected = X.columns[lasso.coef_ != 0].tolist()

rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
rf_selected = X.columns[rf.feature_importances_ > 0.01].tolist()

important_features = list(set(lasso_selected) | set(rf_selected))

print("Lasso :", lasso_selected)
print("RF :", rf_selected)
print("Variables retenues :", important_features)


In [None]:
rf_importances = {
    feature: importance
    for feature, importance in zip(X.columns, rf.feature_importances_)
    if feature in important_features
}

sorted_features = sorted(rf_importances.items(), key=lambda x: x[1], reverse=True)
features_names = [item[0] for item in sorted_features]
importance_values = [item[1] for item in sorted_features]

plt.figure(figsize=(12, 6))
sns.barplot(x=importance_values, y=features_names)
plt.title("Importance des variables s√©lectionn√©es (Random Forest)")
plt.xlabel("Importance")
plt.ylabel("Variable")
plt.tight_layout()
plt.show()


In [None]:
df_corr = df[important_features + ['price']]
corr_matrix = df_corr.corr()
corr_with_price = corr_matrix['price'].drop('price').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=corr_with_price.values, y=corr_with_price.index, dodge=False, legend=False)
plt.title("Corr√©lation des variables avec le prix")
plt.xlabel("Corr√©lation avec price")
plt.ylabel("Variable")
plt.tight_layout()
plt.show()
