# Pré-contrôle

In [None]:
import os
from pathlib import Path

# Show where the kernel was started: every project path below is derived from it.
print("CWD:", Path.cwd())

# The project root is taken to be the parent of the working directory
# (assumes the notebook is launched from a direct sub-folder of the project — TODO confirm).
PROJECT_ROOT = Path.cwd().parent

DATASET_DIR_NAME = "dataset_final_immo_idf_parquet"
HADOOP_DIR_NAME  = "hadoop"

dataset_dir = PROJECT_ROOT.joinpath("data", "processed", DATASET_DIR_NAME).resolve()
hadoop_home = PROJECT_ROOT.joinpath(HADOOP_DIR_NAME).resolve()
bin_dir     = hadoop_home.joinpath("bin").resolve()

for caption, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("Dataset dir:", dataset_dir),
    ("Hadoop home:", hadoop_home),
    ("Bin dir    :", bin_dir),
):
    print(caption, location)

# Fail fast if the dataset or either Windows Hadoop native binary is missing.
winutils_path = bin_dir / "winutils.exe"
hadoop_dll_path = bin_dir / "hadoop.dll"

assert dataset_dir.exists(), f"Dataset Parquet introuvable: {dataset_dir}"
assert winutils_path.exists(), f"winutils.exe introuvable: {winutils_path}"
assert hadoop_dll_path.exists(), f"hadoop.dll introuvable: {hadoop_dll_path}"

CWD: C:\Users\ilyes\M2\Algorithmique\Projet
Dataset dir: C:\Users\ilyes\M2\Algorithmique\Projet\dataset_final_immo_idf_parquet
Hadoop home: C:\Users\ilyes\M2\Algorithmique\Projet\hadoop
Bin dir    : C:\Users\ilyes\M2\Algorithmique\Projet\hadoop\bin


# Windows Hadoop (process-only) + priorité DLL + détection “DLL parasite”

In [None]:
import subprocess

# Explicitly add the project's Hadoop bin folder to the DLL search path.
# os.add_dll_directory only exists on Python 3.8+, hence the hasattr guard.
if hasattr(os, "add_dll_directory"):
    os.add_dll_directory(str(bin_dir))

# Point both variables at the project-local Hadoop install.
hadoop_home_str = str(hadoop_home)
for env_key in ("HADOOP_HOME", "hadoop.home.dir"):
    os.environ[env_key] = hadoop_home_str

# Prepend (not append) so this bin folder is searched before any other PATH entry.
os.environ["PATH"] = os.pathsep.join((str(bin_dir), os.environ.get("PATH", "")))

print("HADOOP_HOME =", os.environ["HADOOP_HOME"])
print("PATH head   =", os.environ["PATH"].split(os.pathsep)[:3])

# Ask the OS which hadoop.dll files are reachable through PATH, in lookup order.
out = subprocess.run(["where", "hadoop.dll"], capture_output=True, text=True)
dlls = [line for line in map(str.strip, (out.stdout or "").splitlines()) if line]
print("where hadoop.dll ->")
if dlls:
    print("\n".join(dlls))
else:
    print(out.stderr or "Aucune sortie")

# The first hit must be the project's own DLL; anything else is a stray copy
# that would be picked up instead of it.
expected = str((bin_dir / "hadoop.dll").resolve())
if dlls:
    first = str(Path(dlls[0]).resolve())
    if first != expected:
        conflict_message = (
            "Conflit DLL: la 1ère hadoop.dll trouvée n'est PAS celle de votre projet.\n"
            f"Attendu en 1er: {expected}\n"
            f"Trouvé en 1er : {first}\n"
            "Solution: supprimez/renommez la hadoop.dll parasite (ou retirez son dossier du PATH), "
            "puis Kernel Restart et relancez depuis la cellule 0."
        )
        raise RuntimeError(conflict_message)
print("OK - pas de conflit DLL (ou votre DLL est prioritaire).")

HADOOP_HOME = C:\Users\ilyes\M2\Algorithmique\Projet\hadoop
PATH head   = ['C:\\Users\\ilyes\\M2\\Algorithmique\\Projet\\hadoop\\bin', 'A:\\Apps\\ide\\Anaconda', 'A:\\Apps\\ide\\Anaconda\\Library\\mingw-w64\\bin']
where hadoop.dll ->
C:\Users\ilyes\M2\Algorithmique\Projet\hadoop\bin\hadoop.dll
OK - pas de conflit DLL (ou votre DLL est prioritaire).


# Démarrage Spark

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("ImmoIDF-Apprentissage").getOrCreate()

# Query the JVM side for the Hadoop version and whether the native library was loaded.
hadoop_util = spark.sparkContext._jvm.org.apache.hadoop.util

print("Spark:", spark.version)
print("Hadoop (JVM):", hadoop_util.VersionInfo.getVersion())
print("Native loaded?:", hadoop_util.NativeCodeLoader.isNativeCodeLoaded())

Spark: 3.5.6
Hadoop (JVM): 3.3.4
Native loaded?: True


# FS Hadoop

In [None]:
# Smoke test: list the dataset folder through the Hadoop FileSystem API (JVM side).
hadoop_fs_pkg = spark._jvm.org.apache.hadoop.fs
hadoop_conf = spark._jsc.hadoopConfiguration()

fs = hadoop_fs_pkg.FileSystem.get(hadoop_conf)
p  = hadoop_fs_pkg.Path(str(dataset_dir))  # plain Windows path
st = fs.listStatus(p)
print("OK - listStatus. Nb entrées:", len(st))

OK - listStatus. Nb entrées: 2


# Lecture du dataset Parquet

In [None]:
from pyspark.sql import functions as F

# Load the Parquet dataset and make sure the target column "Prix" is a double.
df = (
    spark.read.parquet(str(dataset_dir))
    .withColumn("Prix", F.col("Prix").cast("double"))
)

row_count = df.count()
column_count = len(df.columns)
print("Nb lignes:", row_count)
print("Nb colonnes:", column_count)
df.show(5, truncate=False)

Nb lignes: 495
Nb colonnes: 16
+-------+---------+-----------+------+--------+-----+-----+-----+-----+-----+-----+----------+----------------+-----------+------------+-----------+
|Surface|NbrPieces|NbrChambres|NbrSdb|Prix    |DPE_B|DPE_C|DPE_D|DPE_E|DPE_F|DPE_G|DPE_Vierge|Type_Appartement|Type_Maison|latitude    |longitude  |
+-------+---------+-----------+------+--------+-----+-----+-----+-----+-----+-----+----------+----------------+-----------+------------+-----------+
|60.0   |3.0      |2.0        |1.0   |215000.0|0    |0    |0    |0    |0    |0    |1         |1               |0          |48.60739655 |2.623860846|
|70.0   |4.0      |2.0        |1.0   |180000.0|0    |0    |0    |0    |0    |0    |1         |1               |0          |48.657051798|2.38747617 |
|75.0   |2.0      |3.0        |1.0   |115000.0|0    |0    |0    |0    |0    |0    |1         |1               |0          |48.657051798|2.38747617 |
|70.0   |4.0      |2.0        |1.0   |120000.0|0    |0    |0    |0    |0   

# Features/Label (drop baselines + drop constantes)

In [None]:
label_col = "Prix"

# Columns dropped up front as baselines
# (presumably the reference levels of the one-hot Type_* / DPE_* groups — confirm).
drop_baselines = {"Type_Appartement", "DPE_Vierge"}
raw_feature_cols = [c for c in df.columns if c != label_col and c not in drop_baselines]

# Detect constant columns (min == max) in ONE aggregation pass instead of one
# Spark job per column. The result row holds every min first, then every max,
# in the same order as raw_feature_cols, so it is indexed by position.
const_cols = []
if raw_feature_cols:  # DataFrame.agg() rejects an empty expression list
    n_raw = len(raw_feature_cols)
    bounds = df.agg(
        *[F.min(c) for c in raw_feature_cols],
        *[F.max(c) for c in raw_feature_cols],
    ).first()
    const_cols = [
        c for i, c in enumerate(raw_feature_cols)
        if bounds[i] == bounds[n_raw + i]
    ]

feature_cols = [c for c in raw_feature_cols if c not in const_cols]

print("Baselines supprimées:", drop_baselines)
print("Constantes supprimées:", const_cols)
print("Nb features:", len(feature_cols))
print("Features:", feature_cols)

Baselines supprimées: {'Type_Appartement', 'DPE_Vierge'}
Constantes supprimées: []
Nb features: 13
Features: ['Surface', 'NbrPieces', 'NbrChambres', 'NbrSdb', 'DPE_B', 'DPE_C', 'DPE_D', 'DPE_E', 'DPE_F', 'DPE_G', 'Type_Maison', 'latitude', 'longitude']


# VectorAssembler + split

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the selected feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

assembled = assembler.transform(df)
data = assembled.select("features", F.col(label_col).alias("label"))

# 75 / 25 train-test split with a fixed seed; both parts are cached for reuse.
train_df, test_df = (part.cache() for part in data.randomSplit([0.75, 0.25], seed=49))

# Run an action on each part so the caches are actually populated.
for part in (train_df, test_df):
    _ = part.count()

print("Train:", train_df.count(), "Test:", test_df.count())

Train: 370 Test: 125


# Évaluateurs

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Two evaluators sharing the same label/prediction columns: R² and RMSE.
e_r2, e_rmse = (
    RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    for metric in ("r2", "rmse")
)

# Régression linéaire

In [9]:
from pyspark.ml.regression import LinearRegression

# Plain least-squares fit: no regularisation (regParam=0), L-BFGS solver.
lr_params = dict(
    featuresCol="features",
    labelCol="label",
    solver="l-bfgs",
    regParam=0.0,
    elasticNetParam=0.0,
    maxIter=200,
)
lr = LinearRegression(**lr_params)

lr_model = lr.fit(train_df)
pred_lr = lr_model.transform(test_df)

# Score the held-out predictions: R² first, then RMSE.
r2_lr, rmse_lr = (float(evaluator.evaluate(pred_lr)) for evaluator in (e_r2, e_rmse))

print("LR -> R2:", r2_lr, "RMSE:", rmse_lr)
pred_lr.select("label", "prediction").show(10, truncate=False)

LR -> R2: -0.03231324664734747 RMSE: 203569.18947120733
+---------+------------------+
|label    |prediction        |
+---------+------------------+
|140000.0 |379295.2311125845 |
|102000.0 |288502.311502818  |
|233000.0 |344821.1686208248 |
|244500.0 |401560.2589679044 |
|689000.0 |360841.200340122  |
|230000.0 |349521.0862640254 |
|635000.0 |379097.26784938015|
|560000.0 |381912.1592435427 |
|475000.0 |376110.46623755805|
|1600000.0|362770.11282882094|
+---------+------------------+
only showing top 10 rows



# Scaling + LR (MinMax et Standard)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml.regression import LinearRegression

def _fit_scaled_lr(scaler, scaled_col):
    """Fit a scaler -> LinearRegression pipeline on train_df and score it on test_df.

    Args:
        scaler: feature scaler stage reading "features" and writing `scaled_col`.
        scaled_col: name of the scaled vector column the regression reads.

    Returns:
        (regression, pipeline, fitted_pipeline, predictions, r2, rmse)
    """
    regression = LinearRegression(featuresCol=scaled_col, labelCol="label", solver="l-bfgs", regParam=0.0, maxIter=200)
    pipeline = Pipeline(stages=[scaler, regression])
    fitted = pipeline.fit(train_df)
    predictions = fitted.transform(test_df)
    r2 = float(e_r2.evaluate(predictions))
    rmse = float(e_rmse.evaluate(predictions))
    return regression, pipeline, fitted, predictions, r2, rmse


# Min-max scaling + linear regression
minmax = MinMaxScaler(inputCol="features", outputCol="features_minmax")
lr_mm, pipe_mm, m_mm, pred_mm, r2_mm, rmse_mm = _fit_scaled_lr(minmax, "features_minmax")
print("MinMax+LR -> R2:", r2_mm, "RMSE:", rmse_mm)

# Standard scaling (unit variance, no centring) + linear regression
std = StandardScaler(inputCol="features", outputCol="features_std", withStd=True, withMean=False)
lr_std, pipe_std, m_std, pred_std, r2_std, rmse_std = _fit_scaled_lr(std, "features_std")
print("Std+LR -> R2:", r2_std, "RMSE:", rmse_std)

MinMax+LR -> R2: -0.03231324660475332 RMSE: 203569.1894670076
Std+LR -> R2: -0.03231324657738632 RMSE: 203569.18946430925


# K plus proches voisins

In [None]:
import math
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import MinMaxScaler, StandardScaler

# Euclidean distance
def euclidean_py(a, b):
    """Return the Euclidean distance between two coordinate sequences.

    Args:
        a, b: iterables of numbers, or None.

    Returns:
        The distance as a float, or None when either argument is None.
        Coordinates are paired with zip(), so any surplus coordinates of the
        longer input are ignored.
    """
    if a is not None and b is not None:
        squared_gaps = [(x - y) ** 2 for x, y in zip(a, b)]
        return float(math.sqrt(sum(squared_gaps)))
    return None

# Spark SQL wrapper around euclidean_py; the declared return type is DoubleType.
euclidean_udf = F.udf(f=euclidean_py, returnType=DoubleType())

def scale_train_test(train_df, test_df, method: str):
    """Scale the "features" column of both splits with a scaler fitted on train only.

    Args:
        train_df: training DataFrame holding a "features" vector column.
        test_df: test DataFrame holding a "features" vector column.
        method: one of "none", "minmax", "std".

    Returns:
        (train_scaled, test_scaled, feat_col), where feat_col is the name of the
        column to read features from: "features" for "none" (inputs returned
        untouched), otherwise "features_scaled".

    Raises:
        ValueError: if `method` is not one of the three supported values.
    """
    scaled_col = "features_scaled"

    if method == "none":
        return train_df, test_df, "features"

    if method == "minmax":
        scaler = MinMaxScaler(inputCol="features", outputCol=scaled_col)
    elif method == "std":
        scaler = StandardScaler(inputCol="features", outputCol=scaled_col, withStd=True, withMean=False)
    else:
        raise ValueError("method doit être: none|minmax|std")

    # Fit on the training split only, then apply the same fitted model to both splits.
    fitted = scaler.fit(train_df)
    return fitted.transform(train_df), fitted.transform(test_df), scaled_col

def knn_predict_exact(train_df, test_df, k: int, feat_col: str):
    """Exact (brute-force) KNN regression: mean label of the k nearest training rows.

    Every test row is paired with every training row through a cross join, the
    Euclidean distance is computed with `euclidean_udf`, and the k closest
    training labels are averaged.

    Args:
        train_df: training DataFrame with `feat_col` (vector) and "label".
        test_df: test DataFrame with `feat_col` (vector) and "label".
        k: number of neighbours to average; must be >= 1.
        feat_col: name of the vector column used to compute distances.

    Returns:
        DataFrame with columns "label" (true test label) and "prediction".

    Raises:
        ValueError: if k < 1 (no neighbour could be selected).
    """
    if k < 1:
        raise ValueError("k doit être un entier >= 1")

    # Labels are renamed on each side so the cross join has no ambiguous column.
    tr = (train_df
          .withColumn("id_train", F.monotonically_increasing_id())
          .withColumn("fa", vector_to_array(F.col(feat_col)))
          .select("id_train", "fa", F.col("label").alias("label_train")))

    te = (test_df
          .withColumn("id_test", F.monotonically_increasing_id())
          .withColumn("fb", vector_to_array(F.col(feat_col)))
          .select("id_test", "fb", F.col("label").alias("label_test")))

    # All (test, train) pairs: n_test x n_train rows.
    pairs = (te.crossJoin(tr)
             .withColumn("dist", euclidean_udf(F.col("fb"), F.col("fa")))
             .select("id_test", "label_test", "id_train", "label_train", "dist"))

    # Rank neighbours per test row.
    #  - NULL distances sort last: Spark's default ascending order puts NULLs
    #    first, which would rank them as the nearest neighbours.
    #  - id_train breaks distance ties, so the selected neighbours no longer
    #    depend on the arbitrary order in which tied rows arrive.
    w = Window.partitionBy("id_test").orderBy(
        F.col("dist").asc_nulls_last(),
        F.col("id_train").asc(),
    )
    knn = pairs.withColumn("rn", F.row_number().over(w)).filter(F.col("rn") <= k)

    # label_test is constant within an id_test group, so it is carried through
    # the grouping keys. This avoids re-joining `te`, which would rely on
    # monotonically_increasing_id producing the same ids on a second evaluation.
    out = (knn.groupBy("id_test", "label_test")
           .agg(F.avg("label_train").alias("prediction"))
           .select(F.col("label_test").alias("label"), "prediction"))

    return out

def eval_knn(train_df, test_df, k: int, scaling: str):
    """Scale both splits, run exact KNN regression and score the predictions.

    Args:
        train_df: training DataFrame ("features", "label").
        test_df: test DataFrame ("features", "label").
        k: number of neighbours passed to knn_predict_exact.
        scaling: scaling method passed to scale_train_test.

    Returns:
        (r2, rmse) as Python floats, computed with the e_r2 / e_rmse evaluators.
    """
    scaled_train, scaled_test, feature_column = scale_train_test(train_df, test_df, scaling)
    predictions = knn_predict_exact(scaled_train, scaled_test, k=k, feat_col=feature_column)
    return float(e_r2.evaluate(predictions)), float(e_rmse.evaluate(predictions))

k = 4

# Evaluate KNN once per scaling variant, in this fixed order.
knn_scores = {
    scaling: eval_knn(train_df, test_df, k=k, scaling=scaling)
    for scaling in ("none", "minmax", "std")
}

# Expose the individual metrics under the names the comparison cell reads.
r2_knn_none, rmse_knn_none = knn_scores["none"]
r2_knn_mm,   rmse_knn_mm   = knn_scores["minmax"]
r2_knn_std,  rmse_knn_std  = knn_scores["std"]

# The scaling name is left-padded to 7 characters so the colons line up.
for scaling, (r2_value, rmse_value) in knn_scores.items():
    print(f"KNN k={k} -> {scaling:<7}: R2={r2_value:.4f} RMSE={rmse_value:.2f}")

KNN k=4 -> none   : R2=-0.3507 RMSE=232855.87
KNN k=4 -> minmax : R2=-0.6041 RMSE=253762.54
KNN k=4 -> std    : R2=-0.2331 RMSE=222488.75


# Comparatif final (LR vs KNN)

In [None]:
# One (model name, R2, RMSE) tuple per model, starting with the plain regression.
rows = [("LR", r2_lr, rmse_lr)]

# The scaled-LR rows are added only if the scaling cell has been run in this kernel.
if "r2_mm" in globals():
    rows.append(("MinMax+LR", r2_mm, rmse_mm))
if "r2_std" in globals():
    rows.append(("Std+LR", r2_std, rmse_std))

# KNN variants
rows.extend([
    ("KNN", r2_knn_none, rmse_knn_none),
    ("MinMax+KNN", r2_knn_mm, rmse_knn_mm),
    ("Std+KNN", r2_knn_std, rmse_knn_std),
])

# Best model (lowest RMSE) first.
results = spark.createDataFrame(rows, schema=["Modele", "R2", "RMSE"]).orderBy("RMSE")
results.show(truncate=False)

+----------+--------------------+------------------+
|Modele    |R2                  |RMSE              |
+----------+--------------------+------------------+
|Std+LR    |-0.03231324657738632|203569.18946430925|
|MinMax+LR |-0.03231324660475332|203569.1894670076 |
|LR        |-0.03231324664734747|203569.18947120733|
|Std+KNN   |-0.23311480247608518|222488.74838266318|
|KNN       |-0.3507089791675315 |232855.87021257592|
|MinMax+KNN|-0.6041405396270276 |253762.53524702974|
+----------+--------------------+------------------+

