In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import preprocessing  # ton fichier local contenant la fonction preprocessing_pipline

# 1️⃣ Load the raw dataset
data = pd.read_csv("../data/car_price_prediction.csv")

# 2️⃣ Project-specific preprocessing, implemented in the local `preprocessing` module
data = preprocessing.preprocessing_pipline(data)

# 3️⃣ One-hot encoding (intended for categorical columns with few unique values).
# Only columns still present after preprocessing are encoded, so a dropped
# column is skipped instead of raising a KeyError.
one_hot_columns = ["Leather interior", "Gear box type", "Drive wheels", "Wheel"]
existing_one_hot_cols = [col for col in one_hot_columns if col in data.columns]
data = pd.get_dummies(data, columns=existing_one_hot_cols)

# 4️⃣ Integer (label) encoding for the remaining categorical columns.
# NOTE(review): integer codes give these nominal features an arbitrary order
# that a linear model treats as a numeric magnitude; a different encoding may
# fit better, but switching would change the reported metrics, so it is left
# as is here.
label_encode_columns = ["Manufacturer", "Model", "Category", "Fuel type", "Color"]
existing_label_cols = [col for col in label_encode_columns if col in data.columns]

# One fitted encoder is kept per column so the category <-> code mapping can be
# inverted or reapplied later; re-fitting a single shared encoder would keep
# only the mapping of the last column.
label_encoders = {}
label_encoder = LabelEncoder()  # keeps the name defined even when no column matches
for column in existing_label_cols:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# 5️⃣ Separate the features from the target
X = data.drop("Price", axis=1)
y = data["Price"]

# 6️⃣ Split into train (70%), validation (15%) and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# 7️⃣ Standardise the numerical columns
numerical_columns = ["Levy", "Engine volume", "Mileage"]
existing_num_cols = [col for col in numerical_columns if col in X.columns]

# Take explicit copies before writing into the splits: the frames returned by
# train_test_split are derived from X, and assigning columns into them can
# otherwise raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Skip scaling when none of the expected columns exist; fitting a scaler on a
# zero-column frame would raise.
if existing_num_cols:
    # Fit on the training split only, then reuse those statistics for the
    # validation and test splits so no information leaks from them.
    X_train[existing_num_cols] = scaler.fit_transform(X_train[existing_num_cols])
    X_val[existing_num_cols] = scaler.transform(X_val[existing_num_cols])
    X_test[existing_num_cols] = scaler.transform(X_test[existing_num_cols])

# 8️⃣ Train the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# 9️⃣ Evaluate on the validation split (the test split is not scored in this cell)
y_val_pred = lr.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_val_pred)

print("\n✅ Évaluation du modèle :")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Preprocessing started.....
Initial shape : (19237, 18)
After dropping duplicates: (18924, 18)
Replacing categorical values.....
Cleaning 'Levy' column...
Cleaning outliers.....
After cleaning outliers: (16037, 18)
Feature engineering.....
Column transformations...
Dropping columns...
✅ Preprocessing completed successfully!
Final shape: (16037, 20)
Train set: 11225 samples
Validation set: 2406 samples
Test set: 2406 samples

✅ Évaluation du modèle :
Root Mean Squared Error (RMSE): 9622.769396834221
R² Score: 0.2519231582465985
