In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


### Load the cleaned Data
Nous chargeons le jeu de données préalablement nettoyé pour commencer la phase de transformation.


In [2]:
df =  pd.read_csv('Carvago_cleaned.csv')

In [3]:
df.head()


Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Make,Body_color,Interior_color,Interior_material,Body,Doors,Seats,CO2_emissions,Engine_capacity,Emission_class,price,Consumption_Standardized,Year
0,58728,288.0,Automatic,Hybrid,4x4,BMW,Blue,Black interior,Part leather interior,SUV / offroad,4/5 doors,5,,1998.0,Euro 6d,39399,,2021
1,28000,201.0,Automatic,Electric,4x2,Volkswagen,White,Grey interior,Cloth interior,Hatchback,4/5 doors,5,,,Euro 6,32999,,2023
2,85023,134.0,Automatic,Electric,4x2,Opel,Grey,Black interior,Cloth interior,Hatchback,4/5 doors,5,,,Euro 6,15849,10.08,2020
3,67514,259.0,Automatic,Diesel,4x4,Volkswagen,Black,Beige interior,Full leather interior,SUV / offroad,4/5 doors,5,180.0,2967.0,Euro 6,38749,6.9,2017
4,3850,225.0,Automatic,Electric,4x4,Mercedes-Benz,Black,Black interior,Part leather interior,SUV / offroad,4/5 doors,5,,,Euro 6,37099,10.92,2023


In [4]:
df.describe()

Unnamed: 0,Mileage,Power,CO2_emissions,Engine_capacity,price,Consumption_Standardized,Year
count,21818.0,21818.0,18805.0,20270.0,21818.0,18663.0,21818.0
mean,43027.886562,177.073792,131.319224,1717.921411,36220.18,5.867935,2021.535659
std,41305.833333,99.584079,43.439249,634.612907,36259.9,2.060837,2.386371
min,0.0,8.0,5.0,1.0,2899.0,0.2,2011.0
25%,8532.75,114.0,109.0,1332.0,20599.0,4.8,2020.0
50%,30508.5,148.0,125.0,1580.0,28549.0,5.5,2022.0
75%,68698.75,201.0,149.0,1984.0,40786.5,6.4,2024.0
max,175000.0,1002.0,499.0,6592.0,2454249.0,30.7,2024.0


In [5]:
print(df.dtypes)

Mileage                       int64
Power                       float64
Transmission                 object
Fuel                         object
Drive_type                   object
Make                         object
Body_color                   object
Interior_color               object
Interior_material            object
Body                         object
Doors                        object
Seats                        object
CO2_emissions               float64
Engine_capacity             float64
Emission_class               object
price                         int64
Consumption_Standardized    float64
Year                          int64
dtype: object


# Data Transformation
Cette section détaille les transformations appliquées au jeu de données, y compris le mapping des colonnes, les encodages, et les suppressions de données rares.


### Mapping des colonnes du DataFrame
Nous utilisons la méthode `map` pour convertir des chaînes de caractères en valeurs numériques dans les colonnes `Transmission`, `Fuel` et `Drive_type` du DataFrame.

Par exemple, la colonne `Transmission` peut être mappée comme suit :
- `Manual` → 0
- `Automatic` → 1


In [6]:
df['Transmission'] = df['Transmission'].map({'Automatic': 0, 'Manual': 1})

In [7]:
df['Fuel'] = df['Fuel'].map({'Diesel': 0, 'Petrol': 1, 'Electric': 3, 'Hybrid': 2})

In [8]:
df['Drive_type'] = df['Drive_type'].map({'4x2': 0, '4x4': 1})


In [9]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Make,Body_color,Interior_color,Interior_material,Body,Doors,Seats,CO2_emissions,Engine_capacity,Emission_class,price,Consumption_Standardized,Year
0,58728,288.0,0,2,1,BMW,Blue,Black interior,Part leather interior,SUV / offroad,4/5 doors,5,,1998.0,Euro 6d,39399,,2021
1,28000,201.0,0,3,0,Volkswagen,White,Grey interior,Cloth interior,Hatchback,4/5 doors,5,,,Euro 6,32999,,2023
2,85023,134.0,0,3,0,Opel,Grey,Black interior,Cloth interior,Hatchback,4/5 doors,5,,,Euro 6,15849,10.08,2020
3,67514,259.0,0,0,1,Volkswagen,Black,Beige interior,Full leather interior,SUV / offroad,4/5 doors,5,180.0,2967.0,Euro 6,38749,6.9,2017
4,3850,225.0,0,3,1,Mercedes-Benz,Black,Black interior,Part leather interior,SUV / offroad,4/5 doors,5,,,Euro 6,37099,10.92,2023


### Création d'un encodage one-hot et conversion en entier
Nous utilisons la fonction `pd.get_dummies` pour effectuer un encodage one-hot de toutes les colonnes catégoriques. Ensuite, les colonnes résultantes sont converties en entiers pour faciliter l'analyse.


In [10]:
print(df['Make'].unique())  # Vérifie les types de carburants restants


['BMW' 'Volkswagen' 'Opel' 'Mercedes-Benz' 'Peugeot' 'Ford' 'Toyota'
 'Audi' 'Volvo' 'Kia' 'Fiat' 'Jeep' 'Škoda' 'DS Automobiles' 'Nissan'
 'Hyundai' 'Renault' 'Mazda' 'Porsche' 'Dacia' 'Lamborghini' 'Honda'
 'Seat' 'Cupra' 'Subaru' 'Smart' 'Land Rover' 'Alfa Romeo' 'Citroën'
 'Aston Martin' 'MINI' 'Mitsubishi' 'Suzuki' 'MG' 'Tesla' 'Chevrolet'
 'Jaguar' 'Polestar' 'Abarth' 'Lancia' 'Lynk & Co' 'Ferrari' 'Lexus'
 'Bentley' 'SsangYong' 'Maserati' 'DR Automobiles']


In [11]:
# Create one-hot encoding and convert to int
make_dummies = pd.get_dummies(df['Make']).astype(int)

# Add dummy columns to original dataframe
df = pd.concat([df, make_dummies], axis=1)

# Verify conversion to 1/0
print("Sample of numeric encoding (1/0):")
print(make_dummies.head())

# Check unique values to confirm only 1 and 0 exist
print("\nUnique values in encoded columns:")
print(make_dummies.nunique().unique())

# Drop original Make column
df.drop('Make', axis=1, inplace=True)

# Verify final shape
print(f"\nFinal dataframe shape: {df.shape}")


Sample of numeric encoding (1/0):
   Abarth  Alfa Romeo  Aston Martin  Audi  BMW  Bentley  Chevrolet  Citroën  \
0       0           0             0     0    1        0          0        0   
1       0           0             0     0    0        0          0        0   
2       0           0             0     0    0        0          0        0   
3       0           0             0     0    0        0          0        0   
4       0           0             0     0    0        0          0        0   

   Cupra  DR Automobiles  ...  Seat  Smart  SsangYong  Subaru  Suzuki  Tesla  \
0      0               0  ...     0      0          0       0       0      0   
1      0               0  ...     0      0          0       0       0      0   
2      0               0  ...     0      0          0       0       0      0   
3      0               0  ...     0      0          0       0       0      0   
4      0               0  ...     0      0          0       0       0      0   

   Toyota 

In [12]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Body_color,Interior_color,Interior_material,Body,Doors,...,Seat,Smart,SsangYong,Subaru,Suzuki,Tesla,Toyota,Volkswagen,Volvo,Škoda
0,58728,288.0,0,2,1,Blue,Black interior,Part leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,0,0,0
1,28000,201.0,0,3,0,White,Grey interior,Cloth interior,Hatchback,4/5 doors,...,0,0,0,0,0,0,0,1,0,0
2,85023,134.0,0,3,0,Grey,Black interior,Cloth interior,Hatchback,4/5 doors,...,0,0,0,0,0,0,0,0,0,0
3,67514,259.0,0,0,1,Black,Beige interior,Full leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,1,0,0
4,3850,225.0,0,3,1,Black,Black interior,Part leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,0,0,0


In [13]:
print(df['Body_color'].unique())  

['Blue' 'White' 'Grey' 'Black' 'Silver' 'Brown' 'Red' 'Orange' 'Green'
 'Yellow' 'Purple' 'Beige' 'Gold']


In [14]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Body_color,Interior_color,Interior_material,Body,Doors,...,Seat,Smart,SsangYong,Subaru,Suzuki,Tesla,Toyota,Volkswagen,Volvo,Škoda
0,58728,288.0,0,2,1,Blue,Black interior,Part leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,0,0,0
1,28000,201.0,0,3,0,White,Grey interior,Cloth interior,Hatchback,4/5 doors,...,0,0,0,0,0,0,0,1,0,0
2,85023,134.0,0,3,0,Grey,Black interior,Cloth interior,Hatchback,4/5 doors,...,0,0,0,0,0,0,0,0,0,0
3,67514,259.0,0,0,1,Black,Beige interior,Full leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,1,0,0
4,3850,225.0,0,3,1,Black,Black interior,Part leather interior,SUV / offroad,4/5 doors,...,0,0,0,0,0,0,0,0,0,0


In [15]:
print(df['Interior_color'].unique())  

['Black interior' 'Grey interior' 'Beige interior' 'Other interior color'
 'Brown interior']


In [16]:
print(df['Interior_material'].unique())  

['Part leather interior' 'Cloth interior' 'Full leather interior'
 'Other interior material' 'Alcantara interior' 'Velour interior']


In [17]:
# Using pandas get_dummies() instead of OneHotEncoder
columns_to_encode = ['Body_color', 'Interior_color', 'Interior_material', 'Body']

# Create dummy variables for all columns at once
encoded_df = pd.get_dummies(
    df[columns_to_encode], 
    prefix=columns_to_encode,
    drop_first=True  # Drop first category to avoid multicollinearity
)

# Drop original columns and concatenate encoded ones
df = df.drop(columns=columns_to_encode)
df = pd.concat([df, encoded_df], axis=1)

# Verify new columns
print("New encoded columns:")
print([col for col in df.columns if any(x in col for x in columns_to_encode)])

New encoded columns:
['Body_color_Black', 'Body_color_Blue', 'Body_color_Brown', 'Body_color_Gold', 'Body_color_Green', 'Body_color_Grey', 'Body_color_Orange', 'Body_color_Purple', 'Body_color_Red', 'Body_color_Silver', 'Body_color_White', 'Body_color_Yellow', 'Interior_color_Black interior', 'Interior_color_Brown interior', 'Interior_color_Grey interior', 'Interior_color_Other interior color', 'Interior_material_Cloth interior', 'Interior_material_Full leather interior', 'Interior_material_Other interior material', 'Interior_material_Part leather interior', 'Interior_material_Velour interior', 'Body_Cargo VAN', 'Body_Coupe', 'Body_Hatchback', 'Body_MPV', 'Body_MPV/VAN', 'Body_Pick-up', 'Body_SUV / offroad', 'Body_Sedans / saloons', 'Body_Station Wagon']


In [18]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Doors,Seats,CO2_emissions,Engine_capacity,Emission_class,...,Interior_material_Velour interior,Body_Cargo VAN,Body_Coupe,Body_Hatchback,Body_MPV,Body_MPV/VAN,Body_Pick-up,Body_SUV / offroad,Body_Sedans / saloons,Body_Station Wagon
0,58728,288.0,0,2,1,4/5 doors,5,,1998.0,Euro 6d,...,False,False,False,False,False,False,False,True,False,False
1,28000,201.0,0,3,0,4/5 doors,5,,,Euro 6,...,False,False,False,True,False,False,False,False,False,False
2,85023,134.0,0,3,0,4/5 doors,5,,,Euro 6,...,False,False,False,True,False,False,False,False,False,False
3,67514,259.0,0,0,1,4/5 doors,5,180.0,2967.0,Euro 6,...,False,False,False,False,False,False,False,True,False,False
4,3850,225.0,0,3,1,4/5 doors,5,,,Euro 6,...,False,False,False,False,False,False,False,True,False,False


In [19]:
print(df['Doors'].unique()) 

['4/5 doors' '2/3 doors' '6/7 doors']


### Suppression des voitures avec 6 ou 7 portes
Les voitures ayant 6 ou 7 portes sont très rares dans le jeu de données, ce qui pourrait introduire du bruit dans l'analyse. Ces lignes sont donc supprimées pour préserver la qualité des résultats.


In [20]:
# Show initial distribution
print("Initial doors distribution:")
print(df['Doors'].value_counts())

# Remove rows with 6 or 7 doors
df = df[~df['Doors'].isin(['6/7 doors'])]

# Show final distribution
print("\nFinal doors distribution:")
print(df['Doors'].value_counts())

# Show total rows affected
print("\nTotal rows in dataset:")
print(f"Before: {len(df)}")
print(f"After: {len(df[~df['Doors'].isin(['6/7 doors'])])}")

Initial doors distribution:
Doors
4/5 doors    20389
2/3 doors     1423
6/7 doors        6
Name: count, dtype: int64

Final doors distribution:
Doors
4/5 doors    20389
2/3 doors     1423
Name: count, dtype: int64

Total rows in dataset:
Before: 21812
After: 21812


In [21]:
df['Doors'] = df['Doors'].map({'4/5 doors': 1, '2/3 doors': 0})

In [22]:
# Show initial distribution
print("Initial Emission_class distribution:")
print(df['Emission_class'].value_counts())

# Remove specified emission classes
df = df[~df['Emission_class'].isin(['Euro 1', 'Euro 4', 'Euro 6c'])]

# Show final distribution
print("\nFinal Emission_class distribution:")
print(df['Emission_class'].value_counts())

# Show total rows affected
print("\nTotal rows in dataset:")
print(f"Total rows removed: {len(df[df['Emission_class'].isin(['Euro 1', 'Euro 4', 'Euro 6c'])])}")
print(f"Remaining rows: {len(df)}")

Initial Emission_class distribution:
Emission_class
Euro 6          8945
Euro 6d         7980
Euro 6d-TEMP    4326
Euro 5           497
Euro 6c           50
Euro 4            12
Euro 1             2
Name: count, dtype: int64

Final Emission_class distribution:
Emission_class
Euro 6          8945
Euro 6d         7980
Euro 6d-TEMP    4326
Euro 5           497
Name: count, dtype: int64

Total rows in dataset:
Total rows removed: 0
Remaining rows: 21748


### Suppression des classes d'émission rares
Nous avons supprimé les voitures avec des classes d'émission Euro 1, Euro 4, et Euro 6c, car elles sont très rares dans le jeu de données et pourraient biaiser les résultats.


In [23]:
df['Emission_class'] = df['Emission_class'].map({'Euro 6': 1, 'Euro 6d-TEMP':  2, 'Euro 6d': 3, 'Euro 5': 0})

In [24]:
# Show initial distribution
print("Initial seats distribution:")
print(df['Seats'].value_counts())

# Remove specified seat configurations
seats_to_remove = ['1', '12+', '9+', '8+']
df = df[~df['Seats'].isin(seats_to_remove)]

# Show final distribution
print("\nFinal seats distribution:")
print(df['Seats'].value_counts())

# Show impact on dataset
print("\nDataset size:")
print(f"Before: {len(df)}")
print(f"After: {len(df[~df['Seats'].isin(seats_to_remove)])}")

Initial seats distribution:
Seats
5      18482
4       1422
7        639
3        482
2        429
6        108
8+       102
9+        80
12+        2
1          2
Name: count, dtype: int64

Final seats distribution:
Seats
5    18482
4     1422
7      639
3      482
2      429
6      108
Name: count, dtype: int64

Dataset size:
Before: 21562
After: 21562


### Suppression des voitures avec des configurations de sièges rares
Les voitures avec des configurations de sièges atypiques ou très peu fréquentes dans le jeu de données ont été supprimées pour éviter des biais dans l'analyse.


In [312]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Doors,Seats,CO2_emissions,Engine_capacity,Emission_class,...,Interior_material_Velour interior,Body_Cargo VAN,Body_Coupe,Body_Hatchback,Body_MPV,Body_MPV/VAN,Body_Pick-up,Body_SUV / offroad,Body_Sedans / saloons,Body_Station Wagon
0,58728,288.0,0,2,1,1,5,30.5,1998.0,3,...,False,False,False,False,False,False,False,True,False,False
1,28000,201.0,0,3,0,1,5,149.0,1498.0,1,...,False,False,False,True,False,False,False,False,False,False
2,85023,134.0,0,3,0,1,5,149.0,1199.0,1,...,False,False,False,True,False,False,False,False,False,False
4,67514,259.0,0,0,1,1,5,180.0,2967.0,1,...,False,False,False,False,False,False,False,True,False,False
5,3850,225.0,0,3,1,1,5,149.0,1991.0,1,...,False,False,False,False,False,False,False,True,False,False


In [313]:
df.dtypes

Mileage                    int64
Power                    float64
Transmission               int64
Fuel                       int64
Drive_type                 int64
                          ...   
Body_MPV/VAN                bool
Body_Pick-up                bool
Body_SUV / offroad          bool
Body_Sedans / saloons       bool
Body_Station Wagon          bool
Length: 90, dtype: object

In [314]:
df.duplicated().sum()

202

In [315]:
df.drop_duplicates(inplace=True)

In [316]:
len(df)

21360

In [320]:
df = df.replace({True: 1, False: 0})
print(df.head())  


   Mileage  Power  Transmission  Fuel  Drive_type  Doors Seats  CO2_emissions  \
0    58728  288.0             0     2           1      1     5           30.5   
1    28000  201.0             0     3           0      1     5          149.0   
2    85023  134.0             0     3           0      1     5          149.0   
4    67514  259.0             0     0           1      1     5          180.0   
5     3850  225.0             0     3           1      1     5          149.0   

   Engine_capacity  Emission_class  ...  Interior_material_Velour interior  \
0           1998.0               3  ...                                  0   
1           1498.0               1  ...                                  0   
2           1199.0               1  ...                                  0   
4           2967.0               1  ...                                  0   
5           1991.0               1  ...                                  0   

   Body_Cargo VAN  Body_Coupe  Body_Hatchbac

In [321]:
df.to_csv('Carvago_encoded.csv', index=False)

In [322]:
df.head()

Unnamed: 0,Mileage,Power,Transmission,Fuel,Drive_type,Doors,Seats,CO2_emissions,Engine_capacity,Emission_class,...,Interior_material_Velour interior,Body_Cargo VAN,Body_Coupe,Body_Hatchback,Body_MPV,Body_MPV/VAN,Body_Pick-up,Body_SUV / offroad,Body_Sedans / saloons,Body_Station Wagon
0,58728,288.0,0,2,1,1,5,30.5,1998.0,3,...,0,0,0,0,0,0,0,1,0,0
1,28000,201.0,0,3,0,1,5,149.0,1498.0,1,...,0,0,0,1,0,0,0,0,0,0
2,85023,134.0,0,3,0,1,5,149.0,1199.0,1,...,0,0,0,1,0,0,0,0,0,0
4,67514,259.0,0,0,1,1,5,180.0,2967.0,1,...,0,0,0,0,0,0,0,1,0,0
5,3850,225.0,0,3,1,1,5,149.0,1991.0,1,...,0,0,0,0,0,0,0,1,0,0
