# Pricing Analysis

In [186]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
import xgboost as xgb

## Load data

In [187]:
# Read the pricing CSV into a DataFrame and preview its first rows.
csv_path = '../dashboard/src/data/get_around_pricing_project.csv'
rawdata = pd.read_csv(csv_path)
rawdata.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


## Basic stats

In [188]:
# Basic stats: dataset size, column dtypes, summary statistics,
# per-column cardinality and share of missing values.
print("Taille du dataset:")
print("Number of rows : {}".format(rawdata.shape[0]))
print("Number of columns : {}".format(rawdata.shape[1]))
print()
print("---------------------------")
print()

print("Basics infos:")
print()
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): that only renders a stray "None" under the report.
rawdata.info()
print()
print("---------------------------")
print()

print("Basics statistics: ")
print()
data_desc = rawdata.describe(include='all')
display(data_desc)
print()
print("---------------------------")
print()

print("Unique elements by feature: ")
print()
display(rawdata.nunique().sort_values())
print()
print("---------------------------")
print()

print("Percentage of missing values: ")
print()
display(100 * rawdata.isnull().sum() / rawdata.shape[0])

Taille du dataset:
Number of rows : 4843
Number of columns : 15

---------------------------

Basics infos:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed

None


---------------------------

Basics statistics: 



Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0



---------------------------

Unique elements by feature: 



private_parking_available       2
has_gps                         2
has_air_conditioning            2
automatic_car                   2
has_getaround_connect           2
has_speed_regulator             2
winter_tires                    2
fuel                            4
car_type                        8
paint_color                    10
model_key                      28
engine_power                   61
rental_price_per_day          220
mileage                      4786
Unnamed: 0                   4843
dtype: int64


---------------------------

Percentage of missing values: 



Unnamed: 0                   0.0
model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

## Exploratory data analysis (EDA)

In [189]:
# "Unnamed: 0" is just a 0..N-1 row counter (one distinct value per row),
# so it is removed before any analysis.
dataset = rawdata.drop("Unnamed: 0", axis="columns")
dataset.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [190]:
# Bar chart of the number of cars per car type, with each type's share of
# the dataset written above its bar.
car_type_counts = dataset['car_type'].value_counts()
car_type_percentages = (car_type_counts / len(dataset) * 100).round(2)

fig = go.Figure()

fig.add_trace(go.Bar(
    name='Nombre de voitures',
    x=car_type_counts.index,
    y=car_type_counts.values,
    text=[f'{val:,}' for val in car_type_counts.values],
    textposition='auto',
    marker_color='royalblue'
))

# Layout: centred title, axis labels and a "total" note above the plot area
fig.update_layout(
    title={
        'text': 'Distribution des types de voitures',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title="Type de voiture",
    yaxis_title="Nombre de voitures",
    width=1000,
    height=600,
    showlegend=True,
    xaxis_tickangle=-45,
    annotations=[
        dict(
            text=f"Total de voitures: {len(dataset):,}",
            showarrow=False,
            x=0.5,
            y=1.1,
            xref='paper',
            yref='paper'
        )
    ]
)

# Percentage label above each bar. The share is formatted straight from the
# raw ratio: formatting the already 2-decimal-rounded value to 1 decimal would
# round twice and can move the label by 0.1.
total_cars = len(dataset)
for car_type, count in car_type_counts.items():
    fig.add_annotation(
        x=car_type,
        y=count,
        text=f'{count / total_cars * 100:.1f}%',
        yshift=10,
        showarrow=False
    )

# Render the chart
fig.show()


In [191]:
# Bar chart of the number of cars per brand (model_key).
model_car_counts = dataset['model_key'].value_counts()
model_car_percentages = (model_car_counts / len(dataset) * 100).round(2)

# One bar per brand, labelled with its thousands-separated count
brand_bars = go.Bar(
    name='Nombre de voitures',
    x=model_car_counts.index,
    y=model_car_counts.values,
    text=[format(val, ',') for val in model_car_counts.values],
    textposition='auto',
    marker_color='royalblue',
)

# Note shown above the plot area with the dataset size
total_note = dict(
    text=f"Total de voitures: {len(dataset):,}",
    showarrow=False,
    x=0.5,
    y=1.1,
    xref='paper',
    yref='paper',
)

fig = go.Figure(data=[brand_bars])
fig.update_layout(
    title=dict(
        text='Distribution des marques de voitures',
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Marque de voiture",
    yaxis_title="Nombre de voitures",
    width=1000,
    height=600,
    showlegend=True,
    xaxis_tickangle=-45,
    annotations=[total_note],
)

# Render the chart
fig.show()


In [192]:
# Correlation matrix of the numeric / boolean features and the rental price
corr_dataset = dataset[[
    'mileage', 'engine_power', 'private_parking_available', 'has_gps',
    'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
    'has_speed_regulator', 'winter_tires', 'rental_price_per_day',
]]

correlation_matrix = corr_dataset.corr()


# Build the heatmap
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    zmin=-1,  # Lower bound of the colour scale
    zmax=1,   # Upper bound of the colour scale
    text=correlation_matrix.round(2),  # Values written inside the cells
    texttemplate='%{text:.2f}',       # Number format of the cell values
    textfont={"size": 10},            # Font size of the cell values
    hoverongaps=False,                # No hover on empty cells
    colorscale='RdBu',                # Red-blue colour scale
    colorbar=dict(
        # The flat `titleside` attribute is deprecated and rejected by recent
        # Plotly releases; the nested title.side form works on old and new ones.
        title=dict(text='Correlation', side='right'),
        thickness=15
    )
))

# Layout
fig.update_layout(
    title='Matrice de Corrélation Interactive',
    width=900,                        # Figure width
    height=800,                       # Figure height
    xaxis=dict(
        tickangle=45,                 # Rotate the x-axis labels
        side='bottom'
    ),
    yaxis=dict(
        autorange='reversed'          # Flip the y axis to match seaborn's heatmap layout
    )
)

# Render the figure
fig.show()

## Training

In [193]:
# Collapse rarely seen car types into a single 'other' bucket:
# a type is kept only when it covers more than 5% of the rows.
car_type_counts = (dataset['car_type'].value_counts(normalize=True) * 100).round(2)
car_type_mask = car_type_counts.gt(5.0)
print(car_type_mask)

# Keep the label where the type is frequent enough, otherwise write 'other'
dataset['car_type'] = dataset['car_type'].where(
    dataset['car_type'].isin(car_type_mask[car_type_mask].index),
    'other',
)
print(dataset['car_type'].value_counts())


car_type
estate          True
sedan           True
suv             True
hatchback       True
subcompact     False
coupe          False
convertible    False
van            False
Name: proportion, dtype: bool
car_type
estate       1606
sedan        1168
suv          1058
hatchback     699
other         312
Name: count, dtype: int64


In [194]:
# Collapse rarely seen brands into a single 'other' bucket:
# a brand is kept only when it covers more than 1% of the rows.
model_key_counts = (dataset['model_key'].value_counts(normalize=True) * 100).round(2)
model_key_mask = model_key_counts.gt(1.0)
print(model_key_counts)

# Keep the label where the brand is frequent enough, otherwise write 'other'
dataset['model_key'] = dataset['model_key'].where(
    dataset['model_key'].isin(model_key_mask[model_key_mask].index),
    'other',
)
print(dataset['model_key'].value_counts())


model_key
Citroën        20.01
Renault        18.91
BMW            17.08
Peugeot        13.26
Audi           10.86
Nissan          5.68
Mitsubishi      4.77
Mercedes        2.00
Volkswagen      1.34
Toyota          1.09
SEAT            0.95
Subaru          0.91
Opel            0.68
Ferrari         0.68
PGO             0.68
Maserati        0.37
Suzuki          0.17
Porsche         0.12
Ford            0.10
KIA Motors      0.06
Alfa Romeo      0.06
Fiat            0.04
Lexus           0.04
Lamborghini     0.04
Mini            0.02
Mazda           0.02
Honda           0.02
Yamaha          0.02
Name: proportion, dtype: float64
model_key
Citroën       969
Renault       916
BMW           827
Peugeot       642
Audi          526
Nissan        275
other         242
Mitsubishi    231
Mercedes       97
Volkswagen     65
Toyota         53
Name: count, dtype: int64


In [195]:
# Columns sent to the scaler (continuous values and boolean option flags)
numeric_features = ['mileage', 'engine_power', 'private_parking_available', 'has_gps',
                       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
                       'has_speed_regulator', 'winter_tires']
# Columns sent to the one-hot encoder
categorical_features = ['model_key', 'fuel', 'paint_color', 'car_type']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a rare category can be missing from a
        # training split or CV fold; without this the encoder raises when it
        # meets that category at prediction time. Unknown values are encoded
        # as an all-zero row instead.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Candidate regressors, keyed by the display name used in reports and charts
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.5),
    'Lasso': Lasso(alpha=0.6),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42)
}

In [None]:
def evaluate_models(X, y):
    """Fit and score every regressor of the module-level ``models`` dict.

    The data is split 80/20 (``random_state=42``). Each model is wrapped in a
    pipeline with the module-level ``preprocessor``, fitted on the training
    part and scored on the held-out part. A 10-fold cross-validated R2 is
    also computed on the full ``X`` / ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the ``numeric_features`` and
        ``categorical_features`` columns.
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    results : dict
        ``{model name: {'RMSE', 'MAE', 'R2', 'CV_mean', 'CV_std',
        'Sample_Prediction', 'sample_data'}}``.
    feature_importance_dict : dict
        ``{model name: {feature name: importance}}`` for the models exposing
        ``feature_importances_``.

    Raises
    ------
    ValueError
        If the number of reconstructed feature names does not match the
        number of importances reported by a model.
    """
    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = {}
    feature_importance_dict = {}  # importances of the models that expose them

    for name, model in models.items():
        # Preprocessing + regressor in a single estimator
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])

        # Training
        pipeline.fit(X_train, y_train)

        # Predictions on the held-out split
        y_pred = pipeline.predict(X_test)

        # Metrics
        rmse_value = root_mean_squared_error(y_test, y_pred)
        r2_value = r2_score(y_test, y_pred)
        mae_value = np.mean(np.abs(y_test - y_pred))

        # Cross-validation on the whole dataset
        cv_scores = cross_val_score(pipeline, X, y, cv=10, scoring='r2', n_jobs=-1)

        # Example prediction on the first test row. iloc[[0]] keeps a one-row
        # DataFrame with the original column dtypes, whereas
        # iloc[0].to_frame().T turns every column into object dtype.
        sample_data = X_test.iloc[[0]]
        sample_prediction = pipeline.predict(sample_data)

        # Store the scores
        results[name] = {
            'RMSE': rmse_value,
            'MAE': mae_value,
            'R2': r2_value,
            'CV_mean': cv_scores.mean(),
            'CV_std': cv_scores.std(),
            'Sample_Prediction': sample_prediction[0],
            'sample_data': sample_data
        }

        # Feature importances, for the models that provide them
        regressor = pipeline.named_steps['regressor']
        if hasattr(regressor, 'feature_importances_'):
            encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
            # Ask the fitted encoder for its output columns instead of
            # rebuilding them by hand: the previous code skipped the first
            # category of every feature although the encoder drops none, so
            # the names were shifted against the importances and zip()
            # silently discarded the trailing values.
            feature_names = numeric_features + list(
                encoder.get_feature_names_out(categorical_features)
            )

            importances = regressor.feature_importances_
            if len(feature_names) != len(importances):
                raise ValueError(
                    f"{name}: {len(feature_names)} feature names for "
                    f"{len(importances)} importances"
                )
            feature_importance_dict[name] = dict(zip(feature_names, importances))

    return results, feature_importance_dict


In [197]:
# Feature matrix (scaled columns first, then one-hot columns) and target
X = dataset[[*numeric_features, *categorical_features]]
y = dataset['rental_price_per_day']

# Fit and score every candidate model
results, feature_importance = evaluate_models(X, y)

# One report per model: a blank line, the model name, then one metric per line
for model, metrics in results.items():
    print(
        f"\n{model}:",
        f"RMSE: {metrics['RMSE']:.2f}",
        f"MAE: {metrics['MAE']:.2f}",
        f"R2 Score: {metrics['R2']:.2f}",
        f"Cross-validation R2: {metrics['CV_mean']:.2f} (+/- {metrics['CV_std']*2:.2f})",
        f"Sample Prediction: {metrics['Sample_Prediction']:.2f}",
        sep="\n",
    )


Linear Regression:
RMSE: 18.22
MAE: 12.29
R2 Score: 0.68
Cross-validation R2: 0.64 (+/- 0.13)
Sample Prediction: 98.65

Ridge:
RMSE: 18.23
MAE: 12.28
R2 Score: 0.68
Cross-validation R2: 0.64 (+/- 0.12)
Sample Prediction: 98.75

Lasso:
RMSE: 19.38
MAE: 13.20
R2 Score: 0.64
Cross-validation R2: 0.60 (+/- 0.10)
Sample Prediction: 105.88

Random Forest:
RMSE: 16.87
MAE: 10.73
R2 Score: 0.73
Cross-validation R2: 0.69 (+/- 0.14)
Sample Prediction: 108.92

Gradient Boosting:
RMSE: 16.78
MAE: 11.13
R2 Score: 0.73
Cross-validation R2: 0.69 (+/- 0.13)
Sample Prediction: 100.91

XGBoost:
RMSE: 16.74
MAE: 10.55
R2 Score: 0.73
Cross-validation R2: 0.69 (+/- 0.15)
Sample Prediction: 102.44


In [198]:
def plot_metrics(results):
    """Build a grouped bar chart comparing RMSE, MAE and R2 across models.

    Parameters
    ----------
    results : dict
        ``{model name: {'RMSE': ..., 'MAE': ..., 'R2': ...}}``; any other keys
        in the inner dicts are ignored.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar group per model and one bar per metric, each bar labelled
        with its value formatted to two decimals.
    """
    model_names = list(results)

    # One trace per metric, in a fixed order
    bars = []
    for metric_name in ('RMSE', 'MAE', 'R2'):
        scores = [results[model_name][metric_name] for model_name in model_names]
        bars.append(go.Bar(
            name=metric_name,
            x=model_names,
            y=scores,
            text=[f'{score:.2f}' for score in scores],
            textposition='auto',
        ))

    fig = go.Figure(data=bars)
    fig.update_layout(
        title='Comparaison des métriques par modèle',
        barmode='group',
        xaxis_title="Modèles",
        yaxis_title="Valeur des métriques",
        width=1000,
        height=600,
        xaxis_tickangle=-45,
    )
    return fig

In [199]:
# Feature-importance chart
def plot_feature_importance(feature_importance, model_name='Random Forest'):
    """Build a horizontal bar chart of one model's feature importances.

    Parameters
    ----------
    feature_importance : dict
        ``{model name: {feature name: importance}}``.
    model_name : str, default 'Random Forest'
        Key of the model to plot.

    Returns
    -------
    plotly.graph_objects.Figure or None
        Bars ordered by ascending importance, or ``None`` when ``model_name``
        is not a key of ``feature_importance``.
    """
    if model_name not in feature_importance:
        return None

    scores_by_feature = feature_importance[model_name]
    feature_labels = np.array(list(scores_by_feature.keys()))
    scores = np.array(list(scores_by_feature.values()))

    # Ascending order of importance
    order = np.argsort(list(scores_by_feature.values()))

    fig = go.Figure(go.Bar(
        x=scores[order],
        y=feature_labels[order],
        orientation='h',
    ))
    fig.update_layout(
        title=f'Importance des features - {model_name}',
        xaxis_title="Importance",
        yaxis_title="Features",
        width=1000,
        height=800,
    )
    return fig

In [200]:
# Display the charts
plot_metrics(results).show()

# plot_feature_importance returns None when the requested model has no stored
# importances, so the result is checked before calling .show() on it.
importance_fig = plot_feature_importance(feature_importance)
if importance_fig is not None:
    importance_fig.show()
else:
    print("No feature importances available for the default model; nothing to plot.")