# PART 2 : MODEL - LINEAR REGRESSION

## 1.Import des librairies nécessaires

In [7]:
# Loading librairies needed
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import warnings
import joblib
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

## 2.Exploration de la Dataframe

In [8]:
# Loading the dataset
dataset = pd.read_csv("../content/sample_data/get_around_pricing_project.csv")

In [9]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [10]:
dataset.shape

(4843, 15)

In [11]:
dataset.describe(include='all')

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed_regulator        4843 non-null   bool  
 13  winter_tires               4843 non-null   bool  
 14  rental_p

In [13]:
# Il n'y a pas de valeurs manquantes

In [14]:
dataset.nunique()

Unnamed: 0                   4843
model_key                      28
mileage                      4786
engine_power                   61
fuel                            4
paint_color                    10
car_type                        8
private_parking_available       2
has_gps                         2
has_air_conditioning            2
automatic_car                   2
has_getaround_connect           2
has_speed_regulator             2
winter_tires                    2
rental_price_per_day          220
dtype: int64

In [15]:
dataset.duplicated().sum()

0

In [16]:
# As we can see, we have some useless columns and columns transformation to do on this dataset for Machine learning : 
# * Unnamed: 0                   => A SUPPRIMER 
# * model_key                    => A CONSERVER EN L'ETAT _ 28 valeurs uniques
# * mileage                      => A CONSERVER _ CREER DES FOURCHETTES
# * engine_power                 => A CONSERVER _ CREER DES FOURCHETTES
# * fuel                         => A CONSERVER EN L'ETAT _ 4 valeurs uniques
# * paint_color                  => A CONSERVER EN L'ETAT _ 10 valeurs uniques
# * car_type                     => A CONSERVER EN L'ETAT _ 8 valeurs uniques  
# * private_parking_available    => A CONSERVER EN L'ETAT _ 2 valeurs uniques    
# * has_gps                      => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * has_air_conditioning         => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * automatic_car                => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * has_getaround_connect        => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * has_speed_regulator          => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * winter_tires                 => A CONSERVER EN L'ETAT _ 2 valeurs uniques   
# * rental_price_per_day         => A CONSERVER EN L'ETAT _ 2 valeurs uniques

## 3.Application des transformation de la Dataframe 

In [17]:
# Suppression de la colonne 'Unnamed: 0'

In [18]:
dataset = dataset.drop('Unnamed: 0', axis=1)

In [19]:
# Création de fourchettes sur la colonne 'mileage'

In [20]:
dataset['mileage'].min()

-64

In [21]:
dataset['mileage'].max()

1000376

In [22]:
dataset['mileage'].mean()

140962.7995044394

In [23]:
dataset['mileage'].std()

60196.7407025145

In [24]:
dataset['mileage_scaled'] = dataset['mileage'].apply(lambda x : 'Négatif' if x<=0 else(
                                                                '0-1000' if x>0 and x<=1000 else(
                                                                '1000-5000' if x>1000 and x<=5000 else(
                                                                '5000-10000' if x>5000 and x<=10000 else(
                                                                '10000-50000' if x>10000 and x<=50000 else(
                                                                '50000-100000' if x>50000 and x<=100000 else(
                                                                '100000-150000' if x>100000 and x<=150000 else(
                                                                '150000-200000' if x>150000 and x<=200000 else(
                                                                '200000-300000' if x>250000 and x<=300000 else
                                                                '300000 et plus'
                                                                )))))))))

In [25]:
dataset.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day,mileage_scaled
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106,100000-150000
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264,10000-50000
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101,150000-200000
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158,100000-150000
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183,50000-100000


In [26]:
dataset['mileage_scaled'].value_counts()

100000-150000     1593
150000-200000     1552
50000-100000       856
300000 et plus     456
10000-50000        267
200000-300000      105
1000-5000            5
5000-10000           5
0-1000               3
Négatif              1
Name: mileage_scaled, dtype: int64

In [27]:
# On voit au'il y a qu'une seule valeur négative. On va supprimer cette ligne

In [28]:
indexNames = dataset[dataset['mileage_scaled'] == 'Négatif'].index
# Delete these row indexes from dataFrame
dataset.drop(indexNames , inplace=True)

In [29]:
# Création de fourchettes sur la colonne 'engine_power'

In [30]:
dataset['engine_power'].min()

0

In [31]:
dataset['engine_power'].max()

423

In [32]:
dataset['engine_power'].mean()

128.96736885584468

In [33]:
dataset['engine_power'].std()

38.97034832802932

In [34]:
dataset['engine_power_scaled'] = dataset['engine_power'].apply(lambda x : '0-50' if x>=0 and x<=50 else(
                                                                        '50-100' if x>50 and x<=100 else(
                                                                        '100-150' if x>100 and x<=150 else(
                                                                        '150-200' if x>150 and x<=200 else(
                                                                        '200-250' if x>200 and x<=250 else(
                                                                        '250-300' if x>250 and x<=300 else(
                                                                        '300-350' if x>300 and x<=350 else(
                                                                        '350-400' if x>350 and x<=400 else(
                                                                        '400-450' if x>400 and x<=450 else(
                                                                        '450 et plus'
                                                                        ))))))))))

In [35]:
dataset['engine_power_scaled'].value_counts()

100-150    2618
50-100     1279
150-200     684
200-250     212
250-300      36
300-350       8
0-50          3
400-450       2
Name: engine_power_scaled, dtype: int64

In [36]:
dataset.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day,mileage_scaled,engine_power_scaled
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106,100000-150000,50-100
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264,10000-50000,300-350
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101,150000-200000,100-150
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158,100000-150000,100-150
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183,50000-100000,150-200


In [37]:
# Réorganisation des colonnes pour préparer le pré-processing

In [38]:
dataset.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day',
       'mileage_scaled', 'engine_power_scaled'],
      dtype='object')

In [39]:
# *_* for easy preprocessing 
dataset = dataset[['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type',
                    'private_parking_available', 'has_gps', 'has_air_conditioning',
                    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
                    'winter_tires', 'rental_price_per_day']]

# *_* for dataset with scaled categories. potential improvement if we have time
# # Suppression des colonnes 'mileage' et 'engine_power'
# useless_col = ['mileage', 'engine_power']
# dataset = dataset.loc[:,[col for col in dataset.columns if col not in useless_col]]

# dataset = dataset[['model_key', 'fuel', 'paint_color', 'car_type',
#                    'private_parking_available', 'has_gps', 'has_air_conditioning',
#                    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
#                    'winter_tires', 'mileage_scaled',
#                    'engine_power_scaled','rental_price_per_day']]

## 4.Application du modèle de Machine Learning

### Préprocessing

In [40]:
# Splitting the dataset into X and y
TARGET = "rental_price_per_day"
X = dataset.iloc[:,:-1]
y = dataset.loc[:,TARGET]

In [41]:
X.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True


In [42]:
y.head()

0    106
1    264
2    101
3    158
4    183
Name: rental_price_per_day, dtype: int64

In [43]:
# Automatically detect positions of numeric/categorical features
idx = 0
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
        numeric_indices.append(idx)
    else :
        categorical_features.append(i)
        categorical_indices.append(idx)

    idx = idx + 1

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['mileage', 'engine_power']  at positions  [1, 2]
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']  at positions  [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


### Training

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state=42)

In [45]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
                        ('scaler', StandardScaler())
                            ])

In [46]:
# Create pipeline for categorical features
categorical_transformer = Pipeline(steps=[
                        ('encoder', OneHotEncoder(drop='first'))])

In [47]:
# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)])

### Training w Linear Model

In [48]:
# Instantiating a linear regression model
model_lr = Pipeline(steps=[
                    ("Preprocessing", preprocessor),
                    ("Regressor",LinearRegression())
                    ])

In [49]:
model_lr.fit(X_train, y_train)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['mileage', 'engine_power']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['model_key', 'fuel',
                                                   'paint_color', 'car_type',
                                                   'private_parking_available',
                                                   'has_gps',
                                                   'has_air_conditioning',
                                                   'automatic_c

In [50]:
# *_* predictions 
predictions_train = model_lr.predict(X_train) #train data
predictions_test = model_lr.predict(X_test) #test data

In [61]:
# *_* evaluation de la performance du modele par root mean squared error (rmse)

def rmse(preds, targets):
    return np.sqrt(((preds - targets) ** 2).mean()) #definition de rmse

rmse_train = rmse(predictions_train, y_train)
print("For train set, root mean squared error (in dollars) is: " + str(round(rmse_train,1)))

rmse_test = rmse(predictions_test, y_test)
print("For test set, root mean squared error (in dollars) is: " + str(round(rmse_test,1)))


For train set, root mean squared error (in dollars) is: 8.3
For test set, root mean squared error (in dollars) is: 16.6


In [52]:
# dumping models
joblib.dump(model_lr, "./model.joblib")

['./model.joblib']

### Training w Random Forest Model

Let's test random forest model!

In [53]:
# *_* alternatively, testing a second ML model
# Instantiating a random forest model
from sklearn.ensemble import RandomForestRegressor

#parameters obtained after trial and error
random_forest_regressor = RandomForestRegressor(
    n_estimators= 300,
    max_depth= 20,
    min_samples_split= 5,
    random_state= 0)

#pipeline
model_rf = Pipeline(steps=[
                    ("Preprocessing", preprocessor),
                    ("Regressor",random_forest_regressor)
                    ])

#fitting
model_rf.fit(X_train, y_train)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['mileage', 'engine_power']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['model_key', 'fuel',
                                                   'paint_color', 'car_type',
                                                   'private_parking_available',
                                                   'has_gps',
                                                   'has_air_conditioning',
                                                   'automatic_c

In [54]:
# *_* predictions 
predictions_train = model_rf.predict(X_train) #train data
predictions_test = model_rf.predict(X_test) #test data

In [62]:
# *_* evaluation de la performance du modele par root mean squared error (rmse)
rmse_train = rmse(predictions_train, y_train)
print("For train set, root mean squared error is: " + str(rmse_train))

rmse_test = rmse(predictions_test, y_test)
print("For test set, root mean squared error is: " + str(rmse_test))

For train set, root mean squared error is: 8.332150871476081
For test set, root mean squared error is: 16.570761059460455


In [57]:
# random forest model has a better performance - dumping model
joblib.dump(model_rf, "./model_rforest.joblib")

['./model_rforest.joblib']

In [58]:
# END PART 2