# PART 2 : MODEL - LINEAR REGRESSION

In [None]:
# Loading the required libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import warnings
import joblib
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [None]:
# Loading the dataset
# The path is relative, so it is resolved against the notebook's working directory.
dataset = pd.read_csv("src/get_around_pricing_project.csv")

In [8]:
# Splitting the dataset into X (features) and y (target)
TARGET = "rental_price_per_day"
# Drop the target by name instead of slicing off the last column: the split
# stays correct even if the column order of the CSV changes.
X = dataset.drop(columns=[TARGET])
y = dataset.loc[:, TARGET]

In [9]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True


In [10]:
# Preview the first values of the target
y.head()

0    106
1    264
2    101
3    158
4    183
Name: rental_price_per_day, dtype: int64

In [11]:
# Automatically detect names and positions of numeric/categorical features.
# Integer and float columns are numeric; every other dtype (strings, booleans, ...)
# is treated as categorical.
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
# Series.iteritems() was removed in pandas 2.0; .items() is the supported equivalent.
for idx, (column, dtype) in enumerate(X.dtypes.items()):
    # Use the dtype predicates rather than substring matching on the dtype name,
    # which misses nullable 'Int64'/'Float64' and wrongly matches 'interval'.
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        numeric_features.append(column)
        numeric_indices.append(idx)
    else:
        categorical_features.append(column)
        categorical_indices.append(idx)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['mileage', 'engine_power']  at positions  [1, 2]
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']  at positions  [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


TRAINING

In [47]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Numeric columns go through a single standardisation step
numeric_transformer = Pipeline([("scaler", StandardScaler())])

In [49]:
# Create pipeline for categorical features
# handle_unknown='ignore' keeps predict() from raising on a category that was absent
# from the training split (e.g. a value seen only in the test rows or in production);
# such a value is encoded as all zeros, i.e. like the dropped reference category.
# Combining handle_unknown='ignore' with drop='first' requires scikit-learn >= 1.0.
categorical_transformer = Pipeline(steps=[
                        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

In [52]:
# Combine the per-type pipelines into one preprocessor with ColumnTransformer:
# each entry is (name, transformer, columns it applies to)
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [53]:
# Full model: the preprocessor followed by a linear regression
model_lr = Pipeline([
    ("Preprocessing", preprocessor),
    ("Regressor", LinearRegression()),
])

In [54]:
# Fit the preprocessing steps and the regressor on the training split
model_lr.fit(X_train, y_train)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['mileage', 'engine_power']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['model_key', 'fuel',
                                                   'paint_color', 'car_type',
                                                   'private_parking_available',
                                                   'has_gps',
                                                   'has_air_conditioning',
                                                   'automatic_c

In [55]:
# Predict on the training split.
# NOTE(review): nothing visible here scores these predictions or uses
# X_test / y_test — confirm the model is evaluated on held-out data elsewhere.
predictions = model_lr.predict(X_train)

In [56]:
# dumping models
# Persist the preprocessor and the full pipeline to the current working directory.
# NOTE(review): model_lr already holds preprocessor as its first step, so the
# separate prepro.joblib looks redundant — confirm a consumer actually needs it.
joblib.dump(preprocessor, "./prepro.joblib")
joblib.dump(model_lr, "./model.joblib")

['./model.joblib']

END PART 2