In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the car listings CSV and preview the resulting frame.
DATA_FILE = "final_scout_not_dummy2.csv"
df = pd.read_csv(DATA_FILE)
df

Unnamed: 0,make_model,body_type,price,km,Type,Fuel,Extras,Gears,age,Previous_Owners,...,Upholstery_type,Gearing_Type,Displacement_cc,Weight_kg,Drive_chain,cons_comb,CO2_Emission,Comfort_Convenience_Package,Entertainment_Media_Package,Safety_Security_Package
0,Audi A1,Sedans,15770,56013.000000,Used,Diesel,"Alloy wheels, Catalytic Converter, Voice Control",7.0,3.0,2.0,...,Cloth,Automatic,1422.0,1220.0,front,3.8,99.0,Premium,Plus,Safety Premium Package
1,Audi A1,Sedans,14500,80000.000000,Used,Benzine,"Alloy wheels, Sport seats, Sport suspension, V...",7.0,2.0,1.0,...,Cloth,Automatic,1798.0,1255.0,front,5.6,129.0,Standard,Plus,Safety Premium Package
2,Audi A1,Sedans,14640,83450.000000,Used,Diesel,"Alloy wheels, Voice Control",7.0,3.0,1.0,...,Cloth,Automatic,1598.0,1135.0,front,3.8,99.0,Premium,Standard,Safety Premium Package
3,Audi A1,Sedans,14500,73000.000000,Used,Diesel,"Alloy wheels, Sport seats, Voice Control",6.0,3.0,1.0,...,Cloth,Automatic,1422.0,1195.0,front,3.8,99.0,Standard,Plus,Safety Premium Package
4,Audi A1,Sedans,16790,16200.000000,Used,Diesel,"Alloy wheels, Sport package, Sport suspension,...",7.0,3.0,1.0,...,Cloth,Automatic,1422.0,1135.0,front,4.1,109.0,Premium,Plus,Safety Premium Package
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15904,Renault Espace,Van,39950,1647.362609,New,Diesel,"Alloy wheels, Touch screen",6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Premium,Plus,Safety Premium Package
15905,Renault Espace,Van,39885,9900.000000,Used,Benzine,"Alloy wheels, Touch screen, Voice Control",7.0,0.0,1.0,...,Cloth,Automatic,1798.0,1708.0,front,7.4,168.0,Premium,Plus,Safety Premium Package
15906,Renault Espace,Van,39875,15.000000,Pre-registered,Diesel,Alloy wheels,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1734.0,front,5.3,139.0,Standard,Plus,Safety Premium Package
15907,Renault Espace,Van,39700,10.000000,Pre-registered,Diesel,"Alloy wheels, Touch screen",6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Standard,Plus,Safety Premium Package


### Converting the Extras feature from object to numeric

In [3]:
# Replace the comma-separated Extras string with the number of listed items
# (number of commas + 1, which equals len(x.split(','))).
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# conversion has already happened, and integers have no string methods.
if not pd.api.types.is_numeric_dtype(df["Extras"]):
    df["Extras"] = df["Extras"].str.count(",") + 1

In [4]:
# Re-display the frame to check that Extras now holds item counts.
df

Unnamed: 0,make_model,body_type,price,km,Type,Fuel,Extras,Gears,age,Previous_Owners,...,Upholstery_type,Gearing_Type,Displacement_cc,Weight_kg,Drive_chain,cons_comb,CO2_Emission,Comfort_Convenience_Package,Entertainment_Media_Package,Safety_Security_Package
0,Audi A1,Sedans,15770,56013.000000,Used,Diesel,3,7.0,3.0,2.0,...,Cloth,Automatic,1422.0,1220.0,front,3.8,99.0,Premium,Plus,Safety Premium Package
1,Audi A1,Sedans,14500,80000.000000,Used,Benzine,4,7.0,2.0,1.0,...,Cloth,Automatic,1798.0,1255.0,front,5.6,129.0,Standard,Plus,Safety Premium Package
2,Audi A1,Sedans,14640,83450.000000,Used,Diesel,2,7.0,3.0,1.0,...,Cloth,Automatic,1598.0,1135.0,front,3.8,99.0,Premium,Standard,Safety Premium Package
3,Audi A1,Sedans,14500,73000.000000,Used,Diesel,3,6.0,3.0,1.0,...,Cloth,Automatic,1422.0,1195.0,front,3.8,99.0,Standard,Plus,Safety Premium Package
4,Audi A1,Sedans,16790,16200.000000,Used,Diesel,4,7.0,3.0,1.0,...,Cloth,Automatic,1422.0,1135.0,front,4.1,109.0,Premium,Plus,Safety Premium Package
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15904,Renault Espace,Van,39950,1647.362609,New,Diesel,2,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Premium,Plus,Safety Premium Package
15905,Renault Espace,Van,39885,9900.000000,Used,Benzine,3,7.0,0.0,1.0,...,Cloth,Automatic,1798.0,1708.0,front,7.4,168.0,Premium,Plus,Safety Premium Package
15906,Renault Espace,Van,39875,15.000000,Pre-registered,Diesel,1,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1734.0,front,5.3,139.0,Standard,Plus,Safety Premium Package
15907,Renault Espace,Van,39700,10.000000,Pre-registered,Diesel,2,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Standard,Plus,Safety Premium Package


In [5]:
# Remove price outliers per make_model using the 1.5 * IQR rule.
# The quartiles are computed once per model with a single groupby and then
# broadcast back to the rows, instead of re-filtering, dropping in place and
# re-indexing the whole frame on every loop iteration.
price_quartiles = (
    df.groupby("make_model")["price"].quantile([0.25, 0.75]).unstack()
)
q1 = df["make_model"].map(price_quartiles[0.25])
q3 = df["make_model"].map(price_quartiles[0.75])
iqr = q3 - q1

# Written as "is an outlier" (rather than "is within the fences") so that rows
# whose fences are NaN compare False and are kept, exactly as the loop did.
is_outlier = (df["price"] < q1 - 1.5 * iqr) | (df["price"] > q3 + 1.5 * iqr)

df = df.loc[~is_outlier].reset_index(drop=True)

In [6]:
# Re-display the frame after the per-model price outlier removal.
df

Unnamed: 0,make_model,body_type,price,km,Type,Fuel,Extras,Gears,age,Previous_Owners,...,Upholstery_type,Gearing_Type,Displacement_cc,Weight_kg,Drive_chain,cons_comb,CO2_Emission,Comfort_Convenience_Package,Entertainment_Media_Package,Safety_Security_Package
0,Audi A1,Sedans,15770,56013.000000,Used,Diesel,3,7.0,3.0,2.0,...,Cloth,Automatic,1422.0,1220.0,front,3.8,99.0,Premium,Plus,Safety Premium Package
1,Audi A1,Sedans,14500,80000.000000,Used,Benzine,4,7.0,2.0,1.0,...,Cloth,Automatic,1798.0,1255.0,front,5.6,129.0,Standard,Plus,Safety Premium Package
2,Audi A1,Sedans,14640,83450.000000,Used,Diesel,2,7.0,3.0,1.0,...,Cloth,Automatic,1598.0,1135.0,front,3.8,99.0,Premium,Standard,Safety Premium Package
3,Audi A1,Sedans,14500,73000.000000,Used,Diesel,3,6.0,3.0,1.0,...,Cloth,Automatic,1422.0,1195.0,front,3.8,99.0,Standard,Plus,Safety Premium Package
4,Audi A1,Sedans,16790,16200.000000,Used,Diesel,4,7.0,3.0,1.0,...,Cloth,Automatic,1422.0,1135.0,front,4.1,109.0,Premium,Plus,Safety Premium Package
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15488,Renault Espace,Van,39950,1647.362609,New,Diesel,2,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Premium,Plus,Safety Premium Package
15489,Renault Espace,Van,39885,9900.000000,Used,Benzine,3,7.0,0.0,1.0,...,Cloth,Automatic,1798.0,1708.0,front,7.4,168.0,Premium,Plus,Safety Premium Package
15490,Renault Espace,Van,39875,15.000000,Pre-registered,Diesel,1,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1734.0,front,5.3,139.0,Standard,Plus,Safety Premium Package
15491,Renault Espace,Van,39700,10.000000,Pre-registered,Diesel,2,6.0,0.0,1.0,...,Part/Full Leather,Automatic,1997.0,1758.0,front,5.3,139.0,Standard,Plus,Safety Premium Package


### Score Tester

In [7]:
def train_val(model):
    """Score a fitted regressor on the notebook's current train and test splits.

    The splits are not passed in: the function reads the module-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` at call time, so it
    always scores against whichever split was assigned most recently.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Any object exposing ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``train`` and ``test``; rows ``R2``, ``mae``, ``mse`` and
        ``rmse``, in that order.
    """

    def _regression_scores(y_true, y_hat):
        # MSE is computed once and reused for the RMSE.
        mse = mean_squared_error(y_true, y_hat)
        return {"R2": r2_score(y_true, y_hat),
                "mae": mean_absolute_error(y_true, y_hat),
                "mse": mse,
                "rmse": np.sqrt(mse)}

    return pd.DataFrame({
        "train": _regression_scores(y_train, model.predict(X_train)),
        "test": _regression_scores(y_test, model.predict(X_test)),
    })

### Train | Test Split

In [8]:
# Target is the listing price; every other column is a candidate feature.
y = df["price"]
X = df.drop(columns=["price"])

In [9]:
# Hold out 20% of the rows for testing. The split is seeded so that the scores
# below are reproducible on a fresh kernel (Restart & Run All).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Random Forest

In [10]:
# Names of the string-typed (object) columns; these are the ones to encode.
cat = X_train.select_dtypes(include=["object"]).columns
cat

Index(['make_model', 'body_type', 'Type', 'Fuel', 'Paint_Type',
       'Upholstery_type', 'Gearing_Type', 'Drive_chain',
       'Comfort_Convenience_Package', 'Entertainment_Media_Package',
       'Safety_Security_Package'],
      dtype='object')

In [11]:
# Ordinal-encode the categorical columns and pass the numeric ones through.
# Categories not seen during fit are mapped to -1 instead of raising at
# predict time (a random split can leave a rare category out of the training
# part); this matches the encoder configured for the reduced-feature model.
enc_ord = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

column_trans = make_column_transformer((enc_ord, cat), remainder="passthrough")

In [12]:
# Baseline: default random forest behind the ordinal-encoding transformer.
# The forest is seeded so repeated runs give the same model and scores.
operations = [
    ("Ordinal_Encoder", column_trans),
    ("RF_Model", RandomForestRegressor(random_state=42)),
]

rf_pipe = Pipeline(steps=operations)

rf_pipe.fit(X_train, y_train)

In [13]:
# Score the fitted baseline pipeline on the current train/test split.
df_= train_val(rf_pipe)
df_

Unnamed: 0,train,test
R2,0.992918,0.9560731
mae,346.831958,870.3354
mse,324802.771144,1989432.0
rmse,569.914705,1410.472


In [14]:
# One comparison row for the baseline model: R2 and RMSE on each split,
# looked up by label in the score table.
my_dict = {
    "model": "rf_model",
    "train_R2": df_.loc["R2", "train"],
    "test_R2": df_.loc["R2", "test"],
    "train_rmse": df_.loc["rmse", "train"],
    "test_rmse": df_.loc["rmse", "test"],
}
df_val = pd.DataFrame(my_dict, index=[0])
df_val

Unnamed: 0,model,train_R2,test_R2,train_rmse,test_rmse
0,rf_model,0.992918,0.956073,569.914705,1410.472127


### Grid Search Random Forest

In [15]:
# NOTE(review): this grid search is disabled — presumably because the 10-fold
# search over 36 parameter combinations is slow to re-run; confirm before
# deleting. The "Random Forest with Best Parameters" section hard-codes the
# parameters instead.
#
# operations = [("Ordinal_Encoder", column_trans), ("RF_Model", RandomForestRegressor())]

# model = Pipeline(steps=operations)

# NOTE(review): max_depth previously listed the string 'None', which is not a
# valid max_depth value; the Python constant None (unlimited depth) is meant.
# param_grid = {"RF_Model__n_estimators":[50, 100],
#               "RF_Model__max_depth": [50, None],
#               "RF_Model__min_samples_leaf": [1, 2, 3],
#               "RF_Model__min_samples_split": [2, 3, 5]}

# grid_model_rf = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           scoring='neg_root_mean_squared_error',
#                           cv=10,
#                           n_jobs = -1,
#                           return_train_score=True)

# grid_model_rf.fit(X_train,y_train)

In [16]:
# train_val(grid_model_rf)

In [17]:
# grid_model_rf.best_params_

### Random Forest with Best Parameters

In [18]:
# Random forest with the hard-coded "best" hyper-parameters, seeded so the
# fitted model, its scores and its feature importances are reproducible.
operations = [
    ("Ordinal_Encoder", column_trans),
    ("RF_Model", RandomForestRegressor(max_depth=50,
                                       min_samples_leaf=1,
                                       min_samples_split=2,
                                       n_estimators=100,
                                       random_state=42)),
]
rf_grid = Pipeline(steps=operations)

rf_grid.fit(X_train, y_train)

In [19]:
# Score the tuned pipeline on the current train/test split.
df_1= train_val(rf_grid)
df_1

Unnamed: 0,train,test
R2,0.993066,0.9564512
mae,344.74293,866.8064
mse,318017.683766,1972310.0
rmse,563.930566,1404.39


In [20]:
# One comparison row for the tuned model: R2 and RMSE on each split,
# looked up by label in the score table.
my_dict = {
    "model": "rf_grid",
    "train_R2": df_1.loc["R2", "train"],
    "test_R2": df_1.loc["R2", "test"],
    "train_rmse": df_1.loc["rmse", "train"],
    "test_rmse": df_1.loc["rmse", "test"],
}
df_val1 = pd.DataFrame(my_dict, index=[0])
df_val1

Unnamed: 0,model,train_R2,test_R2,train_rmse,test_rmse
0,rf_grid,0.993066,0.956451,563.930566,1404.389525


In [21]:
# Stack the two summary rows with a fresh 0..n-1 index, then show the models
# ranked by test R2 (best first).
df_val = pd.concat([df_val, df_val1], ignore_index=True)
df_val.sort_values("test_R2", ascending=False)

Unnamed: 0,model,train_R2,test_R2,train_rmse,test_rmse
1,rf_grid,0.993066,0.956451,563.930566,1404.389525
0,rf_model,0.992918,0.956073,569.914705,1410.472127


### Feature Importance

In [22]:
# Output column names of the fitted column transformer (prefixed with the
# name of the sub-transformer that produced each column).
features = rf_grid.named_steps["Ordinal_Encoder"].get_feature_names_out()
features

array(['ordinalencoder__make_model', 'ordinalencoder__body_type',
       'ordinalencoder__Type', 'ordinalencoder__Fuel',
       'ordinalencoder__Paint_Type', 'ordinalencoder__Upholstery_type',
       'ordinalencoder__Gearing_Type', 'ordinalencoder__Drive_chain',
       'ordinalencoder__Comfort_Convenience_Package',
       'ordinalencoder__Entertainment_Media_Package',
       'ordinalencoder__Safety_Security_Package', 'remainder__km',
       'remainder__Extras', 'remainder__Gears', 'remainder__age',
       'remainder__Previous_Owners', 'remainder__hp_kW',
       'remainder__Inspection_new', 'remainder__Displacement_cc',
       'remainder__Weight_kg', 'remainder__cons_comb',
       'remainder__CO2_Emission'], dtype=object)

In [23]:
def _strip_transformer_prefix(name):
    """Remove the column-transformer prefixes from one output feature name."""
    for prefix in ("ordinalencoder__", "remainder__"):
        name = name.replace(prefix, "")
    return name


new_features = [_strip_transformer_prefix(name) for name in features]
new_features

['make_model',
 'body_type',
 'Type',
 'Fuel',
 'Paint_Type',
 'Upholstery_type',
 'Gearing_Type',
 'Drive_chain',
 'Comfort_Convenience_Package',
 'Entertainment_Media_Package',
 'Safety_Security_Package',
 'km',
 'Extras',
 'Gears',
 'age',
 'Previous_Owners',
 'hp_kW',
 'Inspection_new',
 'Displacement_cc',
 'Weight_kg',
 'cons_comb',
 'CO2_Emission']

In [24]:
# Raw importance scores of the fitted forest, one value per input column.
rf_grid["RF_Model"].feature_importances_

array([1.65085364e-01, 1.90953892e-03, 5.37389384e-03, 9.51040665e-04,
       3.96872177e-04, 2.66622736e-03, 1.41188310e-02, 3.57315853e-04,
       2.89665581e-03, 1.71009754e-03, 8.91845795e-04, 8.84337976e-02,
       5.12347736e-03, 2.39697378e-02, 1.93114878e-01, 1.91558338e-03,
       4.47383731e-01, 1.50670456e-03, 4.84293755e-03, 2.17072082e-02,
       6.85434569e-03, 8.78991605e-03])

In [25]:
# Pair each importance score with its cleaned feature name and rank the
# features from most to least important.
importances = rf_grid.named_steps["RF_Model"].feature_importances_
df_f_i = pd.DataFrame(
    {"Feature Importance": importances}, index=new_features
).sort_values(by="Feature Importance", ascending=False)
df_f_i

Unnamed: 0,Feature Importance
hp_kW,0.447384
age,0.193115
make_model,0.165085
km,0.088434
Gears,0.02397
Weight_kg,0.021707
Gearing_Type,0.014119
CO2_Emission,0.00879
cons_comb,0.006854
Type,0.005374


### Feature Selection

In [31]:
# Reduced feature set used for the final model.
selected_features = ["hp_kW", "age", "make_model", "km", "Gearing_Type"]
X2 = X[selected_features]
X2


Unnamed: 0,hp_kW,age,make_model,km,Gearing_Type
0,66.0,3.0,Audi A1,56013.000000,Automatic
1,141.0,2.0,Audi A1,80000.000000,Automatic
2,85.0,3.0,Audi A1,83450.000000,Automatic
3,66.0,3.0,Audi A1,73000.000000,Automatic
4,66.0,3.0,Audi A1,16200.000000,Automatic
...,...,...,...,...,...
15488,147.0,0.0,Renault Espace,1647.362609,Automatic
15489,165.0,0.0,Renault Espace,9900.000000,Automatic
15490,146.0,0.0,Renault Espace,15.000000,Automatic
15491,147.0,0.0,Renault Espace,10.000000,Automatic


In [32]:
# Re-split on the reduced feature set. This rebinds X_train / X_test /
# y_train / y_test, so train_val() scores against this split from here on.
# Seeded so the final model and its scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y, test_size=0.2, random_state=42
)

In [33]:
# Final model: random forest on the five selected features.
cat2 = ["make_model", "Gearing_Type"]

# Categories unseen during fit are encoded as -1 rather than raising, so the
# saved pipeline can still produce a prediction for an unfamiliar make/model.
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

column_trans = make_column_transformer((ord_enc, cat2), remainder='passthrough')

# Seeded so that the pipeline that gets pickled for deployment can be
# reproduced exactly from this notebook.
operations = [
    ("OrdinalEncoder", column_trans),
    ("RF_Model", RandomForestRegressor(max_depth=50,
                                       min_samples_leaf=1,
                                       min_samples_split=2,
                                       n_estimators=100,
                                       random_state=42)),
]

rf_best = Pipeline(steps=operations)
rf_best.fit(X_train,y_train)
train_val(rf_best)

Unnamed: 0,train,test
R2,0.979023,0.9397749
mae,541.694709,1088.127
mse,951539.388956,2847732.0
rmse,975.468805,1687.522


In [34]:
# 10-fold cross-validation of the same pipeline configuration on the training
# split only, as a check on the single hold-out score above.
operations = [
    ("OrdinalEncoder", column_trans),
    ("RF_Model", RandomForestRegressor(max_depth=50,
                                       min_samples_leaf=1,
                                       min_samples_split=2,
                                       n_estimators=100,
                                       random_state=42)),
]
model = Pipeline(steps=operations)
scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=['r2',
             'neg_mean_absolute_error',
             'neg_mean_squared_error',
             'neg_root_mean_squared_error'],
    cv=10,
    return_train_score=True,
)
df_scores = pd.DataFrame(scores)
# The first two columns are fit_time and score_time; skip them explicitly by
# position (.iloc) rather than relying on integer slicing of a label index.
df_scores.mean().iloc[2:]

test_r2                              9.382302e-01
train_r2                             9.792248e-01
test_neg_mean_absolute_error        -1.053488e+03
train_neg_mean_absolute_error       -5.408806e+02
test_neg_mean_squared_error         -2.798978e+06
train_neg_mean_squared_error        -9.424179e+05
test_neg_root_mean_squared_error    -1.669726e+03
train_neg_root_mean_squared_error   -9.707246e+02
dtype: float64

### Deployment

In [35]:
import pickle

# Serialize the fitted reduced-feature pipeline. The context manager makes
# sure the file is flushed and closed even if pickling fails.
with open('rf_model', 'wb') as model_file:
    pickle.dump(rf_best, model_file)

In [36]:
# Reload the saved pipeline; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load files that were
# written by this notebook or come from another trusted source.
with open("rf_model", "rb") as model_file:
    my_model = pickle.load(model_file)

In [38]:
# Hand-written example car used to try out the reloaded model.
my_dict = dict(
    hp_kW=105,
    age=2,
    km=100000,
    make_model="Audi A3",
    Gearing_Type="Automatic",
)

In [40]:
# Wrap the example in a one-row frame (row label 0), since the pipeline
# selects its input columns by name.
df_test = pd.DataFrame([my_dict])
df_test

Unnamed: 0,hp_kW,age,km,make_model,Gearing_Type
0,105,2,100000,Audi A3,Automatic


In [41]:
# Predict with the reloaded pipeline; the result is an array with one value
# per input row (a single row here).
prediction=my_model.predict(df_test)
print(prediction)

[17519.21]


In [45]:
# Report the single predicted price in a readable sentence.
print(f"The Estimated Price of Your Car Is:  {prediction[0]} Euro")

The Estimated Price of Your Car Is:  17519.21 Euro
