In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
car_price = pd.read_csv('/content/car_price.csv')
car_price.head(2)

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


## **data cleaning**

In [3]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2500 non-null   int64  
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB


In [4]:
car_price.duplicated().sum()

np.int64(0)

In [5]:
car_price.isnull().sum()

Unnamed: 0,0
Car ID,0
Brand,0
Year,0
Engine Size,0
Fuel Type,0
Transmission,0
Mileage,0
Condition,0
Price,0
Model,0


In [6]:
car_price.columns = car_price.columns.str.lower().str.replace(' ', '_')
car_price.head(2)

Unnamed: 0,car_id,brand,year,engine_size,fuel_type,transmission,mileage,condition,price,model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


In [7]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   car_id        2500 non-null   int64  
 1   brand         2500 non-null   object 
 2   year          2500 non-null   int64  
 3   engine_size   2500 non-null   float64
 4   fuel_type     2500 non-null   object 
 5   transmission  2500 non-null   object 
 6   mileage       2500 non-null   int64  
 7   condition     2500 non-null   object 
 8   price         2500 non-null   float64
 9   model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB


In [8]:
car_price.to_csv('car_price_updated.csv', index=False)

## **data analysis**

In [9]:
car_price.describe()

Unnamed: 0,car_id,year,engine_size,mileage,price
count,2500.0,2500.0,2500.0,2500.0,2500.0
mean,1250.5,2011.6268,3.46524,149749.8448,52638.022532
std,721.83216,6.9917,1.432053,87919.952034,27295.833455
min,1.0,2000.0,1.0,15.0,5011.27
25%,625.75,2005.0,2.2,71831.5,28908.485
50%,1250.5,2012.0,3.4,149085.0,53485.24
75%,1875.25,2018.0,4.7,225990.5,75838.5325
max,2500.0,2023.0,6.0,299967.0,99982.59


In [10]:
car_price.describe(include = 'object')

Unnamed: 0,brand,fuel_type,transmission,condition,model
count,2500,2500,2500,2500,2500
unique,7,4,2,3,28
top,Toyota,Diesel,Manual,Used,Fiesta
freq,374,655,1308,855,103


In [11]:
#cars are there for each brand
brand_car_count = car_price.groupby('brand')['brand'].count().sort_values(ascending=False)
brand_car_count

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Toyota,374
Audi,368
BMW,358
Mercedes,353
Honda,352
Tesla,348
Ford,347


In [12]:
#cars have a manual transmission for each brand
manual_transmission_by_brand = car_price[car_price['transmission'] == 'Manual'].groupby('brand')['brand'].count().sort_values(ascending=False)
manual_transmission_by_brand

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Audi,198
Honda,190
Ford,188
Toyota,187
Tesla,187
BMW,181
Mercedes,177


In [13]:
#cars have a Automatic transmission for each brand
Automatic_transmission_by_brand = car_price[car_price['transmission'] == 'Automatic'].groupby('brand')['brand'].count().sort_values(ascending=False)
Automatic_transmission_by_brand

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Toyota,187
BMW,177
Mercedes,176
Audi,170
Honda,162
Tesla,161
Ford,159


In [14]:
#engine size average for each brand
engine_avg_by_brand = car_price.groupby('brand')['engine_size'].mean().round(2)
engine_avg_by_brand

Unnamed: 0_level_0,engine_size
brand,Unnamed: 1_level_1
Audi,3.45
BMW,3.44
Ford,3.44
Honda,3.58
Mercedes,3.4
Tesla,3.43
Toyota,3.51


In [15]:
#the sum mileage of cars for each brand and fuel type combination
mileage_sum_by_brand_fuel = car_price.groupby(['brand', 'fuel_type'])[['mileage']].sum().sort_values("brand" , ascending=False)
mileage_sum_by_brand_fuel

Unnamed: 0_level_0,Unnamed: 1_level_0,mileage
brand,fuel_type,Unnamed: 2_level_1
Toyota,Petrol,13451047
Toyota,Hybrid,14920817
Toyota,Electric,11778281
Toyota,Diesel,15174885
Tesla,Petrol,11285387
Tesla,Hybrid,13260604
Tesla,Electric,15090814
Tesla,Diesel,12892400
Mercedes,Petrol,16972657
Mercedes,Hybrid,11963438


In [16]:
# the average price of cars for each brand
avg_price_brand = car_price.groupby(['brand'])['price'].mean().sort_values(ascending=False)
avg_price_brand

Unnamed: 0_level_0,price
brand,Unnamed: 1_level_1
BMW,54157.114385
Tesla,53475.547471
Mercedes,53191.090085
Toyota,52078.728235
Honda,52050.283949
Audi,51953.42481
Ford,51593.254813


In [17]:
# the average price of cars for each brand and fuel type
avg_price_brand_fuel = car_price.groupby(['brand', 'fuel_type'])['price'].mean()
avg_price_brand_fuel

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,fuel_type,Unnamed: 2_level_1
Audi,Diesel,56574.073814
Audi,Electric,51376.56117
Audi,Hybrid,50374.12
Audi,Petrol,49182.028125
BMW,Diesel,59334.373587
BMW,Electric,50654.630879
BMW,Hybrid,55599.859302
BMW,Petrol,50992.418764
Ford,Diesel,52679.123294
Ford,Electric,47298.914706


In [18]:
#engine size average for each fuel type
engine_avg_by_fuel = car_price.groupby('fuel_type')['engine_size'].mean().round(2)
engine_avg_by_fuel

Unnamed: 0_level_0,engine_size
fuel_type,Unnamed: 1_level_1
Diesel,3.44
Electric,3.56
Hybrid,3.44
Petrol,3.43


In [19]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,5764416.22
Audi,New,6551874.07
Audi,Used,6802570.04
BMW,Like New,7452003.08
BMW,New,5711768.04
BMW,Used,6224475.83
Ford,Like New,6567997.95
Ford,New,6060813.77
Ford,Used,5274047.7
Honda,Like New,6448845.74


In [20]:
#Count cars by fuel type.
fuel_type_count = car_price.groupby('fuel_type')['brand'].count()
fuel_type_count

Unnamed: 0_level_0,brand
fuel_type,Unnamed: 1_level_1
Diesel,655
Electric,614
Hybrid,601
Petrol,630


In [21]:
#the avarage price of cars for each fuel type
fuel_type_average_price = car_price.groupby('fuel_type')['price'].mean().sort_values(ascending=False).round(2)
fuel_type_average_price

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,54997.0
Hybrid,52547.39
Petrol,51767.92
Electric,51103.01


In [22]:
#the total number of cars with an automatic transmission for each fuel type
automatic_by_fuel_type = car_price[car_price['transmission'] == 'Automatic'].groupby('fuel_type')['fuel_type'].count().sort_values(ascending=False)
automatic_by_fuel_type

Unnamed: 0_level_0,fuel_type
fuel_type,Unnamed: 1_level_1
Petrol,316
Diesel,309
Electric,291
Hybrid,276


In [23]:
#the total number of cars with an manual transmission for each fuel type
manual_by_fuel_type = car_price[car_price['transmission'] == 'Manual'].groupby('fuel_type')['fuel_type'].count().sort_values(ascending=False)
manual_by_fuel_type

Unnamed: 0_level_0,fuel_type
fuel_type,Unnamed: 1_level_1
Diesel,346
Hybrid,325
Electric,323
Petrol,314


In [24]:
#the average mileage of cars for each fuel type
avg_mileage_by_fuel_type = car_price.groupby('fuel_type')['mileage'].mean().round(2)
avg_mileage_by_fuel_type

Unnamed: 0_level_0,mileage
fuel_type,Unnamed: 1_level_1
Diesel,150379.16
Electric,143897.39
Hybrid,153476.47
Petrol,151244.29


In [25]:
# the minimum mileage of cars for each fuel type
min_mileage_by_fuel = car_price.groupby('fuel_type')['mileage'].min().sort_values(ascending=True)
min_mileage_by_fuel

Unnamed: 0_level_0,mileage
fuel_type,Unnamed: 1_level_1
Electric,15
Diesel,36
Petrol,56
Hybrid,1379


In [26]:
#the minimum price of cars for each fuel type
min_price_fuel_year = car_price.groupby(['fuel_type'])['price'].min()
min_price_fuel_year

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,5022.86
Electric,5011.27
Hybrid,5124.89
Petrol,5129.96


In [27]:
#Count cars by condition
condition_count = car_price.groupby('condition')['brand'].count()
condition_count

Unnamed: 0_level_0,brand
condition,Unnamed: 1_level_1
Like New,836
New,809
Used,855


In [28]:
#cars are there for each transmission type and condition
transmission_condition_count = car_price.groupby(['transmission', 'condition'])[['brand']].count().sort_values("transmission" , ascending=False)
transmission_condition_count

Unnamed: 0_level_0,Unnamed: 1_level_0,brand
transmission,condition,Unnamed: 2_level_1
Manual,Like New,442
Manual,New,423
Manual,Used,443
Automatic,Like New,394
Automatic,New,386
Automatic,Used,412


In [29]:
#price average for each condition
price_avg_by_condition = car_price.groupby('condition')['price'].mean().round(2)
price_avg_by_condition

Unnamed: 0_level_0,price
condition,Unnamed: 1_level_1
Like New,53518.75
New,51904.51
Used,52470.92


In [30]:
#engine size average for each condition
engine_avg_by_condition = car_price.groupby('condition')['engine_size'].mean().round(2)
engine_avg_by_condition

Unnamed: 0_level_0,engine_size
condition,Unnamed: 1_level_1
Like New,3.46
New,3.44
Used,3.5


In [31]:
#the average mileage of cars for each condition
avg_mileage_owner = car_price.groupby('condition')['mileage'].mean().round(2)
avg_mileage_owner

Unnamed: 0_level_0,mileage
condition,Unnamed: 1_level_1
Like New,151986.43
New,148667.97
Used,148586.63


In [32]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,5764416.22
Audi,New,6551874.07
Audi,Used,6802570.04
BMW,Like New,7452003.08
BMW,New,5711768.04
BMW,Used,6224475.83
Ford,Like New,6567997.95
Ford,New,6060813.77
Ford,Used,5274047.7
Honda,Like New,6448845.74


In [33]:
#Count cars by model
model_count = car_price.groupby('model')['brand'].count()
model_count

Unnamed: 0_level_0,brand
model,Unnamed: 1_level_1
3 Series,93
5 Series,93
A3,98
A4,96
Accord,88
C-Class,92
CR-V,95
Camry,90
Civic,80
Corolla,103


In [34]:
#Count cars by transmission
transmission_count = car_price.groupby('transmission')['brand'].count()
transmission_count

Unnamed: 0_level_0,brand
transmission,Unnamed: 1_level_1
Automatic,1192
Manual,1308


In [35]:
#engine size average for each transmission
engine_avg_by_transmission = car_price.groupby('transmission')['engine_size'].mean().round(2)
engine_avg_by_transmission

Unnamed: 0_level_0,engine_size
transmission,Unnamed: 1_level_1
Automatic,3.47
Manual,3.46


In [36]:
#mileage average for each transmission
mileage_avg_by_transmission = car_price.groupby('transmission')['mileage'].mean().round(2)
mileage_avg_by_transmission

Unnamed: 0_level_0,mileage
transmission,Unnamed: 1_level_1
Automatic,151984.13
Manual,147713.71


In [37]:
#the average price of cars for each transmission type
average_price_transmission = car_price.groupby(['transmission'])['price'].mean().round(2)
average_price_transmission

Unnamed: 0_level_0,price
transmission,Unnamed: 1_level_1
Automatic,52691.68
Manual,52589.12


In [38]:
#cars number are there for each year of registration
registration_year_car_count = car_price.groupby('year')['year'].count().sort_values(ascending=False)
registration_year_car_count

Unnamed: 0_level_0,year
year,Unnamed: 1_level_1
2020,122
2003,118
2016,118
2022,112
2021,112
2012,111
2007,110
2002,110
2001,109
2008,109


In [39]:
#the maximum price of cars for each registration year
max_price_by_year = car_price.groupby('year')['price'].max().sort_values(ascending=False)
max_price_by_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2022,99982.59
2010,99968.62
2015,99905.9
2009,99794.46
2016,99754.42
2000,99605.33
2003,99578.74
2011,99496.42
2021,99400.47
2008,99212.85


In [40]:
#the minimum price of cars for each registration year
min_price_year = car_price.groupby(['year'])['price'].min().sort_values(ascending=False)
min_price_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2012,7563.58
2021,6846.34
2018,6728.96
2010,6690.81
2014,6645.97
2005,6629.41
2000,6493.08
2016,6483.01
2013,6398.77
2008,6176.89


In [41]:
#the average price of cars for each registration year
average_price_transmission_year = car_price.groupby(['year'])['price'].mean().round(2)
average_price_transmission_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2000,53094.16
2001,48944.06
2002,57197.48
2003,52260.19
2004,51517.98
2005,57534.32
2006,53437.6
2007,56464.2
2008,53104.79
2009,52932.74


In [42]:
#the total price of all cars for each fuel type
fuel_type_total_price = car_price.groupby('fuel_type')['price'].sum().sort_values(ascending=False)
fuel_type_total_price

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,36023033.42
Petrol,32613788.35
Hybrid,31580984.25
Electric,31377250.31


In [43]:
# the average price of cars for each brand
avg_price_brand = car_price.groupby(['brand'])['price'].mean().sort_values(ascending=False)
avg_price_brand

Unnamed: 0_level_0,price
brand,Unnamed: 1_level_1
BMW,54157.114385
Tesla,53475.547471
Mercedes,53191.090085
Toyota,52078.728235
Honda,52050.283949
Audi,51953.42481
Ford,51593.254813


In [44]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,5764416.22
Audi,New,6551874.07
Audi,Used,6802570.04
BMW,Like New,7452003.08
BMW,New,5711768.04
BMW,Used,6224475.83
Ford,Like New,6567997.95
Ford,New,6060813.77
Ford,Used,5274047.7
Honda,Like New,6448845.74


In [45]:
#the average price of cars for each transmission type and registration year
average_price_transmission = car_price.groupby(['transmission'])['price'].mean().round(2)
average_price_transmission

Unnamed: 0_level_0,price
transmission,Unnamed: 1_level_1
Automatic,52691.68
Manual,52589.12


## data preprocessing

In [46]:
car_price.head(2)

Unnamed: 0,car_id,brand,year,engine_size,fuel_type,transmission,mileage,condition,price,model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


In [47]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   car_id        2500 non-null   int64  
 1   brand         2500 non-null   object 
 2   year          2500 non-null   int64  
 3   engine_size   2500 non-null   float64
 4   fuel_type     2500 non-null   object 
 5   transmission  2500 non-null   object 
 6   mileage       2500 non-null   int64  
 7   condition     2500 non-null   object 
 8   price         2500 non-null   float64
 9   model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB


In [48]:
from sklearn.preprocessing import OneHotEncoder
car_price_1= car_price

In [49]:
onehot = OneHotEncoder(drop='first')
onehot.fit(car_price_1[["brand",	"fuel_type"	,"transmission",	"mileage",	"condition" , "model"]])

In [50]:
one_hot_carprice1 = onehot.transform(car_price_1[["brand",	"fuel_type"	,"transmission",	"mileage",	"condition" , "model"]])
one_hot_carprice1.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [51]:
carprice1_encoded = pd.DataFrame(one_hot_carprice1.toarray(), columns=onehot.get_feature_names_out())
carprice1_encoded

Unnamed: 0,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,fuel_type_Hybrid,fuel_type_Petrol,transmission_Manual,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [52]:
carprice1_final = pd.concat([car_price_1, carprice1_encoded], axis=1)
carprice1_final.drop(['brand',	'fuel_type'	,'transmission',	'condition' , 'model' , "car_id"], axis=1, inplace=True)
carprice1_final

Unnamed: 0,year,engine_size,mileage,price,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,2016,2.3,114832,26613.92,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018,4.4,143190,14679.61,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013,4.5,181601,44402.61,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,4.1,68682,86374.33,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2009,2.6,223009,73577.10,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,2020,2.4,22650,61384.10,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2496,2001,5.7,77701,24710.35,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,2021,1.1,272827,29902.45,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,2002,4.5,229164,46085.67,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## handle outliers

In [53]:
px.box(carprice1_final, x='price', points='all')

In [54]:
px.box(carprice1_final, x='mileage', points='all')

In [55]:
px.box(carprice1_final, x='engine_size', points='all')

## output and input

In [56]:
input = carprice1_final.drop('price', axis=1)
output = carprice1_final['price']

In [57]:
input

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,2016,2.3,114832,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018,4.4,143190,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013,4.5,181601,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,4.1,68682,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2009,2.6,223009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,2020,2.4,22650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2496,2001,5.7,77701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,2021,1.1,272827,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,2002,4.5,229164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [58]:
output

Unnamed: 0,price
0,26613.92
1,14679.61
2,44402.61
3,86374.33
4,73577.10
...,...
2495,61384.10
2496,24710.35
2497,29902.45
2498,46085.67


In [59]:
from sklearn.model_selection import train_test_split
input_train,input_test,output_train,output_test=train_test_split(input,output,test_size=0.2,shuffle=True,random_state=0)

In [60]:
input_train

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
286,2007,1.9,233853,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1495,2001,1.2,158651,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,2000,3.9,204120,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1106,2001,5.4,93778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1379,2013,5.0,2526,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,2008,3.1,199521,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1731,2015,4.2,69306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
763,2005,5.0,19659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
835,2009,2.3,210192,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [61]:
input_test

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
53,2007,2.2,147733,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2391,2022,5.8,38059,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2310,2014,5.0,185846,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
728,2006,3.7,277845,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
850,2002,4.4,40391,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1810,2000,2.8,188873,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2330,2016,3.0,171955,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
684,2017,2.6,190326,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1674,2011,1.2,201484,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [62]:
output_train

Unnamed: 0,price
286,96713.01
1495,70253.37
40,36094.75
1106,8434.50
1379,7102.22
...,...
1033,72016.11
1731,26081.98
763,62280.02
835,77469.18


In [63]:
output_test

Unnamed: 0,price
53,19993.56
2391,78531.99
2310,64720.06
728,21525.04
850,68399.37
...,...
1810,91057.57
2330,84575.51
684,42133.21
1674,45212.41


## scaling

In [64]:
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
input_train_r_scaled = robust_scaler.fit_transform(input_train)

In [65]:
input_train_r_scaled = pd.DataFrame(input_train_r_scaled, columns=input_train.columns)
input_train_r_scaled

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,-0.384615,-0.64,0.533547,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.846154,-0.92,0.040383,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.923077,0.16,0.338562,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.846154,0.76,-0.385044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.076923,0.60,-0.983461,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.307692,-0.16,0.308403,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.230769,0.28,-0.545528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,-0.538462,0.60,-0.871105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,-0.230769,-0.48,0.378381,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [66]:
input_test_r_scaled = robust_scaler.transform(input_test)

In [67]:
input_test_r_scaled = pd.DataFrame(input_test_r_scaled, columns=input_test.columns)
input_test_r_scaled

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,-0.384615,-0.52,-0.031215,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.769231,0.92,-0.750441,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.153846,0.60,0.218724,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.461538,0.08,0.822040,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.769231,0.36,-0.735148,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.923077,-0.28,0.238575,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.307692,-0.20,0.127629,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.384615,-0.36,0.248103,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,-0.076923,-0.92,0.321276,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
