In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
car_price = pd.read_csv('/content/car_price.csv')
car_price.head(2)

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


## **data cleaning**

In [3]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8750 entries, 0 to 8749
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        8750 non-null   int64  
 1   Brand         8750 non-null   object 
 2   Year          8750 non-null   int64  
 3   Engine Size   8750 non-null   float64
 4   Fuel Type     8750 non-null   object 
 5   Transmission  8750 non-null   object 
 6   Mileage       8750 non-null   int64  
 7   Condition     8750 non-null   object 
 8   Price         8750 non-null   float64
 9   Model         8750 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 683.7+ KB


In [4]:
car_price.duplicated().sum()

np.int64(0)

In [5]:
car_price.isnull().sum()

Unnamed: 0,0
Car ID,0
Brand,0
Year,0
Engine Size,0
Fuel Type,0
Transmission,0
Mileage,0
Condition,0
Price,0
Model,0


In [6]:
car_price.columns = car_price.columns.str.lower().str.replace(' ', '_')
car_price.head(2)

Unnamed: 0,car_id,brand,year,engine_size,fuel_type,transmission,mileage,condition,price,model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


In [7]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8750 entries, 0 to 8749
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   car_id        8750 non-null   int64  
 1   brand         8750 non-null   object 
 2   year          8750 non-null   int64  
 3   engine_size   8750 non-null   float64
 4   fuel_type     8750 non-null   object 
 5   transmission  8750 non-null   object 
 6   mileage       8750 non-null   int64  
 7   condition     8750 non-null   object 
 8   price         8750 non-null   float64
 9   model         8750 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 683.7+ KB


In [8]:
car_price.to_csv('car_price_updated.csv', index=False)

## **data analysis**

In [9]:
car_price.describe()

Unnamed: 0,car_id,year,engine_size,mileage,price
count,8750.0,8750.0,8750.0,8750.0,8750.0
mean,1253.659657,2011.567886,3.496114,148963.624457,53106.915877
std,720.050777,6.964052,1.471319,88119.193624,27414.331529
min,1.0,2000.0,1.0,15.0,5011.27
25%,633.0,2005.0,2.2,70602.25,29259.73
50%,1254.0,2012.0,3.5,147733.0,54194.93
75%,1873.75,2018.0,4.8,225295.0,76480.0775
max,2500.0,2023.0,6.0,299967.0,99982.59


In [10]:
car_price.describe(include = 'object')

Unnamed: 0,brand,fuel_type,transmission,condition,model
count,8750,8750,8750,8750,8750
unique,7,4,2,3,28
top,Tesla,Diesel,Manual,New,3 Series
freq,1302,2235,4410,2956,337


In [11]:
#cars are there for each brand
brand_car_count = car_price.groupby('brand')['brand'].count().sort_values(ascending=False)
brand_car_count

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Tesla,1302
Ford,1265
Mercedes,1245
Toyota,1245
BMW,1239
Audi,1236
Honda,1218


In [12]:
#cars have a manual transmission for each brand
manual_transmission_by_brand = car_price[car_price['transmission'] == 'Manual'].groupby('brand')['brand'].count().sort_values(ascending=False)
manual_transmission_by_brand

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Tesla,674
BMW,641
Mercedes,640
Audi,629
Ford,618
Toyota,614
Honda,594


In [13]:
#cars have a Automatic transmission for each brand
Automatic_transmission_by_brand = car_price[car_price['transmission'] == 'Automatic'].groupby('brand')['brand'].count().sort_values(ascending=False)
Automatic_transmission_by_brand

Unnamed: 0_level_0,brand
brand,Unnamed: 1_level_1
Ford,647
Toyota,631
Tesla,628
Honda,624
Audi,607
Mercedes,605
BMW,598


In [14]:
#engine size average for each brand
engine_avg_by_brand = car_price.groupby('brand')['engine_size'].mean().round(2)
engine_avg_by_brand

Unnamed: 0_level_0,engine_size
brand,Unnamed: 1_level_1
Audi,3.55
BMW,3.47
Ford,3.45
Honda,3.5
Mercedes,3.58
Tesla,3.45
Toyota,3.47


In [15]:
#the sum mileage of cars for each brand and fuel type combination
mileage_sum_by_brand_fuel = car_price.groupby(['brand', 'fuel_type'])[['mileage']].sum().sort_values("brand" , ascending=False)
mileage_sum_by_brand_fuel

Unnamed: 0_level_0,Unnamed: 1_level_0,mileage
brand,fuel_type,Unnamed: 2_level_1
Toyota,Petrol,47634742
Toyota,Hybrid,49608414
Toyota,Electric,40733917
Toyota,Diesel,47847431
Tesla,Petrol,47580360
Tesla,Hybrid,49903825
Tesla,Electric,51226793
Tesla,Diesel,46641336
Mercedes,Petrol,47753877
Mercedes,Hybrid,47238520


In [16]:
# the average price of cars for each brand
avg_price_brand = car_price.groupby(['brand'])['price'].mean().sort_values(ascending=False)
avg_price_brand

Unnamed: 0_level_0,price
brand,Unnamed: 1_level_1
Ford,53908.862134
BMW,53622.448224
Toyota,53436.539831
Honda,53098.453144
Mercedes,52765.903863
Tesla,52725.295207
Audi,52191.179029


In [17]:
# the average price of cars for each brand and fuel type
avg_price_brand_fuel = car_price.groupby(['brand', 'fuel_type'])['price'].mean()
avg_price_brand_fuel

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,fuel_type,Unnamed: 2_level_1
Audi,Diesel,53541.67805
Audi,Electric,52488.863937
Audi,Hybrid,52567.623766
Audi,Petrol,49963.844862
BMW,Diesel,53978.882915
BMW,Electric,53673.096213
BMW,Hybrid,55106.862968
BMW,Petrol,51715.923042
Ford,Diesel,53511.843084
Ford,Electric,53442.310714


In [18]:
#engine size average for each fuel type
engine_avg_by_fuel = car_price.groupby('fuel_type')['engine_size'].mean().round(2)
engine_avg_by_fuel

Unnamed: 0_level_0,engine_size
fuel_type,Unnamed: 1_level_1
Diesel,3.48
Electric,3.52
Hybrid,3.54
Petrol,3.45


In [19]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,18905121.19
Audi,New,23323464.32
Audi,Used,22279711.77
BMW,Like New,22314190.94
BMW,New,22281909.04
BMW,Used,21842113.37
Ford,Like New,21837665.0
Ford,New,22927067.63
Ford,Used,23429977.97
Honda,Like New,21221147.01


In [20]:
#Count cars by fuel type.
fuel_type_count = car_price.groupby('fuel_type')['brand'].count()
fuel_type_count

Unnamed: 0_level_0,brand
fuel_type,Unnamed: 1_level_1
Diesel,2235
Electric,2162
Hybrid,2232
Petrol,2121


In [21]:
#the avarage price of cars for each fuel type
fuel_type_average_price = car_price.groupby('fuel_type')['price'].mean().sort_values(ascending=False).round(2)
fuel_type_average_price

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,53820.72
Petrol,53020.6
Hybrid,52972.19
Electric,52592.78


In [22]:
#the total number of cars with an automatic transmission for each fuel type
automatic_by_fuel_type = car_price[car_price['transmission'] == 'Automatic'].groupby('fuel_type')['fuel_type'].count().sort_values(ascending=False)
automatic_by_fuel_type

Unnamed: 0_level_0,fuel_type
fuel_type,Unnamed: 1_level_1
Diesel,1128
Hybrid,1095
Electric,1063
Petrol,1054


In [23]:
#the total number of cars with an manual transmission for each fuel type
manual_by_fuel_type = car_price[car_price['transmission'] == 'Manual'].groupby('fuel_type')['fuel_type'].count().sort_values(ascending=False)
manual_by_fuel_type

Unnamed: 0_level_0,fuel_type
fuel_type,Unnamed: 1_level_1
Hybrid,1137
Diesel,1107
Electric,1099
Petrol,1067


In [24]:
#the average mileage of cars for each fuel type
avg_mileage_by_fuel_type = car_price.groupby('fuel_type')['mileage'].mean().round(2)
avg_mileage_by_fuel_type

Unnamed: 0_level_0,mileage
fuel_type,Unnamed: 1_level_1
Diesel,145870.63
Electric,148698.54
Hybrid,151579.3
Petrol,149740.5


In [25]:
# the minimum mileage of cars for each fuel type
min_mileage_by_fuel = car_price.groupby('fuel_type')['mileage'].min().sort_values(ascending=True)
min_mileage_by_fuel

Unnamed: 0_level_0,mileage
fuel_type,Unnamed: 1_level_1
Electric,15
Diesel,36
Hybrid,56
Petrol,56


In [26]:
#the minimum price of cars for each fuel type
min_price_fuel_year = car_price.groupby(['fuel_type'])['price'].min()
min_price_fuel_year

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,5022.86
Electric,5011.27
Hybrid,5011.27
Petrol,5011.27


In [27]:
#Count cars by condition
condition_count = car_price.groupby('condition')['brand'].count()
condition_count

Unnamed: 0_level_0,brand
condition,Unnamed: 1_level_1
Like New,2864
New,2956
Used,2930


In [28]:
#cars are there for each transmission type and condition
transmission_condition_count = car_price.groupby(['transmission', 'condition'])[['brand']].count().sort_values("transmission" , ascending=False)
transmission_condition_count

Unnamed: 0_level_0,Unnamed: 1_level_0,brand
transmission,condition,Unnamed: 2_level_1
Manual,Like New,1443
Manual,New,1483
Manual,Used,1484
Automatic,Like New,1421
Automatic,New,1473
Automatic,Used,1446


In [29]:
#price average for each condition
price_avg_by_condition = car_price.groupby('condition')['price'].mean().round(2)
price_avg_by_condition

Unnamed: 0_level_0,price
condition,Unnamed: 1_level_1
Like New,53068.7
New,52566.23
Used,53689.75


In [30]:
#engine size average for each condition
engine_avg_by_condition = car_price.groupby('condition')['engine_size'].mean().round(2)
engine_avg_by_condition

Unnamed: 0_level_0,engine_size
condition,Unnamed: 1_level_1
Like New,3.51
New,3.48
Used,3.5


In [31]:
#the average mileage of cars for each condition
avg_mileage_owner = car_price.groupby('condition')['mileage'].mean().round(2)
avg_mileage_owner

Unnamed: 0_level_0,mileage
condition,Unnamed: 1_level_1
Like New,149287.16
New,149331.13
Used,148276.6


In [32]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,18905121.19
Audi,New,23323464.32
Audi,Used,22279711.77
BMW,Like New,22314190.94
BMW,New,22281909.04
BMW,Used,21842113.37
Ford,Like New,21837665.0
Ford,New,22927067.63
Ford,Used,23429977.97
Honda,Like New,21221147.01


In [33]:
#Count cars by model
model_count = car_price.groupby('model')['brand'].count()
model_count

Unnamed: 0_level_0,brand
model,Unnamed: 1_level_1
3 Series,337
5 Series,294
A3,303
A4,336
Accord,327
C-Class,313
CR-V,333
Camry,309
Civic,292
Corolla,312


In [34]:
#Count cars by transmission
transmission_count = car_price.groupby('transmission')['brand'].count()
transmission_count

Unnamed: 0_level_0,brand
transmission,Unnamed: 1_level_1
Automatic,4340
Manual,4410


In [35]:
#engine size average for each transmission
engine_avg_by_transmission = car_price.groupby('transmission')['engine_size'].mean().round(2)
engine_avg_by_transmission

Unnamed: 0_level_0,engine_size
transmission,Unnamed: 1_level_1
Automatic,3.48
Manual,3.51


In [36]:
#mileage average for each transmission
mileage_avg_by_transmission = car_price.groupby('transmission')['mileage'].mean().round(2)
mileage_avg_by_transmission

Unnamed: 0_level_0,mileage
transmission,Unnamed: 1_level_1
Automatic,149314.29
Manual,148618.52


In [37]:
#the average price of cars for each transmission type
average_price_transmission = car_price.groupby(['transmission'])['price'].mean().round(2)
average_price_transmission

Unnamed: 0_level_0,price
transmission,Unnamed: 1_level_1
Automatic,53215.94
Manual,52999.62


In [38]:
#cars number are there for each year of registration
registration_year_car_count = car_price.groupby('year')['year'].count().sort_values(ascending=False)
registration_year_car_count

Unnamed: 0_level_0,year
year,Unnamed: 1_level_1
2020,400
2003,388
2021,378
2022,375
2023,374
2004,372
2005,372
2016,372
2009,371
2017,369


In [39]:
#the maximum price of cars for each registration year
max_price_by_year = car_price.groupby('year')['price'].max().sort_values(ascending=False)
max_price_by_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2006,99982.59
2005,99982.59
2022,99982.59
2020,99982.59
2016,99982.59
2010,99968.62
2015,99905.9
2021,99905.9
2009,99794.46
2018,99794.46


In [40]:
#the minimum price of cars for each registration year
min_price_year = car_price.groupby(['year'])['price'].min().sort_values(ascending=False)
min_price_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2016,5865.44
2012,5843.96
2018,5843.96
2014,5741.64
2013,5703.33
2000,5537.99
2021,5535.3
2022,5472.39
2015,5353.03
2011,5247.71


In [41]:
#the average price of cars for each registration year
average_price_transmission_year = car_price.groupby(['year'])['price'].mean().round(2)
average_price_transmission_year

Unnamed: 0_level_0,price
year,Unnamed: 1_level_1
2000,51863.26
2001,51080.53
2002,55207.68
2003,51000.79
2004,51201.66
2005,56029.92
2006,53562.91
2007,53859.21
2008,55107.13
2009,51869.35


In [42]:
#the total price of all cars for each fuel type
fuel_type_total_price = car_price.groupby('fuel_type')['price'].sum().sort_values(ascending=False)
fuel_type_total_price

Unnamed: 0_level_0,price
fuel_type,Unnamed: 1_level_1
Diesel,120289300.0
Hybrid,118233900.0
Electric,113705600.0
Petrol,112456700.0


In [44]:
#the total price of cars for each brand and condition
total_price_brand_owner = car_price.groupby(['brand', 'condition'])['price'].sum()
total_price_brand_owner

Unnamed: 0_level_0,Unnamed: 1_level_0,price
brand,condition,Unnamed: 2_level_1
Audi,Like New,18905121.19
Audi,New,23323464.32
Audi,Used,22279711.77
BMW,Like New,22314190.94
BMW,New,22281909.04
BMW,Used,21842113.37
Ford,Like New,21837665.0
Ford,New,22927067.63
Ford,Used,23429977.97
Honda,Like New,21221147.01


## data preprocessing

In [46]:
car_price.head(2)

Unnamed: 0,car_id,brand,year,engine_size,fuel_type,transmission,mileage,condition,price,model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series


In [47]:
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8750 entries, 0 to 8749
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   car_id        8750 non-null   int64  
 1   brand         8750 non-null   object 
 2   year          8750 non-null   int64  
 3   engine_size   8750 non-null   float64
 4   fuel_type     8750 non-null   object 
 5   transmission  8750 non-null   object 
 6   mileage       8750 non-null   int64  
 7   condition     8750 non-null   object 
 8   price         8750 non-null   float64
 9   model         8750 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 683.7+ KB


In [48]:
from sklearn.preprocessing import OneHotEncoder
car_price_1= car_price

In [49]:
onehot = OneHotEncoder(drop='first')
onehot.fit(car_price_1[["brand",	"fuel_type"	,"transmission",	"mileage",	"condition" , "model"]])

In [50]:
one_hot_carprice1 = onehot.transform(car_price_1[["brand",	"fuel_type"	,"transmission",	"mileage",	"condition" , "model"]])
one_hot_carprice1.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [51]:
carprice1_encoded = pd.DataFrame(one_hot_carprice1.toarray(), columns=onehot.get_feature_names_out())
carprice1_encoded

Unnamed: 0,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,fuel_type_Hybrid,fuel_type_Petrol,transmission_Manual,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8745,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8746,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8747,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8748,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [52]:
carprice1_final = pd.concat([car_price_1, carprice1_encoded], axis=1)
carprice1_final.drop(['brand',	'fuel_type'	,'transmission',	'condition' , 'model' , "car_id"], axis=1, inplace=True)
carprice1_final

Unnamed: 0,year,engine_size,mileage,price,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,2016,2.3,114832,26613.92,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018,4.4,143190,14679.61,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013,4.5,181601,44402.61,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,4.1,68682,86374.33,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2009,2.6,223009,73577.10,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8745,2004,1.0,262128,62989.13,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8746,2000,5.6,290042,11167.74,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8747,2006,5.2,145512,16264.48,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8748,2011,2.8,99468,59411.75,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## handle outliers

In [53]:
px.box(carprice1_final, x='price', points='all')

In [54]:
px.box(carprice1_final, x='mileage', points='all')

In [55]:
px.box(carprice1_final, x='engine_size', points='all')

## output and input

In [56]:
input = carprice1_final.drop('price', axis=1)
output = carprice1_final['price']

In [57]:
input

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,2016,2.3,114832,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018,4.4,143190,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013,4.5,181601,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,4.1,68682,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2009,2.6,223009,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8745,2004,1.0,262128,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8746,2000,5.6,290042,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8747,2006,5.2,145512,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8748,2011,2.8,99468,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [58]:
output

Unnamed: 0,price
0,26613.92
1,14679.61
2,44402.61
3,86374.33
4,73577.10
...,...
8745,62989.13
8746,11167.74
8747,16264.48
8748,59411.75


In [59]:
from sklearn.model_selection import train_test_split
input_train,input_test,output_train,output_test=train_test_split(input,output,test_size=0.2,shuffle=True,random_state=0)

In [60]:
input_train

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
7571,2018,4.6,52107,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7014,2018,5.6,111581,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1369,2007,4.9,164721,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
773,2012,1.2,200903,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1514,2021,4.9,278491,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4373,2015,5.7,258466,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7891,2015,1.1,150943,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4859,2009,5.8,225956,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3264,2017,4.6,170715,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
input_test

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
7057,2008,1.5,265954,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1638,2018,4.9,69072,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277,2012,4.6,192081,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5546,2015,2.5,194292,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2570,2023,1.3,22222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2931,2007,3.0,148430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5007,2012,5.6,167720,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1563,2011,2.6,153868,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1580,2007,5.4,275731,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
output_train

Unnamed: 0,price
7571,46834.12
7014,47162.21
1369,62117.43
773,17165.24
1514,74400.55
...,...
4373,5353.03
7891,9730.07
4859,76481.99
3264,11508.22


In [63]:
output_test

Unnamed: 0,price
7057,80341.84
1638,73142.61
277,83194.08
5546,27698.24
2570,62317.77
...,...
2931,93126.06
5007,58688.87
1563,49235.03
1580,27898.02


## scaling

In [64]:
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
input_train_r_scaled = robust_scaler.fit_transform(input_train)

In [65]:
input_train_r_scaled = pd.DataFrame(input_train_r_scaled, columns=input_train.columns)
input_train_r_scaled

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,0.538462,0.423077,-0.611568,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.538462,0.807692,-0.225408,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.307692,0.538462,0.119625,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.076923,-0.884615,0.354552,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.769231,0.538462,0.858325,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,0.307692,0.846154,0.728304,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6996,0.307692,-0.923077,0.030166,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,-0.153846,0.884615,0.517219,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,0.461538,0.423077,0.158544,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
input_test_r_scaled = robust_scaler.transform(input_test)

In [67]:
input_test_r_scaled = pd.DataFrame(input_test_r_scaled, columns=input_test.columns)
input_test_r_scaled

Unnamed: 0,year,engine_size,mileage,brand_BMW,brand_Ford,brand_Honda,brand_Mercedes,brand_Tesla,brand_Toyota,fuel_type_Electric,...,model_Model S,model_Model X,model_Model Y,model_Mustang,model_Prius,model_Q5,model_Q7,model_RAV4,model_X3,model_X5
0,-0.230769,-0.769231,0.776923,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.538462,0.538462,-0.501415,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.076923,0.423077,0.297272,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.307692,-0.384615,0.311628,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.923077,-0.846154,-0.805609,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,-0.307692,-0.192308,0.013849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1746,0.076923,0.807692,0.139098,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1747,0.000000,-0.346154,0.049158,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1748,-0.307692,0.730769,0.840404,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
