In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler ,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [4]:
df=pd.read_csv('car-details.csv')

In [7]:
df.sample(5)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
2877,Tata Indigo CS LS (TDI) BS-III,Tata,Indigo,CS LS (TDI) BS-III,2015,Second,Diesel,Individual,Manual,120000,44.88,1405.0,69.01,135.0,5.0,295000
6669,BMW 5 Series 520d Sport Line,BMW,5,Series 520d Sport Line,2009,Second,Diesel,Dealer,Automatic,80000,52.8,1995.0,187.74,400.0,5.0,975000
3185,Honda Amaze S AT i-Vtech,Honda,Amaze,S AT i-Vtech,2015,First,Petrol,Individual,Automatic,45000,36.44,1198.0,86.7,109.0,5.0,415000
548,Mahindra Scorpio 1.99 S10,Mahindra,Scorpio,1.99 S10,2016,First,Diesel,Individual,Manual,100000,36.2,1997.0,120.0,280.0,7.0,1100000
2495,Hyundai i10 Magna 1.1L,Hyundai,i10,Magna 1.1L,2012,First,Petrol,Individual,Manual,110000,46.56,1086.0,68.05,99.04,5.0,225000


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [12]:
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [13]:
df.shape

(6926, 16)

In [14]:
df=df.drop(columns=['name','model','edition'])

In [17]:
df.duplicated().sum()

19

In [18]:
df.drop_duplicates()
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [None]:
x=df.drop(columns=['selling_price'])
y=df.selling_price.copy()
print(x.shape,y.shape)

(6926, 12) (6926,)


In [23]:
x_train ,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(5540, 12) (5540,)
(1386, 12) (1386,)


In [32]:
num_cols=x_train.select_dtypes(include='number').columns.tolist()
cat_cols=[col for col in x_train.columns if col not in num_cols]
print(num_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']


In [39]:
num_pipe=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

cat_pipe=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False))

])


preprocessor=ColumnTransformer(transformers=[
    ('num',num_pipe,num_cols),
    ('cat',cat_pipe,cat_cols)
])
regressor=RandomForestRegressor(
    n_estimators=10,max_depth=5,random_state=42
)

rf_model=Pipeline(steps=[
    ('pre',preprocessor),
    ('reg',regressor)
])

rf_model.fit(x_train,y_train)


In [46]:
y_train_pred=rf_model.predict(x_train)
train_rmse=mean_squared_error(y_train,y_train_pred,squared=False)

y_test_pred=rf_model.predict(x_test)
train_rmse=mean_squared_error(y_test,y_test_pred,squared=False)

