In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
# import plotly.express as px


In [2]:
# Load the raw car listings; the relative path is resolved against the working directory.
dataset = pd.read_csv(filepath_or_buffer='Car details v3.csv')

In [3]:
# Separate the prediction target (selling_price) from the feature columns.
y = dataset['selling_price']
X = dataset.drop(columns='selling_price')

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Display the training features (the rendered preview is truncated by pandas).
X_train

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
6518,Tata Tiago NRG Petrol AMT,2019,2560,Petrol,Individual,Automatic,First Owner,24.0 kmpl,1199 CC,83.81 bhp,114Nm@ 3500rpm,5.0
6144,Honda Brio S MT,2013,80000,Petrol,Individual,Manual,Second Owner,19.4 kmpl,1198 CC,86.8 bhp,109Nm@ 4500rpm,5.0
6381,Hyundai i20 1.4 CRDi Asta,2011,150000,Diesel,Individual,Manual,Fourth & Above Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
438,Maruti Swift Dzire VDI,2013,120000,Diesel,Individual,Manual,Second Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
5939,Maruti Alto K10 VXI,2017,25000,Petrol,Individual,Manual,First Owner,23.95 kmpl,998 CC,67.05 bhp,90Nm@ 3500rpm,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5226,Mahindra Scorpio 2006-2009 VLX 2WD 7 Str BSIII,2009,120000,Diesel,Individual,Manual,First Owner,12.05 kmpl,2179 CC,120 bhp,290Nm@ 1800-2800rpm,7.0
5390,Maruti Swift Dzire VDI,2014,80000,Diesel,Individual,Manual,Second Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
860,Hyundai i20 Asta Option 1.2,2016,35000,Petrol,Individual,Manual,First Owner,18.6 kmpl,1197 CC,81.83 bhp,114.7Nm@ 4000rpm,5.0
7603,Maruti Swift Dzire VDI,2019,27000,Diesel,Individual,Manual,First Owner,28.4 kmpl,1248 CC,74.02 bhp,190Nm@ 2000rpm,5.0


In [6]:
# Show dtype and non-null count per column of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6502 entries, 6518 to 7270
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          6502 non-null   object 
 1   year          6502 non-null   int64  
 2   km_driven     6502 non-null   int64  
 3   fuel          6502 non-null   object 
 4   seller_type   6502 non-null   object 
 5   transmission  6502 non-null   object 
 6   owner         6502 non-null   object 
 7   mileage       6332 non-null   object 
 8   engine        6332 non-null   object 
 9   max_power     6336 non-null   object 
 10  torque        6331 non-null   object 
 11  seats         6332 non-null   float64
dtypes: float64(1), int64(2), object(9)
memory usage: 660.4+ KB


In [8]:
def engineAdjuster(s:str):
    """Parse an engine-displacement string such as ``'1199 CC'`` into a float.

    The first whitespace-separated token is tried first. If that is not a
    number (for example the unit is glued on, as in ``'1199CC'``), the text
    before the first ``CC`` marker is tried instead; that marker is matched
    case-insensitively.

    Args:
        s: Raw engine value. Non-string input such as ``None`` or ``NaN`` is
            tolerated.

    Returns:
        The number as a ``float``, or ``None`` when nothing can be parsed.
    """
    # Only parsing failures are swallowed; a bare ``except`` would also hide
    # KeyboardInterrupt and SystemExit.
    parse_errors=(AttributeError,IndexError,TypeError,ValueError)
    try:
        return float(s.split()[0])
    except parse_errors:
        pass
    try:
        # Upper-casing first lets 'cc' / 'Cc' match the 'CC' separator as well.
        return float(s.upper().split('CC')[0])
    except parse_errors:
        return None
        
def mileageAdjuster(s:str):
    """Parse a mileage string such as ``'24.0 kmpl'`` into a float.

    Only the first whitespace-separated token is read; the unit that follows
    it is ignored, so values are not converted between units.

    Args:
        s: Raw mileage value. Non-string input such as ``None`` or ``NaN`` is
            tolerated.

    Returns:
        The leading number as a ``float``, or ``None`` when it cannot be parsed.
    """
    try:
        return float(s.split()[0])
    # Swallow parsing failures only, so KeyboardInterrupt/SystemExit still propagate.
    except (AttributeError,IndexError,TypeError,ValueError):
        return None

def maxPowerAdjuster(s:str):
    """Parse a max-power string such as ``'83.81 bhp'`` into a float.

    Only the first whitespace-separated token is read; a value that consists
    of the unit alone (``' bhp'``) therefore yields ``None``.

    Args:
        s: Raw max-power value. Non-string input such as ``None`` or ``NaN``
            is tolerated.

    Returns:
        The leading number as a ``float``, or ``None`` when it cannot be parsed.
    """
    try:
        return float(s.split()[0])
    # Swallow parsing failures only, so KeyboardInterrupt/SystemExit still propagate.
    except (AttributeError,IndexError,TypeError,ValueError):
        return None
    
def torqueAdjuster(s:str):
    """Parse the leading torque figure of a string and return it in newton-metres.

    The number is the run of digits and dots at the start of the text
    (surrounding whitespace is ignored). If the text mentions ``Nm`` in any
    letter case the number is returned unchanged; otherwise it is taken to be
    in kgm and multiplied by 9.80665.

    Args:
        s: Raw torque value, e.g. ``'190Nm@ 2000rpm'`` or
            ``'22.4 kgm at 1750-2750rpm'``. Non-string input such as ``None``
            or ``NaN`` is tolerated.

    Returns:
        Torque as a ``float``, or ``None`` when no leading number can be parsed.
    """
    kgm_to_nm=9.80665
    try:
        text=s.strip()
        leading=''
        for ch in text:
            if ch.isdigit() or ch=='.':
                leading+=ch
            else:
                break
        value=float(leading)
    # Swallow parsing failures only, so KeyboardInterrupt/SystemExit still propagate.
    except (AttributeError,TypeError,ValueError):
        return None
    # Case-insensitive unit check: 'NM', 'Nm', 'nm' and 'nM' all mean newton-metres.
    if 'nm' in text.lower():
        return value
    # NOTE(review): strings with no unit at all are still treated as kgm here - confirm against the data.
    return value*kgm_to_nm

In [9]:
class CustomSimpleImputer(BaseEstimator,TransformerMixin):
    """Handle missing values by dropping incomplete rows or mode-imputing them.

    Args:
        drop_na: When True, ``transform`` removes every row that contains a
            missing value. When False, each missing value is replaced with the
            most frequent value of its column.
    """
    def __init__(self,drop_na=False):
        self.drop_na=drop_na
    def fit(self,X,y=None):
        """Learn nothing and return ``self``.

        ``y`` is accepted and ignored so the step also works when the
        enclosing pipeline is fitted with a target.
        """
        return self
    def transform(self,X:pd.DataFrame):
        """Return a new frame with missing values dropped or imputed.

        Args:
            X: Input frame; it is not modified.

        Returns:
            A DataFrame with the same columns as ``X``. In both modes the
            surviving rows keep their original index labels.
        """
        if self.drop_na:
            return X.dropna()
        # NOTE(review): the imputer is fitted on whatever frame is passed in, so
        # the fill values come from X itself rather than from earlier data.
        filled=SimpleImputer(strategy='most_frequent').fit_transform(X)
        # Re-attach the index so rows can still be matched to the source frame.
        return pd.DataFrame(filled,columns=X.columns,index=X.index)

In [10]:
class NumericTransformer(BaseEstimator,TransformerMixin):
    """Replace the unit-bearing text columns with parsed numeric columns.

    ``name`` is dropped, and ``mileage``, ``engine``, ``max_power`` and
    ``torque`` are each replaced by a parsed column appended at the end of the
    frame (``mileage_no``, ``engine_no``, ``max_power_no``, ``torque_nm``).
    """
    def fit(self,X,y=None):
        """Learn nothing and return ``self``.

        ``y`` is accepted and ignored so the step also works when the
        enclosing pipeline is fitted with a target.
        """
        return self
    def transform(self,X:pd.DataFrame):
        """Return a new frame with the parsed columns; ``X`` is not modified."""
        # (source column, derived column, parser), in the order the derived
        # columns are appended to the frame.
        conversions=(
            ('mileage','mileage_no',mileageAdjuster),
            ('engine','engine_no',engineAdjuster),
            ('max_power','max_power_no',maxPowerAdjuster),
            ('torque','torque_nm',torqueAdjuster),
        )
        dataset=X.drop('name',axis=1)
        for source,target,parser in conversions:
            dataset[target]=dataset[source].apply(parser)
            dataset=dataset.drop(source,axis=1)
        return dataset
        

In [11]:
# fuel, seller_type, owner, transmission are categorical

In [12]:
# Integer codes for the `owner` labels, numbered from 1 in the order listed.
owner_dict = {
    label: code
    for code, label in enumerate(
        (
            'First Owner',
            'Second Owner',
            'Third Owner',
            'Fourth & Above Owner',
            'Test Drive Car',
        ),
        start=1,
    )
}
# Integer codes for the `transmission` labels.
transmission_dict = {
    'Manual': 1,
    'Automatic': 2,
}

In [13]:
# List the distinct `owner` labels present in the full dataset.
dataset['owner'].unique()


array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [14]:
class CategoricalTransformer(BaseEstimator,TransformerMixin):
    """Encode the categorical text columns as numbers.

    ``fuel`` and ``seller_type`` become 0/1 indicator columns, ``owner`` and
    ``transmission`` are mapped to integer codes through the module-level
    ``owner_dict`` and ``transmission_dict``, and the four source columns are
    dropped.
    """
    def fit(self,X,y=None):
        """Learn nothing and return ``self``.

        ``y`` is accepted and ignored so the step also works when the
        enclosing pipeline is fitted with a target.
        """
        return self
    def transform(self,X:pd.DataFrame):
        """Return a new frame with the encoded columns; ``X`` is not modified."""
        # (indicator column, source column, category), in output column order.
        indicators=(
            ('f_diesel','fuel','Diesel'),
            ('f_petrol','fuel','Petrol'),
            ('f_lpg','fuel','LPG'),
            ('f_cng','fuel','CNG'),
            ('seller_type_individual','seller_type','Individual'),
            ('seller_type_dealer','seller_type','Dealer'),
            ('seller_type_trusted_dealer','seller_type','Trustmark Dealer'),
        )
        X_copy=X.copy()
        for flag,source,category in indicators:
            # Assign the whole column at once: an in-place mask() on a selected
            # column does not write back to the frame under pandas Copy-on-Write.
            X_copy[flag]=(X_copy[source]==category).astype('int64')

        # Labels absent from the dictionaries map to NaN.
        X_copy['owner_no']=X_copy['owner'].map(owner_dict)
        X_copy['transmission_no']=X_copy['transmission'].map(transmission_dict)

        return X_copy.drop(['fuel','owner','seller_type','transmission'],axis=1)
        

In [16]:
# Work on an independent copy so the loaded frame stays untouched.
dataset_copy = dataset.copy(deep=True)

In [17]:
# Fill every missing value with its column's most frequent value and
# rebuild a DataFrame with the original column labels.
temp1 = pd.DataFrame(
    SimpleImputer(strategy='most_frequent').fit_transform(dataset_copy),
    columns=dataset_copy.columns,
)

In [18]:
# Show the rows in which mileage, engine and seats are all missing at once.
dataset.loc[dataset[['mileage', 'engine', 'seats']].isnull().all(axis=1)]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
13,Maruti Swift 1.3 VXi,2007,200000,80000,Petrol,Individual,Manual,Second Owner,,,,,
31,Fiat Palio 1.2 ELX,2003,70000,50000,Petrol,Individual,Manual,Second Owner,,,,,
78,Tata Indica DLS,2003,50000,70000,Diesel,Individual,Manual,First Owner,,,,,
87,Maruti Swift VDI BSIV W ABS,2015,475000,78000,Diesel,Dealer,Manual,First Owner,,,,,
119,Maruti Swift VDI BSIV,2010,300000,120000,Diesel,Individual,Manual,Second Owner,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7846,Toyota Qualis Fleet A3,2000,200000,100000,Diesel,Individual,Manual,First Owner,,,,,
7996,Hyundai Santro LS zipPlus,2000,140000,50000,Petrol,Individual,Manual,Second Owner,,,,,
8009,Hyundai Santro Xing XS eRLX Euro III,2006,145000,80000,Petrol,Individual,Manual,Second Owner,,,,,
8068,Ford Figo Aspire Facelift,2017,580000,165000,Diesel,Individual,Manual,First Owner,,,,,


In [33]:
# Passed as drop_na when full_pipeline is built: True makes the imputer step drop rows with missing values.
isTrainDataset=True

In [34]:
# Preprocessing chain: handle missing values, parse the unit-bearing text
# columns into numbers, then encode the categorical columns.
# drop_na receives the current value of isTrainDataset at construction time.
full_pipeline = Pipeline(
    steps=[
        ('imputer', CustomSimpleImputer(drop_na=isTrainDataset)),
        ('num_transformer', NumericTransformer()),
        ('cat_transformer', CategoricalTransformer()),
    ]
)

In [35]:
# Re-attach the target so features and price pass through the pipeline together
# (rows removed by the pipeline then lose their price as well).
train_set = X_train.assign(selling_price=y_train)

In [36]:
# Display the training frame, now including the selling_price column.
train_set

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,selling_price
6518,Tata Tiago NRG Petrol AMT,2019,2560,Petrol,Individual,Automatic,First Owner,24.0 kmpl,1199 CC,83.81 bhp,114Nm@ 3500rpm,5.0,520000
6144,Honda Brio S MT,2013,80000,Petrol,Individual,Manual,Second Owner,19.4 kmpl,1198 CC,86.8 bhp,109Nm@ 4500rpm,5.0,300000
6381,Hyundai i20 1.4 CRDi Asta,2011,150000,Diesel,Individual,Manual,Fourth & Above Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,380000
438,Maruti Swift Dzire VDI,2013,120000,Diesel,Individual,Manual,Second Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,530000
5939,Maruti Alto K10 VXI,2017,25000,Petrol,Individual,Manual,First Owner,23.95 kmpl,998 CC,67.05 bhp,90Nm@ 3500rpm,5.0,335000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,Mahindra Scorpio 2006-2009 VLX 2WD 7 Str BSIII,2009,120000,Diesel,Individual,Manual,First Owner,12.05 kmpl,2179 CC,120 bhp,290Nm@ 1800-2800rpm,7.0,475000
5390,Maruti Swift Dzire VDI,2014,80000,Diesel,Individual,Manual,Second Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,530000
860,Hyundai i20 Asta Option 1.2,2016,35000,Petrol,Individual,Manual,First Owner,18.6 kmpl,1197 CC,81.83 bhp,114.7Nm@ 4000rpm,5.0,576000
7603,Maruti Swift Dzire VDI,2019,27000,Diesel,Individual,Manual,First Owner,28.4 kmpl,1248 CC,74.02 bhp,190Nm@ 2000rpm,5.0,770000


In [37]:
# Fit the preprocessing pipeline on the training frame and transform it in one call.
train_set_final = full_pipeline.fit_transform(train_set)

In [38]:
# Write the processed training frame to disk without the index column.
train_set_final.to_csv(path_or_buf='train_set.csv', index=False)

In [39]:
# Switch to test-set handling: impute missing values instead of dropping rows.
isTrainDataset=False
# full_pipeline received the flag's value when it was built, so rebinding the
# variable alone changes nothing; push the new value into the existing step.
full_pipeline=full_pipeline.set_params(imputer__drop_na=isTrainDataset)

In [40]:
# Re-attach the target so features and price pass through the pipeline together.
test_set = X_test.assign(selling_price=y_test)

In [41]:
# full_pipeline was already fitted on the training frame; only apply it here,
# so preprocessing is never fitted on held-out data.
test_set_final=full_pipeline.transform(test_set)

In [42]:
# Write the processed test frame to disk without the index column.
test_set_final.to_csv(path_or_buf='test_set.csv', index=False)