# Pipeline

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from joblib import dump, load

In [2]:
# Read the car listings CSV into a DataFrame and preview the first rows.
data_path = 'csv_files_car_dheko/CarData_Cleaned_UsedCarPricePrediction.csv'
car_df = pd.read_csv(data_path)
car_df.head()

Unnamed: 0,body_type,km,ownerno,brand,model,modelyear,price_in_lakh,insurance_validity,fuel_type,seats,transmission,top_features_count,comfort_count,safety_count,mileage,color,displacement,city
0,Hatchback,120000,3,Maruti,Maruti Celerio,2015,4.0,Third Party insurance,Petrol,5,Manual,8,10,13,23.1,White,998,Bangalore
1,SUV,32706,2,Ford,Ford Ecosport,2018,8.11,Comprehensive,Petrol,5,Manual,9,17,27,19.095,White,1497,Bangalore
2,Hatchback,11949,1,Tata,Tata Tiago,2018,5.85,Comprehensive,Petrol,5,Manual,9,14,24,23.84,Red,1199,Bangalore
3,Sedan,17794,1,Hyundai,Hyundai Xcent,2014,4.62,Comprehensive,Petrol,5,Manual,9,16,18,19.1,Others,1197,Bangalore
4,SUV,60000,1,Maruti,Maruti SX4 S Cross,2015,7.9,Third Party insurance,Diesel,5,Manual,9,21,22,23.65,Gray,1248,Bangalore


In [5]:
# Separate the target column from the predictor columns.
target_col = 'price_in_lakh'
y = car_df[target_col]
x = car_df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Show the feature names that will be fed to the pipeline.
print(list(x_train.columns))

['body_type', 'km', 'ownerno', 'brand', 'model', 'modelyear', 'insurance_validity', 'fuel_type', 'seats', 'transmission', 'top_features_count', 'comfort_count', 'safety_count', 'mileage', 'color', 'displacement', 'city']


In [15]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numeric. The target is not in `x`.
categorical_columns = list(x.select_dtypes(include='object').columns)
numeric_columns = list(x.select_dtypes(exclude='object').columns)

# Numeric branch: standardise each column.
num_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
cat_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_columns),
        ('cat', cat_transformer, categorical_columns),
    ]
)

In [20]:
# Chain the preprocessing and the regressor so they are fitted and applied
# together as a single estimator.
forest = RandomForestRegressor(n_estimators=200, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', forest),
    ]
)

# Fit on the training split only; the test split is kept for evaluation.
pipeline.fit(x_train, y_train)

# Model Evaluation

In [26]:
# Score the fitted pipeline on the held-out test split.
model_pred = pipeline.predict(x_test)

mse = mean_squared_error(y_test, model_pred)
r2 = r2_score(y_test, model_pred)
mae = mean_absolute_error(y_test, model_pred)

print(f'Random Forest Regression MSE: {mse}')
print(f'Random Forest Regression R2 Score: {r2}')
print(f'Random Forest Regression MAE: {mae}')

Random Forest Regression MSE: 44.701847031925446
Random Forest Regression R2 Score: 0.701645585931566
Random Forest Regression MAE: 2.1671917145593866


In [27]:
# Actual target value (price_in_lakh) of the first test row, shown for
# comparison with the single-row prediction made further down.
y_test.iloc[0]

4.8

In [31]:
# Feature values of the first test row; iloc[0] returns them as a Series.
x_test.iloc[0]

body_type                         Hatchback
km                                    75000
ownerno                                   1
brand                                Maruti
model                          Maruti Swift
modelyear                              2014
insurance_validity    Third Party insurance
fuel_type                            Diesel
seats                                     5
transmission                         Manual
top_features_count                        9
comfort_count                            11
safety_count                             15
mileage                                22.9
color                                  Grey
displacement                           1248
city                              Hyderabad
Name: 5176, dtype: object

In [32]:
# Predict the price for the first test row.
# A list indexer (iloc[[0]]) returns a one-row DataFrame that keeps every
# column's original dtype. Going through iloc[0].to_frame().T builds the row
# from a Series, which turns all columns into object dtype and leaves the
# numeric features to be coerced implicitly inside the pipeline.
sample_row = x_test.iloc[[0]]
pred = pipeline.predict(sample_row)
print(f'Predicted Price: {pred[0]}')

Predicted Price: 4.42115


# Training on the whole dataset

In [33]:
# Refit the same pipeline object on every row (train and test together).
# After this cell runs, metrics computed on x_test are no longer
# out-of-sample, because the model has seen those rows.
pipeline.fit(X=x, y=y)

# save model

In [38]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
model_path = 'RFmodel.joblib'
dump(pipeline, model_path)

['RFmodel.joblib']