# Create model with more features

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
df_filtered = pd.read_csv('C:\\Users\\Lenovo\\Downloads\\data\\df_filtered.csv')

print(df_filtered.head())

df_filtered.dtypes

        City FuelType   BodyType  KmsDriven TransmissionType  NumberOwner  \
0  Bangalore   Petrol  Hatchback     120000           Manual            3   
1  Bangalore   Petrol        SUV      32706           Manual            2   
2  Bangalore   Petrol  Hatchback      11949           Manual            1   
3  Bangalore   Petrol      Sedan      17794           Manual            1   
4  Bangalore   Diesel        SUV      60000           Manual            1   

       Insurance Manufacturer            CarModel  ModelYear  \
0    Third Party       Maruti      Maruti Celerio       2015   
1  Comprehensive         Ford       Ford Ecosport       2018   
2  Comprehensive         Tata          Tata Tiago       2018   
3  Comprehensive      Hyundai       Hyundai Xcent       2014   
4    Third Party       Maruti  Maruti SX4 S Cross       2015   

                VariantName  Mileage  Engine   Price  No of Cylinder  Seats  \
0                       VXI       23     998  400000               3     

City                object
FuelType            object
BodyType            object
KmsDriven            int64
TransmissionType    object
NumberOwner          int64
Insurance           object
Manufacturer        object
CarModel            object
ModelYear            int64
VariantName         object
Mileage              int64
Engine               int64
Price                int64
No of Cylinder       int64
Seats                int64
AgeOfCar             int64
dtype: object

# Select all features in df_filtered dataset


In [3]:
# Features (X): all columns except 'Price'
X = df_filtered[['City', 'FuelType', 'BodyType', 'TransmissionType', 'Insurance', 'Manufacturer', 'CarModel', 'VariantName',
        'KmsDriven', 'NumberOwner', 'Mileage', 'Engine', 'No of Cylinder', 'Seats', 'AgeOfCar']]

# Target (y): the 'Price' column
y = df_filtered['Price']

# Create model with all features

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score

# Define categorical and numerical features
categorical_features = ['City', 'FuelType', 'BodyType', 'TransmissionType', 'Insurance', 'Manufacturer', 'CarModel', 'VariantName']
numerical_features = ['KmsDriven', 'NumberOwner', 'Mileage', 'Engine', 'No of Cylinder', 'AgeOfCar']

# OneHotEncode categorical features and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # Ignore unknown categories
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Initialize and train a Random Forest model
model = RandomForestRegressor(n_estimators=25, random_state=42)
model.fit(X_train, y_train)

# Predict on test set and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = (np.sqrt(mean_squared_error(y_true= y_test, y_pred= y_pred)))
evs = (explained_variance_score(y_true= y_test, y_pred= y_pred))
r_square_score = (r2_score(y_true= y_test, y_pred= y_pred))
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squarred Error: {rmse}")
print(f"Explained Variance Score: {evs}")
print(f"R-Sqaure Score(Accuracy): {r_square_score}")

Mean Squared Error: 128695432944.5775
Root Mean Squarred Error: 358741.45696389413
Explained Variance Score: 0.9027379202945107
R-Sqaure Score(Accuracy): 0.9026954304684864


### RandomForestRegressor accuracy rate is 94%

In [5]:
import pickle
    
with open('random_forest_model_with_preprocessor.pkl', 'wb') as file:
    pickle.dump((model, preprocessor), file)   

In [6]:
import pickle
import pandas as pd

# Load the saved model
model_filename = 'random_forest_model_with_preprocessor.pkl'
with open(model_filename, 'rb') as file:
    loaded_model, preprocessor = pickle.load(file)

# Define new car data for prediction
new_data = {
    'City': 'Mumbai',
    'FuelType': 'Petrol',
    'BodyType': 'Sedan',
    'TransmissionType': 'Manual',
    'Insurance': 'Yes',
    'Manufacturer': 'Maruthi',
    'CarModel': 'Swift',
    'VariantName': 'VX',
    'KmsDriven': 50000,
    'NumberOwner': 1,
    'ModelYear': 2015,
    'Mileage': 18.0,
    'Engine': 1497,
    'No of Cylinder': 4,
    'Seats': 5,
    'AgeOfCar': 8
}

# Convert new data to a DataFrame
new_data_df = pd.DataFrame([new_data])

# Apply the same preprocessing used during training
new_data_preprocessed = preprocessor.transform(new_data_df)

# Predict the price
predicted_price = loaded_model.predict(new_data_preprocessed)

print(f"Predicted Price: {predicted_price[0]}")

Predicted Price: 634520.0
