In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
import pickle

In [2]:
# Function to train and evaluate multiple models
def get_best_model(X, y, numerical_cols=None, categorical_cols=None):
    """Cross-validate several regressors and return the best one, fitted on X and y.

    Each candidate (linear regression, decision tree, random forest, SVR) is
    wrapped in a pipeline that standard-scales the numerical columns and
    one-hot encodes the categorical columns. Candidates are compared by mean
    RMSE over 5-fold cross-validation; the lowest RMSE wins.

    Parameters
    ----------
    X : DataFrame containing at least the numerical and categorical columns.
    y : Target values aligned with the rows of X.
    numerical_cols : optional list of column names to scale. Defaults to
        ['Year', 'Kilometers_Driven', 'Engine_CC', 'Power', 'Seats', 'Mileage_KmL'].
    categorical_cols : optional list of column names to one-hot encode.
        Defaults to ['Fuel_Type', 'Transmission', 'Owner_Type'].

    Returns
    -------
    The winning Pipeline (steps 'preprocessor' and 'regressor'), fitted on the
    full X and y.

    Raises
    ------
    ValueError
        If no candidate produced a comparable (non-NaN) cross-validation score.
    """
    if numerical_cols is None:
        numerical_cols = ['Year', 'Kilometers_Driven', 'Engine_CC', 'Power', 'Seats', 'Mileage_KmL']
    if categorical_cols is None:
        categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type']

    # Define the models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Support Vector Regressor': SVR()
    }

    best_name = None
    best_model = None
    best_score = float('inf')

    for name, model in models.items():
        # Build a fresh preprocessor per candidate so pipelines never share
        # transformer instances.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                # handle_unknown='ignore' encodes categories unseen during fit
                # as all-zero rows instead of raising, so a rare category that
                # lands only in a validation fold (or in new data at predict
                # time) does not break the pipeline.
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ])

        # Create a pipeline with preprocessor and model
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', model)])

        # Evaluate model using cross-validation; the scorer returns negated
        # RMSE, so flip the sign back.
        scores = cross_val_score(pipeline, X, y, scoring='neg_root_mean_squared_error', cv=5)
        rmse = -scores.mean()

        print(f'{name} RMSE: {rmse}')

        # Update the best model if current model is better. A NaN mean never
        # compares as smaller, so such a candidate is skipped.
        if rmse < best_score:
            best_score = rmse
            best_name = name
            best_model = pipeline

    if best_model is None:
        raise ValueError('No model produced a valid cross-validation score.')

    # Report the model's display name (the pipeline step name is always
    # 'regressor', which says nothing about which candidate won).
    print(f'Best model: {best_name} with RMSE: {best_score}')

    # Fit the best model on the full training data
    best_model.fit(X, y)

    return best_model

In [3]:
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="indian-auto-mpg.csv")

In [4]:
# Missing-value report: per-column counts, a separator line, then the overall total

print(df.isnull().sum(), "-----------------------------", sep="\n")
print("Total Null Values : ", df.isnull().sum().sum())

Name                 0
Manufacturer         0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Engine_CC            0
Power                0
Seats                0
Mileage_KmL          0
Price                0
dtype: int64
-----------------------------
Total Null Values :  0


In [5]:
# Data preprocessing
# Drop rows with missing values for simplicity. The result is rebound to `df`
# rather than using inplace=True, which pandas discourages and which hides
# mutation of a frame that earlier cells have already displayed.
df = df.dropna()

In [6]:
# Build the feature matrix and the target vector
# Features: every column except the target and Name / Manufacturer / Location
X = df.drop(
    ['Price', 'Name', 'Manufacturer', 'Location'],
    axis=1,
)
# Target variable: the Price column
y = df.loc[:, 'Price']

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Compare the candidate regressors on the training split and keep the fitted winner
best_model = get_best_model(X=X_train, y=y_train)

Linear Regression RMSE: 5.855341473515645
Decision Tree RMSE: 4.607668710729338
Random Forest RMSE: 3.4439396134002336
Support Vector Regressor RMSE: 5.30144447462154
Best model: regressor with RMSE: 3.4439396134002336


In [9]:
# Score the selected pipeline on the held-out test split
test_preds = best_model.predict(X_test)
# RMSE is the square root of the mean squared error
test_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_preds))

In [10]:
# Report the test-set RMSE stored in `test_rmse`
print(f'Test RMSE of the best model: {test_rmse}')

Test RMSE of the best model: 4.870214467311351


In [11]:
# Run the fitted pipeline on a single hand-written example row
new_data = pd.DataFrame([{
    'Year': 2015,
    'Kilometers_Driven': 50000,
    'Fuel_Type': 'Diesel',
    'Transmission': 'Manual',
    'Owner_Type': 'First',
    'Engine_CC': 1500,
    'Power': 100,
    'Seats': 5,
    'Mileage_KmL': 18.0,
}])

# predict returns one value per input row; show the only one
predicted_price = best_model.predict(new_data)
print(f'Predicted Price: {predicted_price[0]}')

Predicted Price: 5.3822


In [12]:
# Save the fitted best model to disk with pickle.
# NOTE: only load this file back from a trusted location; pickle.load can
# execute arbitrary code.

filename = 'best_model.sav'
# The context manager closes (and flushes) the file handle even if dumping fails;
# the bare open() call previously left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)