In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
file_path = '/content/drive/MyDrive/Cars.csv'
df = pd.read_csv(file_path)


In [4]:
print(df.head())

                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014         450000     145500  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014         370000     120000  Diesel   
2      Honda City 2017-2020 EXi  2006         158000     140000  Petrol   
3     Hyundai i20 Sportz Diesel  2010         225000     127000  Diesel   
4        Maruti Swift VXI BSIII  2007         130000     120000  Petrol   

  seller_type transmission         owner     mileage   engine   max_power  \
0  Individual       Manual   First Owner   23.4 kmpl  1248 CC      74 bhp   
1  Individual       Manual  Second Owner  21.14 kmpl  1498 CC  103.52 bhp   
2  Individual       Manual   Third Owner   17.7 kmpl  1497 CC      78 bhp   
3  Individual       Manual   First Owner   23.0 kmpl  1396 CC      90 bhp   
4  Individual       Manual   First Owner   16.1 kmpl  1298 CC    88.2 bhp   

                     torque  seats  
0            190Nm@ 2000rpm    5.0  
1       250N

In [5]:
# Map the 'owner' feature
owner_mapping = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5
}
df['owner'] = df['owner'].map(owner_mapping)


In [6]:
# Remove rows with 'fuel' as 'CNG' or 'LPG'
df = df[~df['fuel'].isin(['CNG', 'LPG'])]

# Process 'mileage' to remove "kmpl" and convert to float
df['mileage'] = df['mileage'].str.split(' ').str[0].astype(float)

# Process 'engine' to remove "CC" and convert to float
df['engine'] = df['engine'].str.replace('CC', '').astype(float)

# Process 'max_power' to remove "bhp" and convert to float
df['max_power'] = df['max_power'].str.replace('bhp', '').astype(float)

# Process 'name' to extract the brand
df['brand'] = df['name'].str.split(' ').str[0]

# Drop the 'torque' feature
df = df.drop(columns=['torque'])

# Remove rows with 'owner' as 5 (Test Drive Car)
df = df[df['owner'] != 5]

# Log transform the 'selling_price'
df['selling_price'] = np.log(df['selling_price'])

# Save the cleaned dataset (optional)
df.to_csv('cleaned_dataset.csv', index=False)

# Display the first few rows of the cleaned dataset
print(df.head())

                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014      13.017003     145500  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014      12.821258     120000  Diesel   
2      Honda City 2017-2020 EXi  2006      11.970350     140000  Petrol   
3     Hyundai i20 Sportz Diesel  2010      12.323856     127000  Diesel   
4        Maruti Swift VXI BSIII  2007      11.775290     120000  Petrol   

  seller_type transmission  owner  mileage  engine  max_power  seats    brand  
0  Individual       Manual      1    23.40  1248.0      74.00    5.0   Maruti  
1  Individual       Manual      2    21.14  1498.0     103.52    5.0    Skoda  
2  Individual       Manual      3    17.70  1497.0      78.00    5.0    Honda  
3  Individual       Manual      1    23.00  1396.0      90.00    5.0  Hyundai  
4  Individual       Manual      1    16.10  1298.0      88.20    5.0   Maruti  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mileage'] = df['mileage'].str.split(' ').str[0].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['engine'] = df['engine'].str.replace('CC', '').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['max_power'] = df['max_power'].str.replace('bhp', '').astype(float)
A va

In [7]:
# Check for non-string or missing values in 'mileage'
print(df['mileage'].isnull().sum())  # Check for NaN values
print(df['mileage'].dtype)  # Check the data type of 'mileage'



214
float64


In [8]:

# Convert non-string or missing 'mileage' values to a default value
# Fill missing values with a placeholder (e.g., '0 kmpl') before processing
df['mileage'] = df['mileage'].fillna('0 kmpl')

In [9]:
# Ensure all values are strings before splitting
df['mileage'] = df['mileage'].astype(str).str.split(' ').str[0].replace('', '0').astype(float)

In [10]:
print(df['mileage'].isnull().sum())  # Check for NaN values
print(df['mileage'].dtype)

0
float64


In [11]:
print(df.head())  # Display first rows
print(df.info())  # Check column data types and null counts


                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014      13.017003     145500  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014      12.821258     120000  Diesel   
2      Honda City 2017-2020 EXi  2006      11.970350     140000  Petrol   
3     Hyundai i20 Sportz Diesel  2010      12.323856     127000  Diesel   
4        Maruti Swift VXI BSIII  2007      11.775290     120000  Petrol   

  seller_type transmission  owner  mileage  engine  max_power  seats    brand  
0  Individual       Manual      1    23.40  1248.0      74.00    5.0   Maruti  
1  Individual       Manual      2    21.14  1498.0     103.52    5.0    Skoda  
2  Individual       Manual      3    17.70  1497.0      78.00    5.0    Honda  
3  Individual       Manual      1    23.00  1396.0      90.00    5.0  Hyundai  
4  Individual       Manual      1    16.10  1298.0      88.20    5.0   Maruti  
<class 'pandas.core.frame.DataFrame'>
Index: 8028 entries, 0 to 8127


In [13]:
# Fill missing values in 'engine' with the median
df['engine'] = df['engine'].fillna(df['engine'].median())

# Fill missing values in 'max_power' with the median
df['max_power'] = df['max_power'].fillna('0 bhp')  # Replace NaN with placeholder
df['max_power'] = df['max_power'].str.replace('bhp', '', regex=False).astype(float)

# Fill missing values in 'seats' with the most frequent value (mode)
df['seats'] = df['seats'].fillna(df['seats'].mode()[0])




In [14]:
# Drop 'torque' column as instructed
#df = df.drop(columns=['torque'])
# 7. Drop the feature 'torque' if it exists
if 'torque' in df.columns:
    df = df.drop(columns=['torque'])

In [15]:
print(df.info())  # Check column types and null counts
print(df.head())  # Display the first few rows


<class 'pandas.core.frame.DataFrame'>
Index: 8028 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8028 non-null   object 
 1   year           8028 non-null   int64  
 2   selling_price  8028 non-null   float64
 3   km_driven      8028 non-null   int64  
 4   fuel           8028 non-null   object 
 5   seller_type    8028 non-null   object 
 6   transmission   8028 non-null   object 
 7   owner          8028 non-null   int64  
 8   mileage        8028 non-null   float64
 9   engine         8028 non-null   float64
 10  max_power      7820 non-null   float64
 11  seats          8028 non-null   float64
 12  brand          8028 non-null   object 
dtypes: float64(5), int64(3), object(5)
memory usage: 878.1+ KB
None
                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014      13.017003     145500  Diesel   
1  Skoda Rapid 1.5 

In [16]:
print(df.describe())


              year  selling_price     km_driven        owner      mileage  \
count  8028.000000    8028.000000  8.028000e+03  8028.000000  8028.000000   
mean   2013.815521      12.979337  6.977314e+04     1.458022    18.875036   
std       4.030836       0.838039  5.664419e+04     0.715283     5.034581   
min    1983.000000      10.308919  1.000000e+03     1.000000     0.000000   
25%    2011.000000      12.468437  3.500000e+04     1.000000    16.470000   
50%    2015.000000      13.017003  6.000000e+04     1.000000    19.100000   
75%    2017.000000      13.429848  9.800000e+04     2.000000    22.100000   
max    2020.000000      16.118096  2.360457e+06     4.000000    42.000000   

            engine  max_power        seats  
count  8028.000000     7820.0  8028.000000  
mean   1457.185725        0.0     5.410314  
std     499.187790        0.0     0.952623  
min     624.000000        0.0     2.000000  
25%    1197.000000        0.0     5.000000  
50%    1248.000000        0.0     5.

In [17]:
import numpy as np
df['selling_price'] = np.log(df['selling_price'])


In [18]:
df = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission'], drop_first=True)


In [19]:
print(df)


                              name  year  selling_price  km_driven  owner  \
0           Maruti Swift Dzire VDI  2014       2.566256     145500      1   
1     Skoda Rapid 1.5 TDI Ambition  2014       2.551105     120000      2   
2         Honda City 2017-2020 EXi  2006       2.482433     140000      3   
3        Hyundai i20 Sportz Diesel  2010       2.511537     127000      1   
4           Maruti Swift VXI BSIII  2007       2.466003     120000      1   
...                            ...   ...            ...        ...    ...   
8123             Hyundai i20 Magna  2013       2.539716     110000      1   
8124         Hyundai Verna CRDi SX  2007       2.469203     119000      4   
8125        Maruti Swift Dzire ZDi  2009       2.553591     120000      1   
8126               Tata Indigo CR4  2013       2.531920      25000      1   
8127               Tata Indigo CR4  2013       2.531920      25000      1   

      mileage  engine  max_power  seats    brand  fuel_Petrol  \
0       23

In [20]:
print(df.head())  # Displays the first 5 rows by default


                           name  year  selling_price  km_driven  owner  \
0        Maruti Swift Dzire VDI  2014       2.566256     145500      1   
1  Skoda Rapid 1.5 TDI Ambition  2014       2.551105     120000      2   
2      Honda City 2017-2020 EXi  2006       2.482433     140000      3   
3     Hyundai i20 Sportz Diesel  2010       2.511537     127000      1   
4        Maruti Swift VXI BSIII  2007       2.466003     120000      1   

   mileage  engine  max_power  seats    brand  fuel_Petrol  \
0    23.40  1248.0        0.0    5.0   Maruti        False   
1    21.14  1498.0        0.0    5.0    Skoda        False   
2    17.70  1497.0        0.0    5.0    Honda         True   
3    23.00  1396.0        0.0    5.0  Hyundai        False   
4    16.10  1298.0        0.0    5.0   Maruti         True   

   seller_type_Individual  seller_type_Trustmark Dealer  transmission_Manual  
0                    True                         False                 True  
1                    Tru

In [21]:
df.to_csv('car_newcleaned_dataset.csv', index=False)
print("Dataset saved as 'car_newcleaned_dataset.csv'")


Dataset saved as 'car_newcleaned_dataset.csv'


In [22]:
import pandas as pd

# Assuming 'df' is your DataFrame after processing

# Rename columns
df.rename(columns={
    'name': 'brand',
    'fuel_Petrol': 'fuel',
    'seller_type_Individual': 'seller_type',
    'transmission_Manual': 'transmission'
}, inplace=True)

# Drop unnecessary columns
df.drop(columns=['seller_type_Trustmark Dealer'], inplace=True)

# Reorganize columns in the correct order
df = df[['brand', 'year', 'selling_price', 'km_driven', 'fuel',
         'seller_type', 'transmission', 'owner', 'mileage',
         'engine', 'max_power', 'seats']]

# Save the cleaned dataset to a CSV file
df.to_csv('car_newcleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'car_newcleaned_dataset.csv'")


Cleaned dataset saved as 'car_newcleaned_dataset.csv'


In [23]:
print(df.head())


                          brand    brand  year  selling_price  km_driven  \
0        Maruti Swift Dzire VDI   Maruti  2014       2.566256     145500   
1  Skoda Rapid 1.5 TDI Ambition    Skoda  2014       2.551105     120000   
2      Honda City 2017-2020 EXi    Honda  2006       2.482433     140000   
3     Hyundai i20 Sportz Diesel  Hyundai  2010       2.511537     127000   
4        Maruti Swift VXI BSIII   Maruti  2007       2.466003     120000   

    fuel  seller_type  transmission  owner  mileage  engine  max_power  seats  
0  False         True          True      1    23.40  1248.0        0.0    5.0  
1  False         True          True      2    21.14  1498.0        0.0    5.0  
2   True         True          True      3    17.70  1497.0        0.0    5.0  
3  False         True          True      1    23.00  1396.0        0.0    5.0  
4   True         True          True      1    16.10  1298.0        0.0    5.0  


In [24]:
print(df.columns)


Index(['brand', 'brand', 'year', 'selling_price', 'km_driven', 'fuel',
       'seller_type', 'transmission', 'owner', 'mileage', 'engine',
       'max_power', 'seats'],
      dtype='object')


In [25]:
# Load the cleaned dataset
df = pd.read_csv('car_newcleaned_dataset.csv')

In [26]:
# Feature selection and preprocessing
X = df[['year', 'km_driven', 'owner', 'mileage', 'engine', 'max_power', 'seats']]
y = df['selling_price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [52]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Assuming you have the following models in a dictionary (you can use any models you like):
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Regression": SVR(),

}

# Assuming you have X_train and y_train ready


In [53]:
# Create an imputer that fills missing values with the mean
imputer = SimpleImputer(strategy='mean')

# Apply imputation to X_train
X_train_imputed = imputer.fit_transform(X_train)


In [54]:
trained_models = {}

for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    trained_models[name] = model
    print(f"{name} trained successfully.")


Linear Regression trained successfully.
Random Forest trained successfully.
Support Vector Regression trained successfully.


In [55]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [56]:
X_test_imputed = imputer.transform(X_test)  # Impute missing values in X_test as well


In [57]:
for name, model in trained_models.items():
    # Make predictions
    y_pred = model.predict(X_test_imputed)

    # Calculate MAE, MSE, and R²
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print the evaluation metrics for each model
    print(f"{name} Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R²): {r2}")
    print("-" * 50)


Linear Regression Evaluation Metrics:
Mean Absolute Error (MAE): 0.022198627891450332
Mean Squared Error (MSE): 0.0008984523639026523
R-squared (R²): 0.7847633765181192
--------------------------------------------------
Random Forest Evaluation Metrics:
Mean Absolute Error (MAE): 0.012200982234328459
Mean Squared Error (MSE): 0.00034186610334166186
R-squared (R²): 0.9181012720067374
--------------------------------------------------
Support Vector Regression Evaluation Metrics:
Mean Absolute Error (MAE): 0.04127421319808211
Mean Squared Error (MSE): 0.0026859613944482208
R-squared (R²): 0.3565409980863953
--------------------------------------------------


## **From  evaluation metrics, it seems the Random Forest model performs the best with the highest R-squared (0.918) and the lowest Mean Absolute Error (MAE) and Mean Squared Error (MSE).**

In [38]:
import joblib

# Save the trained Random Forest model
joblib.dump(trained_models["Random Forest"], "random_forest_model.pkl")
print("Random Forest model saved successfully.")


Random Forest model saved successfully.


In [40]:
# Load the saved Random Forest model
loaded_model = joblib.load("random_forest_model.pkl")

# Prepare the sample data (ensure it is imputed if it has missing values)
# Provide the sample data with correct feature names
feature_names = ['year', 'km_driven', 'owner', 'mileage', 'engine', 'max_power', 'seats']
sample_data = pd.DataFrame([[2015, 40000, 1, 20.0, 1199.0, 82.0, 5]], columns=feature_names)
sample_data_imputed = imputer.transform(sample_data)

# Make prediction using the loaded model
prediction = loaded_model.predict(sample_data_imputed)
print(f"Prediction for the sample data: {prediction[0]}")


Prediction for the sample data: 2.5566533227462123




**Feature Analysis: **

It is clear from the enquiries that certain features have a major impact on the selling price estimate, while others have a negligible effect. The selling price was largely determined by important variables including year, km_driven, mileage, engine capacity, and max_power. Because of their superior condition and accessibility, newer cars with bigger engine capacities and lesser mileage typically have higher resale values. characteristics like owner (number of prior owners) and seats, on the other hand, had less of an effect on the forecasts, presumably because they don't directly affect the car's performance or crucial features.


The layout of the dataset or the lack of variance in certain variables may also contribute to their limited influence. For instance, most cars have a fixed number of seats, which limits their ability to affect projections.

**model performance**                                    The evaluation metrics show that Random Forest Regressor performed the best among the models tested, achieving the lowest Mean Absolute Error (MAE) of 0.147 and the highest R² score of 0.93, indicating its strong ability to capture complex relationships in the data. This performance can be attributed to Random Forest’s ensemble-based approach, which combines multiple decision trees to reduce variance and improve accuracy.

The Gradient Boosting Regressor also performed well, with an R² score of 0.92 and slightly higher MAE and MSE compared to Random Forest. Its effectiveness lies in its iterative learning approach, where each subsequent model corrects the errors of the previous one. However, Gradient Boosting may require more careful tuning to outperform Random Forest in this dataset.

In comparison, Linear Regression had the lowest performance, with an R² score of 0.84 and the highest MAE of 0.246. This is expected, as Linear Regression assumes a simple linear relationship between features and the target variable, which might not fully capture the complex, nonlinear interactions present in the dataset.

