In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import warnings

In [10]:
warnings.filterwarnings('ignore')

In [20]:
try:
    df = pd.read_csv('C:/Users/Akash/OneDrive - Erin.N.Nagarvala Day School/Desktop/jupyter notebook/car_data.csv') 
    print("Data loaded Successfully!")
except FileNotFoundError:
    print("Error: 'car_data.csv' not found. Please ensure your dataset file is in the correct directory.")
    print("For demonstration, creating a dummy dataset as 'car_data.csv' was not provided.")
  

Data loaded Successfully!


In [21]:
np.random.seed(42)
data = {
        'Brand': np.random.choice(['Toyota', 'Honda', 'Ford', 'BMW', 'Mercedes'], 100),
        'Model': np.random.choice(['Sedan', 'SUV', 'Hatchback'], 100),
        'Year': np.random.randint(2005, 2024, 100),
        'Mileage_km': np.random.randint(10000, 150000, 100),
        'Fuel_Type': np.random.choice(['Petrol', 'Diesel', 'Electric'], 100),
        'Transmission': np.random.choice(['Manual', 'Automatic'], 100),
        'Engine_Size_L': np.round(np.random.uniform(1.0, 3.5, 100), 1),
        'Horsepower_bhp': np.random.randint(80, 300, 100),
        'Seats': np.random.choice([4, 5, 7], 100),
        'Price_ Lakhs': np.round(np.random.uniform(5, 50, 100) * 10, 2) # Example prices in Lakhs
    }

In [22]:
df = pd.DataFrame(data)
df['Price_ Lakhs'] = df['Price_ Lakhs'] + (df['Year'] - 2010) * 2 + (300 - df['Horsepower_bhp']) * 0.1
df['Price_ Lakhs'] = df['Price_ Lakhs'].apply(lambda x: max(1, x)) # Ensure positive prices
print("Dummy dataset created for demonstration purposes.")

print("\n--- Original Dataset Head ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()
print("\n--- Missing Values Before Preprocessing ---")
print(df.isnull().sum())
print(f"\nTotal rows before dropping NaNs: {df.shape[0]}")


Dummy dataset created for demonstration purposes.

--- Original Dataset Head ---
      Brand      Model  Year  Mileage_km Fuel_Type Transmission  \
0       BMW  Hatchback  2021      103179    Diesel       Manual   
1  Mercedes      Sedan  2006       55714    Diesel       Manual   
2      Ford        SUV  2006      112946    Diesel       Manual   
3  Mercedes      Sedan  2009      119616    Petrol    Automatic   
4  Mercedes  Hatchback  2005      115983    Diesel    Automatic   

   Engine_Size_L  Horsepower_bhp  Seats  Price_ Lakhs  
0            2.2             136      5        491.87  
1            1.0             224      7        308.80  
2            2.2             191      5        465.73  
3            1.1             126      7         67.79  
4            1.3             230      7        485.78  

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ----

In [23]:
df.dropna(inplace=True)
print(f"\nTotal rows after dropping NaNs: {df.shape[0]}")
print("\n--- Missing Values After Preprocessing ---")
print(df.isnull().sum())


Total rows after dropping NaNs: 100

--- Missing Values After Preprocessing ---
Brand             0
Model             0
Year              0
Mileage_km        0
Fuel_Type         0
Transmission      0
Engine_Size_L     0
Horsepower_bhp    0
Seats             0
Price_ Lakhs      0
dtype: int64


In [31]:
target_col = 'Price_ Lakhs'
features = df.columns.tolist()
features.remove(target_col)
numerical_features = ['Year', 'Mileage_km', 'Engine_Size_L', 'Horsepower_bhp', 'Seats']
categorical_features = ['Brand', 'Model', 'Fuel_Type', 'Transmission'] 

numerical_features = [f for f in numerical_features if f in df.columns]
categorical_features = [f for f in categorical_features if f in df.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features), 
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) 
    ])


In [32]:
print("\n--- Preprocessing Pipeline Created ---")

X = df[features]
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")



--- Preprocessing Pipeline Created ---

Training features shape: (80, 9)
Testing features shape: (20, 9)
Training target shape: (80,)
Testing target shape: (20,)


In [33]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

# Train the pipeline
model_pipeline.fit(X_train, y_train)

print("\n--- Model Training Complete (RandomForestRegressor Pipeline) ---")
y_pred = model_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) 
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2) Score: {r2:.4f}")



--- Model Training Complete (RandomForestRegressor Pipeline) ---

--- Model Evaluation ---
Mean Absolute Error (MAE): 107.5567
Mean Squared Error (MSE): 16669.5440
Root Mean Squared Error (RMSE): 129.1106
R-squared (R2) Score: 0.0062


In [34]:
new_car_data = pd.DataFrame({
    'Brand': ['Toyota'],
    'Model': ['SUV'],
    'Year': [2022],
    'Mileage_km': [30000],
    'Fuel_Type': ['Petrol'],
    'Transmission': ['Automatic'],
    'Engine_Size_L': [2.0],
    'Horsepower_bhp': [150],
    'Seats': [5]
})

In [35]:
predicted_price = model_pipeline.predict(new_car_data)

print(f"New Car Data:\n{new_car_data}")
print(f"Predicted Car Price: ₹{predicted_price[0]:,.2f} Lakhs")

New Car Data:
    Brand Model  Year  Mileage_km Fuel_Type Transmission  Engine_Size_L  \
0  Toyota   SUV  2022       30000    Petrol    Automatic            2.0   

   Horsepower_bhp  Seats  
0             150      5  
Predicted Car Price: ₹237.14 Lakhs


In [36]:
new_car_data_2 = pd.DataFrame({
    'Brand': ['Mercedes'],
    'Model': ['Sedan'],
    'Year': [2023],
    'Mileage_km': [5000],
    'Fuel_Type': ['Diesel'],
    'Transmission': ['Automatic'],
    'Engine_Size_L': [3.0],
    'Horsepower_bhp': [250],
    'Seats': [5]
})

predicted_price_2 = model_pipeline.predict(new_car_data_2)

print(f"\nNew Car Data 2:\n{new_car_data_2}")
print(f"Predicted Car Price 2: ₹{predicted_price_2[0]:,.2f} Lakhs")



New Car Data 2:
      Brand  Model  Year  Mileage_km Fuel_Type Transmission  Engine_Size_L  \
0  Mercedes  Sedan  2023        5000    Diesel    Automatic            3.0   

   Horsepower_bhp  Seats  
0             250      5  
Predicted Car Price 2: ₹284.01 Lakhs
