In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

# Ensure 'car details v4.csv' is in the same directory as your notebook
df = pd.read_csv('car details v4.csv')

# Drop rows with missing values
df.dropna(inplace=True)

# Define the features and the target
features = ['Make', 'Model', 'Kilometer', 'Year', 'Fuel Type', 'Transmission']
target = 'Price'

X = df[features]
y = df[target]

# Rename columns to be user-friendly
X = X.rename(columns={
    'Make': 'Brand',
    'Kilometer': 'Kms_driven',
    'Fuel Type': 'fuel_type'
})

# Apply log transformation to the numerical features and the target variable
# Using log1p to handle potential zero values gracefully
X['Kms_driven'] = np.log1p(X['Kms_driven'])
y = np.log1p(y)

# Identify categorical and numerical features
categorical_features = ['Brand', 'Model', 'fuel_type', 'Transmission']
numerical_features = ['Kms_driven', 'Year']

# Create a preprocessor using ColumnTransformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Create the multilinear regression model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Save the trained model to a .pkl file
with open('car_price_model.pkl', 'wb') as file:
    pickle.dump(model_pipeline, file)

print("Multilinear regression model with log transformations has been trained and saved as 'car_price_model.pkl'")

# --- Testing and Evaluation ---

# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Evaluate the model's performance
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("-" * 30)
print("Model Performance Metrics:")
print(f"R-squared (Accuracy): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

Multilinear regression model with log transformations has been trained and saved as 'car_price_model.pkl'
------------------------------
Model Performance Metrics:
R-squared (Accuracy): 0.8705
Mean Absolute Error (MAE): 0.22
