In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

import joblib

# Train a random-forest regressor that predicts `selling_price` from the
# columns selected below, report R^2 on a 20% hold-out split, and persist the
# fitted pipeline (preprocessing + model) to 'car_price_model.pkl'.

# Year that `car_age` is measured against. It is a fixed constant rather than
# "today" so that reruns are reproducible; any code that feeds the saved model
# must derive `car_age` with the same reference year.
REFERENCE_YEAR = 2025

# 1. Load data: keep only the columns the model uses and drop incomplete rows.
df = pd.read_csv('processes2.csv')
df = df[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'transmission', 'owner']]
df = df.dropna()

# 2. Remove outliers from selling_price using the 1.5 * IQR rule.
# NOTE(review): the bounds are computed on the full dataset, before the
# train/test split, so the held-out rows influence the filter and are
# themselves filtered. The reported score may therefore be optimistic.
Q1 = df['selling_price'].quantile(0.25)
Q3 = df['selling_price'].quantile(0.75)
IQR = Q3 - Q1
in_range = (df['selling_price'] >= Q1 - 1.5*IQR) & (df['selling_price'] <= Q3 + 1.5*IQR)
# Explicit copy: the column assignments below then operate on an independent
# frame and cannot trigger pandas' SettingWithCopyWarning.
df = df[in_range].copy()

# 3. Feature engineering
# Keep only the first two whitespace-separated tokens of `name`
# (presumably make and model - confirm against the data) to limit the number
# of one-hot categories.
df['name'] = df['name'].astype(str).apply(lambda x: " ".join(x.split()[:2]))
df['car_age'] = REFERENCE_YEAR - df['year']
df = df.drop(['year'], axis=1)

# 4. Set features and target
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# 5. Categorical & numeric columns
cat_cols = ['name', 'fuel', 'transmission', 'owner']
num_cols = ['km_driven', 'car_age']

# 6. Preprocessor
# Categorical columns are one-hot encoded; categories unseen during fit are
# encoded as all zeros instead of raising. Numeric columns are passed through
# unchanged, listed explicitly so that a column added to X later does not
# silently become a model feature.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols),
])

# 7. Model pipeline: preprocessing and regressor are fitted and saved together.
model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        random_state=42))
])

# 8. Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Train
model.fit(X_train, y_train)

# 10. Predict & evaluate on the hold-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)  # computed but not printed
r2 = r2_score(y_test, y_pred)

print(r2)

# 11. Persist the fitted pipeline.
# NOTE: joblib files are pickle-based; only load this file from a trusted source.
joblib.dump(model, 'car_price_model.pkl')


0.766393833625326


['car_price_model.pkl']