In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Load the data and lower-case the column names so the names used below match
# regardless of the casing in the CSV header.
df = pd.read_csv('preprocess_used_cars.csv')
df.columns = df.columns.str.lower()

# Feature selection
features = ['company', 'model', 'kilometer', 'modelyear', 'fueltype',
            'transmissiontype', 'car_age', 'warranty', 'bodystyle']
target = 'suitability'

# Fail early with a readable message if a required column is absent, instead
# of a bare KeyError from the indexing further down.
missing_columns = sorted(set(features + ['qualityscore']) - set(df.columns))
if missing_columns:
    raise KeyError(
        f"preprocess_used_cars.csv is missing required columns: {missing_columns}"
    )

# Rows without a quality score cannot be labelled: `NaN > median` evaluates to
# False, which would silently assign them to class 0. Drop them instead.
# (`median()` already skips NaN, so the threshold itself is unaffected.)
# `.copy()` gives an independent frame so the column assignment below does not
# trigger a SettingWithCopyWarning.
df = df.dropna(subset=['qualityscore']).copy()

# Create target variable: 1 when the quality score is strictly above the
# median score, 0 otherwise.
df[target] = (df['qualityscore'] > df['qualityscore'].median()).astype(int)

X = df[features]
y = df[target]

# Preprocessing: numeric columns are median-imputed and then min-max scaled;
# categorical columns are one-hot encoded, with categories not seen during
# fitting ignored at transform time instead of raising.
numeric_features = ['kilometer', 'modelyear', 'car_age', 'warranty']
categorical_features = ['company', 'model', 'fueltype', 'transmissiontype', 'bodystyle']

numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_features),
    ('cat', categorical_encoder, categorical_features),
])

# Full estimator: the preprocessing step followed by a random forest with
# 100 trees and a fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the preprocessing step and the classifier together on the training rows.
model.fit(X_train, y_train)

# Evaluation on the held-out rows
y_pred = model.predict(X_test)

print("Model Evaluation:")
test_report = classification_report(y_test, y_pred)
print(test_report)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Persist two artifacts: the preprocessing step taken from the fitted pipeline,
# and the complete pipeline (preprocessing + classifier).
fitted_preprocessor = model.named_steps['preprocessor']
joblib.dump(fitted_preprocessor, 'preprocessor.joblib')
joblib.dump(model, 'trained_model.joblib')

Model Evaluation:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       110
           1       0.79      0.73      0.76       103

    accuracy                           0.77       213
   macro avg       0.78      0.77      0.77       213
weighted avg       0.78      0.77      0.77       213

Accuracy: 0.7746478873239436


['trained_model.joblib']