In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# ---------------------------
# Classification Module
# ---------------------------
def train_classifier(
    preprocessed_csv: str,
    model_path: str,
    preprocessor_path: str
) -> None:
    """
    Train a RandomForest classifier on preprocessed data and persist it.

    A binary target named ``suitability`` is derived from the
    ``qualityscore`` column: rows strictly above the column median get
    label 1, all other rows get label 0. Rows without a quality score are
    excluded because they cannot be labelled. The model is fitted on a
    stratified 80% subset of the data (``random_state=42``).

    Numeric columns are median-imputed and min-max scaled; ``object``
    columns are imputed with the most frequent value and one-hot encoded.
    Columns of any other dtype are not passed to the model.

    Args:
        preprocessed_csv: Path of the CSV file to read. Column names are
            stripped and lower-cased before use; a ``qualityscore`` column
            must be present afterwards.
        model_path: Destination file for the fitted full pipeline
            (preprocessor + classifier), written with ``joblib.dump``.
        preprocessor_path: Destination file for the fitted preprocessor
            alone, written with ``joblib.dump``.

    Raises:
        KeyError: If the data has no ``qualityscore`` column.
        ValueError: If no row has a non-missing ``qualityscore``.
    """
    score_col = 'qualityscore'
    target_col = 'suitability'

    df = pd.read_csv(preprocessed_csv)

    # Ensure lowercase columns
    df.columns = df.columns.str.strip().str.lower()

    if score_col not in df.columns:
        raise KeyError(
            f"column '{score_col}' not found in {preprocessed_csv}; "
            f"available columns: {list(df.columns)}"
        )

    # A missing score cannot be labelled: `NaN > median` evaluates to False,
    # which would silently push such rows into class 0.
    df = df.dropna(subset=[score_col])
    if df.empty:
        raise ValueError(
            f"no rows with a non-missing '{score_col}' in {preprocessed_csv}"
        )

    # Create binary target based on qualityscore median
    scores = df[score_col]
    y = (scores > scores.median()).astype(int).rename(target_col)

    # Features: everything except the score the target is derived from.
    # A pre-existing 'suitability' column (if any) is discarded as well so
    # the label can never leak into the features.
    X = df.drop(columns=[score_col, target_col], errors='ignore')

    # Split data. Only the training part is used in this function; the
    # split is kept so the model is fitted on the same 80% subset.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Define preprocessing pipelines
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include='object').columns.tolist()

    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during training are encoded as all zeros.
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ])

    # Full pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Train and save
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, model_path)
    # Save the step held by the fitted pipeline rather than the local
    # variable, so the persisted preprocessor is always the fitted one.
    joblib.dump(pipeline.named_steps['preprocessor'], preprocessor_path)
    print(f"Trained model saved at {model_path}")
    print(f"Preprocessor saved at {preprocessor_path}")


if __name__ == '__main__':
    # Script entry point: train on the example CSV and write both
    # artifacts (full pipeline and standalone preprocessor) next to it.
    train_classifier(
        preprocessed_csv='preprocess_used_cars.csv',
        model_path='trained_model.joblib',
        preprocessor_path='preprocessor.joblib',
    )


Trained model saved at trained_model.joblib
Preprocessor saved at preprocessor.joblib
