In [6]:
import pandas as pd
import numpy as np

# ---------------------------
# Preprocessing Module
# ---------------------------
def preprocess_data(input_path: str, output_path: str) -> pd.DataFrame:
    """
    Reads raw car dataset, cleans data, imputes missing values, caps outliers,
    normalizes string columns, computes car_age, and writes cleaned CSV with
    lowercase column names.
    """
    df = pd.read_csv(input_path)

    # Normalize column names to lowercase
    df.columns = df.columns.str.strip().str.lower()

    # Normalize string columns
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col] = df[col].astype(str).str.strip().str.title()

    # Compute car_age if missing and modelyear exists
    if 'car_age' not in df.columns and 'modelyear' in df.columns:
        current_year = pd.Timestamp.now().year
        df['car_age'] = current_year - df['modelyear']

    # Impute numerical columns (median)
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)

    # Impute categorical columns (mode)
    for col in cat_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    # Outlier capping with IQR
    def cap_outliers(series: pd.Series) -> pd.Series:
        q1, q3 = series.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        return series.clip(lower, upper)

    for col in num_cols:
        df[col] = cap_outliers(df[col])

    # Save cleaned DataFrame
    df.to_csv(output_path, index=False)
    return df


if __name__ == '__main__':
    # Example usage
    preprocess_data('usedCars.csv', 'preprocess_used_cars.csv')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values