In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler  # Changed from StandardScaler

# 1. Data Ingestion & Initial Inspection
# Read the raw CSV, lower-case every header, and discard the 'price' / 'id'
# columns when present (errors='ignore' tolerates their absence).
df = (
    pd.read_csv('usedCars.csv')
    .rename(columns=str.lower)
    .drop(columns=['price', 'id'], errors='ignore')
)

# Quick overview of the frame: dimensions, per-column dtypes, null counts.
for heading, summary in (
    ("Dataset Shape:", df.shape),
    ("\nData Types:\n", df.dtypes),
    ("\nMissing Values:\n", df.isnull().sum()),
):
    print(heading, summary)

# 2. Enhanced Data Cleaning
# Numeric columns: any column containing nulls gets them replaced by its median.
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)
    print(f"Filled {col} with median: {median_val:.2f}")

# Object (text) columns: nulls become the explicit placeholder 'Unknown'.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    if not df[col].isnull().any():
        continue  # column is already complete
    df[col] = df[col].fillna('Unknown')
    print(f"Filled {col} with 'Unknown'")

# 3. Improved Feature Engineering
# Derive the vehicle's age in years relative to the calendar year at run time.
current_year = pd.Timestamp.now().year
df['car_age'] = df['modelyear'].rsub(current_year)  # current_year - modelyear

def safe_cap_outliers(series, *, factor=1.5):
    """Cap values of *series* to the IQR fences (winsorise outliers).

    The fences are ``q1 - factor * iqr`` and ``q3 + factor * iqr`` where
    ``q1``/``q3`` are the 25th/75th percentiles and ``iqr = q3 - q1``.
    Values outside the fences are replaced by the nearest fence; values
    inside are left untouched.

    Args:
        series: Numeric pandas Series to cap.
        factor: Multiplier applied to the IQR when computing the fences.
            Defaults to 1.5, the value this function always used before
            the parameter existed.

    Returns:
        A new, clipped Series. When the series has at most one distinct
        value, the input object itself is returned unchanged.

    Note:
        If the series has several distinct values but ``q1 == q3`` (IQR of
        zero), both fences equal ``q1`` and every value is clipped to it.
    """
    # With zero or one distinct value there is nothing to cap.
    if series.nunique() <= 1:
        return series

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = factor * (q3 - q1)
    return series.clip(q1 - spread, q3 + spread)

# Cap outliers in the two continuous columns, one after the other.
for capped_col in ('kilometer', 'qualityscore'):
    df[capped_col] = safe_cap_outliers(df[capped_col])

# Persist the cleaned frame without the index column.
df.to_csv('preprocess_used_cars.csv', index=False)
print("\nFinal preprocessed dataset saved.")

Dataset Shape: (1064, 17)

Data Types:
 company              object
model                object
variant              object
fueltype             object
colour               object
kilometer             int64
bodystyle            object
transmissiontype     object
manufacturedate      object
modelyear             int64
cngkit               object
owner                object
dealerstate          object
dealername           object
city                 object
warranty              int64
qualityscore        float64
dtype: object

Missing Values:
 company                0
model                  0
variant                0
fueltype               1
colour                 0
kilometer              0
bodystyle              0
transmissiontype     714
manufacturedate        0
modelyear              0
cngkit              1042
owner                  0
dealerstate            0
dealername             0
city                   0
warranty               0
qualityscore           0
dtype: int64
Filled fueltyp