In [78]:
import pandas as pd
import numpy as np

# Load the processed car listings and preview the first five rows.
# NOTE(review): the saved cell output echoes this read_csv line as the source
# of a warning (possibly a mixed-dtype DtypeWarning -- confirm); if so, passing
# an explicit dtype= mapping here would silence it.
df = pd.read_csv(filepath_or_buffer='../data/processed/cars_60k.csv')
print(df.head(5))

     car_id  prod_year  man_id  model_id  price  predicted_price  \
0  74874053       2010       3       103   9300          32436.0   
1  74037205       2018       3        94  43000         132836.0   
2  74784297       2016     155      1719  38900         145192.0   
3  74836637       2021      23      6399  99999         142446.0   
4  74912421       2011      41      1109  35000          84180.0   

   pred_min_price  pred_max_price  fuel_type_id  gear_type_id  ...  has_turbo  \
0        17515.44        62277.12             3             3  ...       True   
1             NaN             NaN             2             3  ...       True   
2             NaN             NaN             7             2  ...      False   
3             NaN             NaN             2             2  ...      False   
4        45457.20       161625.60             2             2  ...      False   

   right_wheel  vehicle_type  category_id   views saloon_material_id  \
0        False             0    

  df = pd.read_csv('../data/processed/cars_60k.csv')


In [79]:
def fill_missing_by_model_id(df, impute_config=None, group_cols=('man_id', 'model_id')):
    """Fill missing values with a per-model statistic of the same column.

    For every configured column, the statistic (median or mode) is computed
    from the non-null values of each ``group_cols`` group and used to fill the
    nulls of that group. Groups with no observed value for a column are left
    untouched, so the result may still contain nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_cols`` and every column named in the config.
        It is not modified; a new frame is returned.
    impute_config : dict[str, str] or None
        Maps column name -> ``'median'`` or ``'mode'``. ``None`` selects the
        default: median for ``engine_volume``; mode for ``cylinders``,
        ``drive_type_id``, ``fuel_type_id`` and ``gear_type_id``.
    group_cols : sequence of str or str
        Columns that identify a model. Defaults to ``('man_id', 'model_id')``.

    Returns
    -------
    pandas.DataFrame
        Same columns and row order as ``df``. Because the statistics are
        attached with ``DataFrame.merge``, the original index is replaced by a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the config names a method other than ``'median'`` or ``'mode'``.
    """
    if impute_config is None:
        # columns to impute: numeric -> median, categorical -> mode
        impute_config = {
            'engine_volume': 'median',
            'cylinders': 'mode',
            'drive_type_id': 'mode',
            'fuel_type_id': 'mode',
            'gear_type_id': 'mode',
        }

    group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Validate before touching any data so a typo (e.g. 'mean') fails loudly
    # instead of silently being treated as 'mode'.
    for col, method in impute_config.items():
        if method not in ('median', 'mode'):
            raise ValueError(
                f"Unsupported imputation method {method!r} for column {col!r}; "
                "expected 'median' or 'mode'"
            )

    for col, method in impute_config.items():
        stat_col = f'{col}_model_stat'

        # model-level statistic, computed from observed values only
        observed = df.dropna(subset=[col]).groupby(group_cols)[col]
        if method == 'median':
            stat_model = observed.median()
        else:  # mode; ties resolve to the first value returned by Series.mode()
            stat_model = observed.agg(lambda x: x.mode().iloc[0])

        # attach the statistic to every row of the matching model
        df = df.merge(
            stat_model.rename(stat_col),
            on=group_cols,
            how='left'
        )

        # fill from model-level, then remove the helper column
        df[col] = df[col].fillna(df[stat_col])
        df = df.drop(columns=[stat_col])

    return df


In [80]:
# Column inventory: numeric columns vs. everything else.
cols = df.columns

numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(exclude='number').columns

print(numeric_cols)
print(categorical_cols)

# Treat a literal 0 in any numeric column as "missing".
# NOTE: further down, the non-numeric flag columns are converted to 0/1.
# Re-running this cell after that point would classify them as numeric and
# turn every 0 flag into NaN -- reload `df` before re-running.
df[numeric_cols] = df[numeric_cols].where(df[numeric_cols].ne(0), np.nan)

# Number of rows that carry no price information at all
# (all four price-related fields are missing).
price_cols = ['price', 'pred_min_price', 'pred_max_price', 'predicted_price']
no_price_info = df[price_cols].isna().all(axis=1)
print(len(df[no_price_info]))

def handle_missing(df):
    """Drop or fill missing values according to the notebook's cleaning policy.

    Steps, in order:

    1. Drop rows without ``prod_year`` or ``model_id``.
    2. Drop rows where ``price``, ``pred_min_price``, ``pred_max_price`` and
       ``predicted_price`` are all missing.
    3. Cross-fill ``price`` and ``predicted_price`` from each other.
    4. Fill ``color_id`` with its most frequent value (skipped when the column
       has no observed value at all).
    5. Impute technical attributes per model via ``fill_missing_by_model_id``.
    6. Drop rows still missing ``gear_type_id``, ``drive_type_id``,
       ``cylinders`` or ``engine_volume``.
    7. Fill ``car_run_km``, ``user_type`` and ``vehicle_type`` with 0.
    8. Drop rows without ``comfort_features``.

    ``pred_min_price`` and ``pred_max_price`` are not filled here and may
    still contain nulls in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings; not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    df = df.dropna(subset=['prod_year', 'model_id'])
    # .copy() gives us a frame we own, so the column assignments below do not
    # raise SettingWithCopyWarning on the filtered result.
    df = df.dropna(
        subset=['price', 'pred_min_price', 'pred_max_price', 'predicted_price'],
        how='all'
    ).copy()

    df['price'] = df['price'].fillna(df['predicted_price'])
    df['predicted_price'] = df['predicted_price'].fillna(df['price'])

    # mode() is empty when color_id has no non-null value; indexing it would raise.
    color_modes = df['color_id'].mode()
    if not color_modes.empty:
        df['color_id'] = df['color_id'].fillna(color_modes.iloc[0])

    df = fill_missing_by_model_id(df)
    df = df.dropna(
        subset=['gear_type_id', 'drive_type_id', 'cylinders', 'engine_volume']
    ).copy()

    for zero_default_col in ('car_run_km', 'user_type', 'vehicle_type'):
        df[zero_default_col] = df[zero_default_col].fillna(0)

    df = df.dropna(subset=['comfort_features'])

    return df

def normalize_boolean(series):
    """Map boolean-like values to 1 / 0.

    Every value is converted to a string, stripped and lower-cased, then
    looked up in a fixed table. ``True``, ``'true'``, ``' Yes '``, ``1`` and
    ``1.0`` become 1; ``False``, ``'false'``, ``'no'``, ``0`` and ``0.0``
    become 0.

    The float spellings (``'1.0'`` / ``'0.0'``) are accepted so that a flag
    column stored as float -- including the output of a previous call that
    contained NaN -- is not silently mapped to all-NaN.

    Parameters
    ----------
    series : pandas.Series
        Column holding boolean-like values of any dtype.

    Returns
    -------
    pandas.Series
        Same index as ``series``. Values not in the table (including missing
        values) are returned as NaN.
    """
    truthy = ('true', '1', '1.0', 'yes')
    falsy = ('false', '0', '0.0', 'no')
    lookup = {**dict.fromkeys(truthy, 1), **dict.fromkeys(falsy, 0)}

    tokens = series.astype(str).str.strip().str.lower()
    return tokens.map(lookup)

# Apply the missing-value policy, then convert the flag columns to 0/1.
df = handle_missing(df)

for col in categorical_cols:
    if col == 'comfort_features':
        # the only non-numeric column that is not run through the boolean mapping
        continue
    df[col] = normalize_boolean(df[col])

# Remaining nulls per column and the number of surviving rows.
print(df.isna().sum())
print(df.shape[0])

# Value distribution of every non-numeric column after normalisation.
for col in categorical_cols:
    counts = df[col].value_counts()
    print(f"\n{counts}\n")

Index(['car_id', 'prod_year', 'man_id', 'model_id', 'price', 'predicted_price',
       'pred_min_price', 'pred_max_price', 'fuel_type_id', 'gear_type_id',
       'drive_type_id', 'color_id', 'cylinders', 'car_run_km', 'engine_volume',
       'vehicle_type', 'category_id', 'views', 'saloon_material_id',
       'user_type'],
      dtype='object')
Index(['abs', 'esd', 'el_windows', 'conditioner', 'leather', 'hydraulics',
       'chair_warming', 'climat_control', 'customs_passed', 'tech_inspection',
       'has_turbo', 'right_wheel', 'start_stop', 'back_camera',
       'comfort_features'],
      dtype='object')
24605
car_id                   0
prod_year                0
man_id                   0
model_id                 0
price                    0
predicted_price          0
pred_min_price        2563
pred_max_price        2563
fuel_type_id             0
gear_type_id             0
drive_type_id            0
color_id                 0
cylinders                0
car_run_km               0
e