In [2]:

from Preprocessing.DropAll_preprocessing_pipeline import preprocessing_pipeline



In [None]:
# Feature columns that must hold numbers before modelling.
numeric_columns = ['power_ps', 'fuel_consumption_l_100km', 'mileage_in_km', 'age']

for col in numeric_columns:
    # errors='coerce' turns every unparseable entry into NaN instead of raising.
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # Fires for any NaN in the column, including values that were already missing.
    if df[col].isna().sum() > 0:
        print(f"Warning: Column {col} contains non-numeric values after conversion.")

# The price column gets the same coercion; rows left without a price are dropped.
df['price_in_euro'] = pd.to_numeric(df['price_in_euro'], errors='coerce')
if df['price_in_euro'].isna().sum() > 0:
    print(f"Warning: price_in_euro contains {df['price_in_euro'].isna().sum()} non-numeric values after conversion.")
    df = df.dropna(subset=['price_in_euro'])

In [None]:
def _drop_non_numeric_targets(X, y, label):
    """Coerce a target vector to numeric and drop rows whose target is NaN.

    Rows are removed from ``X`` and ``y`` with the same positional boolean
    mask, so the two stay aligned row-for-row even when the index contains
    repeated labels (label-based ``.loc[index]`` would return every row that
    matches a label and could change the number of rows in ``X``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; assumed to be in the same row order as ``y``.
    y : pandas.Series
        Target values, possibly containing non-numeric entries.
    label : str
        Name used in the warning message (e.g. ``"y_train"``).

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the rows whose coerced target is not NaN.
        Both are returned unchanged (apart from the coercion of ``y``) when
        no target is missing.
    """
    y = pd.to_numeric(y, errors='coerce')
    keep = y.notna().to_numpy()
    if not keep.all():
        print(f"Warning: {(~keep).sum()} non-numeric values found in {label} and removed.")
        X = X.loc[keep]
        y = y.loc[keep]
    return X, y


# The full target is only coerced; NaN rows are removed from the splits below.
y = pd.to_numeric(y, errors='coerce')

X_train, y_train = _drop_non_numeric_targets(X_train, y_train, "y_train")
X_test, y_test = _drop_non_numeric_targets(X_test, y_test, "y_test")

In [None]:

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code; categories unseen during fit are encoded as -1.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing followed by a distance-weighted 4-nearest-neighbour regressor
# (p=2 selects Euclidean distance; n_jobs=-1 uses all available processors).
final_knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=4, weights='distance', algorithm='auto', p=2, n_jobs=-1)),
])

print("\nTraining KNN model ...")
# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred_knn = final_knn_pipeline.fit(X_train, y_train).predict(X_test)

evaluate_model(y_test, y_pred_knn, "KNN Regression")

# Kept as the last expression so the cell still displays the collector's return value.
gc.collect()





Training KNN model ...
KNN Regression Performance Metrics:
MAE: 3842.21
MSE: 150416436.67
RMSE: 12264.44
R²: 0.85
------------------------------
KNN Regression Performance Metrics:
MAE: 3842.21
MSE: 150416436.67
RMSE: 12264.44
R²: 0.85
------------------------------


23

In [None]:
# Per-type preprocessing branches.
# Numeric columns: median imputation followed by standardization.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by integer codes;
# categories not seen during fit are encoded as -1.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Parallel jobs for the neighbour search: the detected CPU count (2 when it
# cannot be determined), clamped to the range 1..4.
n_jobs = max(1, min(4, os.cpu_count() or 2))

# Full pipeline: preprocessing, then a distance-weighted 4-nearest-neighbour
# regressor using Euclidean distance (p=2).
final_knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=4, weights='distance', algorithm='auto', p=2, n_jobs=n_jobs)),
])

In [None]:
# Train and evaluate the model.
# Any failure is reported and then re-raised: swallowing it would let later
# cells continue with a stale `y_pred_knn` / `metrics` left in the kernel by an
# earlier run, which silently produces results for the wrong model.
try:
    print("\nTraining KNN model...")
    # Warnings raised while fitting are silenced for this call only; the
    # previous warning filters are restored when the context exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        final_knn_pipeline.fit(X_train, y_train)

    print("Making predictions...")
    y_pred_knn = final_knn_pipeline.predict(X_test)

    print("\nModel Evaluation:")
    metrics = evaluate_model(y_test, y_pred_knn, "KNN Regression")

    print("\nGenerating visualization...")
    scatter_prediction(y_test, y_pred_knn, "KNN Regression")

except Exception as e:
    print(f"Error during model training or evaluation: {e}")
    # Propagate with the original traceback so the notebook run stops here.
    raise
finally:
    # Clean up memory (runs on success and on failure).
    gc.collect()
    print("Memory cleaned up.")

## Model Comparison

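The standard KNN model with ordinal encoding for categorical features performs reasonably well for car price prediction: in the run recorded above it reaches R² ≈ 0.85 with MAE ≈ 3,842 and RMSE ≈ 12,264 on the test set. The RMSE is much larger than the MAE, which suggests that a small number of predictions are far off.
To improve on this, consider: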

1. Using semantic embeddings (as in `04_knn_embeddings.ipynb`)
2. Further feature engineering
3. Hyperparameter tuning for optimal `n_neighbors` and distance metrics