In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

import gc  # Garbage Collector zur Speicherverwaltung
import sys
import os
sys.path.append(os.path.abspath("../.."))

from utils.scatter_plot import scatter_prediction
from utils.eval_call import evaluate_model

from Preprocessing.imputation import get_imputation_maps, apply_imputation,ContextImputer
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split import split_data


In [None]:
from sklearn.model_selection import GridSearchCV

def _build_knn_pipeline(numeric_features, categorical_features):
    """Assemble the preprocessing + KNN regression pipeline (unfitted).

    Args:
        numeric_features: Column selection handed to the numeric branch of the
            ColumnTransformer (median imputation, then standard scaling).
        categorical_features: Column selection handed to the categorical
            branch (most-frequent imputation, then one-hot encoding with
            unknown categories ignored at transform time).

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'`` and ``'model'``.
        The model's hyper-parameters are only starting values; a grid search
        addressing them as ``model__<param>`` overrides them.
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),  # Scaling is crucial for KNN
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', KNeighborsRegressor(
            n_neighbors=4,
            weights='distance',
            algorithm='auto',
            p=2,  # Minkowski power: 1 = Manhattan (l1), 2 = Euclidean (l2)
            n_jobs=-1,
        )),
    ])


def main(data_path='../../data.csv'):
    """Tune a KNN regressor with a grid search and evaluate it on the test split.

    Steps: load the data through ``preprocessing_pipeline``, split it with
    ``split_data``, build the preprocessing + KNN pipeline, run a 5-fold
    ``GridSearchCV`` scored by negative MSE on the training split, then predict
    on the test split with the refitted best estimator and pass the predictions
    to ``evaluate_model`` and ``scatter_prediction``.

    Args:
        data_path: Path passed to ``preprocessing_pipeline``. Defaults to the
            location the notebook originally hard-coded.

    Returns:
        None. Results are printed and handed to the evaluation/plot helpers.

    NOTE(review): the recorded cell output shows a ``UnicodeDecodeError``
    (cp1252) raised in a ``subprocess`` reader thread during the search. It
    presumably comes from CPU-core detection in the parallel backend on a
    non-English Windows locale -- confirm, and consider upgrading joblib.
    """
    df = preprocessing_pipeline(data_path)
    # split_data also returns the unsplit X and y; they are not needed here.
    (X_train, X_test, y_train, y_test,
     _X, _y, categorical_features, numeric_features) = split_data(df)

    knn_pipeline = _build_knn_pipeline(numeric_features, categorical_features)

    # No separate fit before the search: GridSearchCV clones the estimator for
    # every candidate and refits the best one on the full training split, so a
    # preliminary fit would be thrown away.
    # NOTE(review): `algorithm` selects the neighbour-search structure and is
    # expected to influence runtime rather than predictions; searching over it
    # triples the number of fits -- consider dropping it from the grid.
    param_grid = {
        'model__n_neighbors': [3, 4, 5, 6, 7],
        'model__weights': ['uniform', 'distance'],
        'model__p': [1, 2],  # 1 Manhattan distance, 2 Euclidean
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree'],
    }

    print("\nPerforming Grid Search...")
    grid_search = GridSearchCV(
        estimator=knn_pipeline,
        param_grid=param_grid,
        cv=5,  # explicit 5-fold CV instead of relying on the library default
        n_jobs=1,  # single process to avoid serialization issues
        verbose=1,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)

    print("\nBest parameters found:")
    print(grid_search.best_params_)
    # The scorer is negated MSE, so flip the sign for reporting.
    print(f"Best score: {-grid_search.best_score_:.4f} MSE")

    # best_estimator_ is already refitted on the whole training split.
    final_knn_pipeline = grid_search.best_estimator_
    y_pred_knn = final_knn_pipeline.predict(X_test)

    evaluate_model(y_test, y_pred_knn, "KNN Regression")
    scatter_prediction(y_test, y_pred_knn, "KNN Regression")

    gc.collect()



# Entry point: run the experiment when this cell/script is executed directly.
# Inside a Jupyter kernel __name__ is "__main__", so running the cell calls main().
if __name__ == "__main__":
    main()


Training KNN model ...

Performing Grid Search...
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "C:\Users\kilia\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\kilia\Desktop\MA_Master\Vorlesung_S5\DataMining\proj\proj\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "C:\Users\kilia\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\kilia\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "C:\Users\kilia\AppData\Local\Programs\Python\Python311\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte