In [11]:
import pandas as pd
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from enum import Enum

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Learning task this notebook is configured for
class ML_TASK(Enum):
    """Kind of learning problem; decides which estimator and CV metric are used."""

    CLASSIFICATION = 1  # targets are label-encoded and a RandomForestClassifier is tuned
    REGRESSION = 2      # targets are used as-is and a RandomForestRegressor is tuned
    CLUSTERING = 3      # declared, but no estimator is wired up for it in the visible cells

class HYPERPARAM_SEARCH(Enum):
    """Hyper-parameter search strategy; each member selects one sklearn search class."""

    RANDOM = 1          # RandomizedSearchCV
    GRID = 2            # GridSearchCV
    HALVING_RANDOM = 3  # HalvingRandomSearchCV
    HALVING_GRID = 4    # HalvingGridSearchCV

class DIM_REDUCTION_METHOD(Enum):
    """Dimensionality-reduction transformer placed in front of the model."""

    PCA = 1         # sklearn.decomposition.PCA
    KERNEL_PCA = 2  # sklearn.decomposition.KernelPCA
    ICA = 3         # sklearn.decomposition.FastICA

# Params
algo_name = 'RandomForest'  # only used to build the output file name
ml_task = ML_TASK.REGRESSION  # CLASSIFICATION label-encodes the targets before training
hyperparam_search_method = HYPERPARAM_SEARCH.GRID  # which sklearn search class tunes the pipeline
dim_reduction_method = DIM_REDUCTION_METHOD.PCA  # PCA, KERNEL_PCA or ICA
kfold_num = 10  # passed to the search as `cv`; sklearn needs at least 2 folds, n folds = LOOCV

# Read
X_train = pd.read_csv('../pc_X_train.csv')
y_train = pd.read_csv('../pc_y_train.csv')
y_train = y_train.iloc[:, -1]  # the targets live in the last column

X_test = pd.read_csv('../pc_X_test.csv')
# The first test column is exported as `id` later on.
# NOTE(review): the column also stays inside X_test as a feature — confirm that
# X_train carries the same column, otherwise drop it here.
ids = X_test.iloc[:, 0]

# If classification -> encode every distinct target value as an integer class index
if ml_task == ML_TASK.CLASSIFICATION:
    # sorted() makes the value -> index assignment deterministic; index i decodes
    # back to possible_numbers[i] after prediction.
    possible_numbers = sorted(set(y_train))
    mapping = {val: idx for idx, val in enumerate(possible_numbers)}
    y_train = [mapping[val] for val in y_train]


In [12]:
# Normalize Trainset
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_train)

# PCA
dim_reduction = None
match dim_reduction_method:
    case DIM_REDUCTION_METHOD.PCA:
        dim_reduction = PCA(random_state=42)
    case DIM_REDUCTION_METHOD.KERNEL_PCA:
        dim_reduction = KernelPCA(random_state=42)
    case DIM_REDUCTION_METHOD.ICA:
        dim_reduction = FastICA(random_state=42)

# If classification
model = None
scoring_method = None
if ml_task == ML_TASK.CLASSIFICATION:
    model = RandomForestClassifier(random_state=42)
    scoring_method = 'accuracy'
    # raise ValueError("Linear Regression is a regression algorithm. Please use ML_TASK.REGRESSION")
elif ml_task == ML_TASK.REGRESSION:
    model = RandomForestRegressor(random_state=42)
    scoring_method = 'neg_root_mean_squared_error'
    # raise ValueError("x is a classification algorithm. Please use ML_TASK.x")

# Pipeline
steps = []
steps.append(('dim_reduction', dim_reduction))
steps.append(('model', model))

pipe = Pipeline(steps=steps)

# parameters = {'dim_reduction__n_components': [1,2,4,8,16], 
#               'model__criterion': ['gini', 'entropy', 'log_loss'], 
#               'model__splitter': ['best', 'random'],
#               'model__min_samples_split': [1,2,4,8,16,32],
#               'model__max_depth': [1,2,3,4,5,6,7,8,16,32,64]}

parameters = {'dim_reduction__n_components': [20,40,50,60,70,80,90,100,120,140,160,200,220,240,260,300],
              'model__n_estimators': [30,60,100,150,180,230,300],
              'model__max_depth': [None, 2,4,10,20,30],
              'model__min_samples_split': [2,4,6,8]}

# Grid Search
grid_params_list = [pipe, parameters] 
grid_params_dict = {'cv': kfold_num, 'scoring': scoring_method, 'verbose':2, 'n_jobs':6}

grid = None
match hyperparam_search_method:
    case HYPERPARAM_SEARCH.GRID:
        grid = GridSearchCV(*grid_params_list, **grid_params_dict)
    case HYPERPARAM_SEARCH.RANDOM:
        grid = RandomizedSearchCV(*grid_params_list, **grid_params_dict)
    case HYPERPARAM_SEARCH.HALVING_GRID:
        grid = HalvingGridSearchCV(*grid_params_list, **grid_params_dict)
    case HYPERPARAM_SEARCH.HALVING_RANDOM:
        grid = HalvingRandomSearchCV(*grid_params_list, **grid_params_dict)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)

model = grid.best_estimator_.named_steps['model']
dim_reduction = grid.best_estimator_.named_steps['dim_reduction']

# Normalize Testset
X_normalized_test = scaler.transform(X_test)
X_reduced_test = dim_reduction.transform(X_normalized_test)

# Predict
predictions = model.predict(X_reduced_test)

# If classification -> Transform back to Regression Task
if ml_task == ML_TASK.CLASSIFICATION:
    predictions = [possible_numbers[val] for val in predictions]

# Save 
output_filename = f'bork_GYCAOB_{algo_name}_C.csv' if ml_task == ML_TASK.CLASSIFICATION else f'bork_GYCAOB_{algo_name}_R.csv'
results_df = pd.DataFrame({'id': ids,'score': predictions})
results_df.to_csv(output_filename, index=False)

Fitting 10 folds for each of 2688 candidates, totalling 26880 fits
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.5s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.5s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.5s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.5s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.5s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total time=   0.6s
[CV] END dim_reduction__n_components=20, model__max_depth=None, model__min_samples_split=2, model__n_estimators=30; total t

