# In [None]:
import os
import pandas as pd
import numpy as np
import threading
import time
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

def train_model(model_name, model, X_train, y_train, X_test, y_test, results, training_threshold, dataset_name, X_val=None, y_val=None):
    """Fit ``model`` under a time budget, score it on the test split and save it.

    Fitting and test-set prediction run in a background thread which the
    caller waits on for at most ``training_threshold`` seconds. On success the
    metrics are printed, the fitted model is pickled to
    ``models/<dataset_name>/<model_name with spaces replaced by '_'>.pkl`` and
    one row is appended to ``results``. On timeout or error a message is
    printed and ``results`` is left untouched.

    Args:
        model_name: Display name of the model. Also used to build the pickle
            file name and, when it contains ``'LightGBM'``, to enable the
            early-stopping fit arguments.
        model: Estimator exposing ``fit`` and ``predict``. If it is a
            ``BayesSearchCV`` its ``best_params_`` are reported as well.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features; must expose ``shape[1]`` (number of features).
        y_test: Test target.
        results: List that receives one metrics dict per successful model.
        training_threshold: Maximum number of seconds to wait for the worker.
        dataset_name: Name of the dataset; used for the output directory and
            recorded in the result row.
        X_val: Optional validation features, used only for LightGBM early
            stopping.
        y_val: Optional validation target, used only for LightGBM early
            stopping.

    Returns:
        None. Output is delivered through ``results``, stdout and the pickle
        file.

    Note:
        Python threads cannot be cancelled. After a timeout the worker keeps
        running in the background and keeps mutating ``model``; it is started
        as a daemon thread so that it cannot keep the interpreter alive once
        the main thread has finished.
    """
    # Written by the worker, read by the caller after ``join``. ``completed``
    # is set last so that a True value guarantees the other fields are filled.
    outcome = {'y_pred': None, 'training_time': None, 'error': None, 'completed': False}

    def train():
        start_time = time.time()
        try:
            print(f"Starting training for {model_name}...")

            fit_kwargs = {}
            if 'LightGBM' in model_name and X_val is not None and y_val is not None:
                # ``early_stopping_rounds`` is no longer accepted by ``fit()``
                # in LightGBM 4.x; the callback form is used instead.
                # Imported here because only this branch needs it.
                from lightgbm import early_stopping

                # Keys are routed to the pipeline step named 'lightgbm'.
                fit_kwargs = {
                    'lightgbm__eval_set': [(X_val, y_val)],
                    'lightgbm__callbacks': [early_stopping(stopping_rounds=20)],
                }

            model.fit(X_train, y_train, **fit_kwargs)

            outcome['y_pred'] = model.predict(X_test)
            outcome['training_time'] = time.time() - start_time
            outcome['completed'] = True
            print(f"Completed training for {model_name} in {outcome['training_time']:.2f} seconds.")
        except Exception as e:
            # Best effort: one failing model must not abort the whole run.
            outcome['error'] = e
            print(f"Error training model {model_name}: {e}")

    # Daemon thread: a fit that overruns the budget must not block interpreter exit.
    worker = threading.Thread(target=train, daemon=True)
    worker.start()
    worker.join(timeout=training_threshold)

    if not outcome['completed']:
        if outcome['error'] is not None:
            print(f"Model {model_name} encountered an error during training; no results recorded.")
        else:
            print(f"Model {model_name} exceeded training time ({training_threshold} seconds); no results recorded.")
        return

    y_pred = outcome['y_pred']
    training_time = outcome['training_time']

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r_squared = r2_score(y_test, y_pred)

    # Adjusted R² is only defined when there are more samples than
    # features + 1; otherwise fall back to the plain R².
    n = len(y_test)
    p = X_test.shape[1]
    if n > p + 1 and p > 0:
        adjusted_r_squared = 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))
    else:
        adjusted_r_squared = r_squared

    print(f"Model {model_name} trained successfully in {training_time:.2f} seconds.")
    print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R²: {r_squared}, Adjusted R²: {adjusted_r_squared}")

    best_params = None
    if isinstance(model, BayesSearchCV):
        best_params = model.best_params_
        print(f"Best parameters for {model_name}: {best_params}")

    # Persist the fitted model under models/<dataset_name>/.
    models_dir = os.path.join("models", dataset_name)
    os.makedirs(models_dir, exist_ok=True)
    model_filename = f"{model_name.replace(' ', '_')}.pkl"
    model_filepath = os.path.join(models_dir, model_filename)
    with open(model_filepath, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model {model_name} saved to {model_filepath}")

    result = {
        'Model': model_name,
        'Dataset': dataset_name,
        'Training Time (s)': training_time,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2 Score': r_squared,
        'Adjusted R2 Score': adjusted_r_squared
    }
    if best_params:
        result['Best Params'] = str(best_params)
    results.append(result)

def _build_models():
    """Build the mapping of model name to estimator evaluated by ``main``.

    Every pipeline that has an entry in the search-space table is wrapped in a
    ``BayesSearchCV`` (3-fold CV, 10 iterations, R² scoring, fixed seed);
    pipelines without one are used as they are.
    """
    # Search-space keys are '<pipeline step name>__<parameter>'.
    param_spaces = {
        'Ridge Regression': {
            'ridge__alpha': Real(0.1, 10.0, prior='log-uniform')
        },
        'Lasso Regression': {
            'lasso__alpha': Real(0.01, 1.0, prior='log-uniform')
        },
        'Elastic Net Regression': {
            'elasticnet__alpha': Real(0.01, 1.0, prior='log-uniform'),
            'elasticnet__l1_ratio': Real(0.1, 0.9)
        },
        'LightGBM Regression': {
            'lightgbm__num_leaves': Integer(31, 50),
            'lightgbm__learning_rate': Real(0.01, 0.05, prior='log-uniform'),
            'lightgbm__n_estimators': Integer(100, 200)
        },
        'Random Forest Regression': {
            'randomforest__n_estimators': Integer(50, 100),
            'randomforest__max_depth': Categorical([5, 10]),
            'randomforest__min_samples_split': Integer(2, 5)
        },
        'XGBoost Regression': {
            'xgboost__learning_rate': Real(0.01, 0.05, prior='log-uniform'),
            'xgboost__max_depth': Integer(3, 5),
            'xgboost__n_estimators': Integer(100, 200)
        }
    }
    pipelines = {
        'Linear Regression': Pipeline([
            ('linearregression', LinearRegression())
        ]),
        'Ridge Regression': Pipeline([
            ('ridge', Ridge())
        ]),
        'Lasso Regression': Pipeline([
            ('lasso', Lasso())
        ]),
        'Elastic Net Regression': Pipeline([
            ('elasticnet', ElasticNet())
        ]),
        'LightGBM Regression': Pipeline([
            ('lightgbm', LGBMRegressor())
        ]),
        'Random Forest Regression': Pipeline([
            ('randomforest', RandomForestRegressor())
        ]),
        'XGBoost Regression': Pipeline([
            ('xgboost', xgb.XGBRegressor(use_label_encoder=False, eval_metric='rmse'))
        ]),
    }

    models = {}
    for name, pipeline in pipelines.items():
        if name in param_spaces:
            models[name] = BayesSearchCV(
                estimator=pipeline,
                search_spaces=param_spaces[name],
                cv=3,
                scoring='r2',
                n_jobs=1,
                verbose=1,
                n_iter=10,
                random_state=42
            )
        else:
            models[name] = pipeline
    return models


def _split_features_target(frame, target_variable):
    """Split ``frame`` into (features, target), replacing missing values with 0."""
    features = frame.drop(columns=[target_variable]).fillna(0)
    target = frame[target_variable].fillna(0)
    return features, target


def main(data_root="/home/dev/project/modelling/preprocessing/results",
         datasets=('dataset1', 'dataset5'),
         training_threshold=7200,
         target_variable='diem_hp'):
    """Train and evaluate every configured model on every dataset.

    For each dataset the pre-split ``train.csv``, ``test.csv`` and ``val.csv``
    are read from ``<data_root>/<dataset_name>``, the ``mssv`` column is
    dropped where present, missing values are replaced with 0, each model is
    passed to ``train_model`` and the collected metrics are written to
    ``model_results/<dataset_name>/model_results.csv``. A dataset whose files
    cannot be read, or whose splits lack the target column, is skipped with a
    message.

    Args:
        data_root: Directory containing one sub-directory per dataset.
        datasets: Names of the datasets to process.
        training_threshold: Per-model time budget in seconds.
        target_variable: Name of the column to predict.

    Returns:
        None.
    """
    # The same estimator objects are reused (refitted) for every dataset.
    models = _build_models()

    # Results folder
    results_dir = "model_results"
    os.makedirs(results_dir, exist_ok=True)

    # Models directory
    models_dir = "models"
    os.makedirs(models_dir, exist_ok=True)

    # Process each dataset
    for dataset_name in datasets:
        print(f"\nProcessing {dataset_name}")

        # Load pre-split data
        data_path = os.path.join(data_root, dataset_name)
        try:
            train_data = pd.read_csv(os.path.join(data_path, "train.csv"))
            test_data = pd.read_csv(os.path.join(data_path, "test.csv"))
            val_data = pd.read_csv(os.path.join(data_path, "val.csv"))
            print(f"Successfully loaded pre-split data for {dataset_name}")
        except Exception as e:
            # Best effort: an unreadable dataset must not stop the others.
            print(f"Error loading data for {dataset_name}: {e}")
            continue

        # Drop 'mssv' from every split that has it. Each split is handled
        # independently so that a split lacking the column does not raise and
        # a split that has it never keeps it.
        train_data, test_data, val_data = (
            frame.drop(columns=['mssv'], errors='ignore')
            for frame in (train_data, test_data, val_data)
        )

        # The target must be present in every split, not only in the training data.
        splits = (('training', train_data), ('test', test_data), ('validation', val_data))
        missing = [label for label, frame in splits if target_variable not in frame.columns]
        if missing:
            print(f"Target variable '{target_variable}' not found in {', '.join(missing)} data for dataset '{dataset_name}'. Skipping this dataset.")
            continue

        # Separate features and target, and handle missing values
        X_train, y_train = _split_features_target(train_data, target_variable)
        X_test, y_test = _split_features_target(test_data, target_variable)
        # NOTE(review): the validation split is prepared but not forwarded to
        # train_model. Passing X_val/y_val would switch on its LightGBM
        # early-stopping branch; confirm that branch works with the installed
        # LightGBM version before enabling it.
        X_val, y_val = _split_features_target(val_data, target_variable)

        print("All features are now numeric and missing values are handled.")

        # Prepare results storage
        results = []

        # Train and evaluate each model
        for model_name, model in models.items():
            print(f"\nTraining model: {model_name}")
            train_model(
                model_name=model_name,
                model=model,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                results=results,
                training_threshold=training_threshold,
                dataset_name=dataset_name
            )

        # Save results to CSV
        results_df = pd.DataFrame(results)
        dataset_results_dir = os.path.join(results_dir, dataset_name)
        os.makedirs(dataset_results_dir, exist_ok=True)
        results_file = os.path.join(dataset_results_dir, 'model_results.csv')
        results_df.to_csv(results_file, index=False)
        print(f"Results for {dataset_name} saved to {results_file}")

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
