In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
import os

In [2]:
# Root directory holding one sub-directory per dataset.
folder_path = '../../training_data'

# Every directory directly under folder_path is treated as a dataset name;
# plain files are skipped. Order is whatever os.listdir returns.
datasets = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(folder_path, entry)),
        os.listdir(folder_path),
    )
)

In [None]:
n_bootstraps = 10
RANDOM_SEED = 42  # seeds both the bootstrap resampling and the forests
TARGET_COLUMNS = ["min.log.lambda", "max.log.lambda"]
OUTPUT_DIR = 'feature_importance'


def _bootstrap_importances(training_df, target, n_bootstraps, seed=RANDOM_SEED):
    """Fit one random forest per bootstrap sample and collect its feature importances.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Feature columns plus every column listed in TARGET_COLUMNS.
    target : str
        Name of the target column to regress on (one of TARGET_COLUMNS).
    n_bootstraps : int
        Number of bootstrap samples / forests to fit.
    seed : int
        Seed for the bootstrap sampler and for each forest.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_bootstraps, n_features); row i holds the
        ``feature_importances_`` of the forest fitted on bootstrap sample i.
    """
    # Rows whose target is missing cannot be used for this target's model.
    # This is loop-invariant, so it is done once rather than per bootstrap.
    usable_df = training_df.dropna(subset=[target])
    X = usable_df.drop(columns=TARGET_COLUMNS)
    y = usable_df[target]

    # One generator per call: every bootstrap sample differs, yet the whole
    # sequence is reproducible and independent of dataset processing order.
    rng = np.random.RandomState(seed)

    importances = np.zeros((n_bootstraps, X.shape[1]))
    for i in range(n_bootstraps):
        X_resampled, y_resampled = resample(X, y, random_state=rng)  # bootstrap sample
        forest = RandomForestRegressor(n_estimators=100, random_state=seed)
        forest.fit(X_resampled, y_resampled)
        importances[i, :] = forest.feature_importances_
    return importances


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for dataset in datasets:
    dataset_dir = os.path.join(folder_path, dataset)
    # The first CSV column is skipped in every file (positional slices start at 1).
    feature_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv')).iloc[:, 1:21]
    target_df = pd.read_csv(os.path.join(dataset_dir, 'target.csv')).iloc[:, 1:3]
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv')).iloc[:, 1:2]
    n_folds = len(folds_df.iloc[:, 0].unique())

    # Features first, targets last.
    feature_target_df = pd.concat([feature_df, target_df], axis=1)

    # Replace -inf with NaN for 'min.log.lambda' and +inf with NaN for
    # 'max.log.lambda'. Assign the result back instead of calling
    # replace(..., inplace=True) on the column: that chained in-place call acts
    # on a temporary Series and leaves the frame untouched under Copy-on-Write.
    feature_target_df['min.log.lambda'] = feature_target_df['min.log.lambda'].replace(-np.inf, np.nan)
    feature_target_df['max.log.lambda'] = feature_target_df['max.log.lambda'].replace(np.inf, np.nan)

    # Replace '#NAME?' with 0
    feature_target_df = feature_target_df.replace('#NAME?', 0)

    # assumes fold labels are the integers 1..n_folds -- TODO confirm against folds.csv
    for fold in range(1, n_folds + 1):
        # Row labels belonging to the current (held-out) fold
        fold_indices = folds_df.index[folds_df['fold'] == fold].tolist()

        # Train on everything except the held-out fold
        training_df = feature_target_df.drop(fold_indices)
        feature_names = training_df.columns.drop(TARGET_COLUMNS)

        # Bootstrapped importances, one row per bootstrap, for each target
        importances_min = _bootstrap_importances(training_df, 'min.log.lambda', n_bootstraps)
        importances_max = _bootstrap_importances(training_df, 'max.log.lambda', n_bootstraps)

        # Mean importance and a +/- one standard deviation band (np.std, ddof=0)
        mean_importances_min = np.mean(importances_min, axis=0)
        std_importances_min = np.std(importances_min, axis=0)
        lower_bound_min = mean_importances_min - std_importances_min
        upper_bound_min = mean_importances_min + std_importances_min

        mean_importances_max = np.mean(importances_max, axis=0)
        std_importances_max = np.std(importances_max, axis=0)
        lower_bound_max = mean_importances_max - std_importances_max
        upper_bound_max = mean_importances_max + std_importances_max

        # Average the two targets' importances and bounds
        mean_importances = (mean_importances_min + mean_importances_max) / 2
        lower_bound = (lower_bound_min + lower_bound_max) / 2
        upper_bound = (upper_bound_min + upper_bound_max) / 2

        # Most important feature first
        sorted_indices = np.argsort(mean_importances)[::-1]

        # Save feature importance data to CSV for the current fold
        importance_df = pd.DataFrame({
            'Feature': feature_names[sorted_indices],
            'Mean Importance': mean_importances[sorted_indices],
            'Lower Bound': lower_bound[sorted_indices],
            'Upper Bound': upper_bound[sorted_indices]
        })
        importance_df.to_csv(os.path.join(OUTPUT_DIR, f'{dataset}_fold{fold}.csv'), index=False)