Crowding distance is used to handle the case where all features end up in the top rank. After the MIC scores are calculated to evaluate feature relevance, a crowding distance is computed for each feature to differentiate between them. This distance measures the relative density of features within the rank, and the 50% of features with the lowest crowding distances are then selected. By doing this, the code identifies how distinct each feature is within the top rank, allowing for more refined feature selection.

In [None]:
pip install pandas numpy scikit-multilearn scipy pymoo matplotlib scikit-learn minepy

In [None]:
# Imports
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from scipy.sparse import lil_matrix
from pymoo.optimize import minimize
from pymoo.config import Config
Config.warnings['not_compiled'] = False
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from minepy import pstats, cstats
from pymoo.optimize import minimize
from multiprocessing import Pool
from minepy import MINE  # Make sure to install minepy

In [None]:
"""
Explanation
find_all_zero_columns: Returns the positional indices of the columns that contain only zeros.
calculate_mic: Computes the MIC score of each feature against the first label column.
calculate_crowding_distance: Computes a crowding distance for each feature from its MIC score.
determine_ranks: Runs the full feature-selection process for each dataset and selects the 50% of features with the lowest crowding distances.
"""

def find_all_zero_columns(X):
    """Return the positional indices of the columns of ``X`` that contain only zeros.

    Args:
        X: pandas DataFrame to inspect.

    Returns:
        list[int]: Zero-based column positions, in column order, of every
        column whose values all compare equal to 0.
    """
    # Read positions straight from the boolean mask instead of mapping labels
    # back through ``X.columns.get_loc``: with duplicated column labels
    # ``get_loc`` returns a slice or boolean mask rather than an integer.
    all_zero_mask = (X == 0).all().to_numpy(dtype=bool)
    return np.flatnonzero(all_zero_mask).tolist()

def calculate_mic(xtrain_validate, ytrain_validate):
    """Compute one MIC score per feature column against the first label column.

    Args:
        xtrain_validate: DataFrame whose columns are the features to score.
        ytrain_validate: DataFrame of labels; only its first column
            (``iloc[:, 0]``) is used as the target.

    Returns:
        numpy.ndarray: MIC scores, one per column of ``xtrain_validate``, in
        column order.
    """
    # A single estimator is reused for every feature; each compute_score call
    # replaces the previous result before mic() is read.
    estimator = MINE(alpha=9, c=5, est="mic_e")

    def _mic_for(column_name):
        estimator.compute_score(xtrain_validate[column_name], ytrain_validate.iloc[:, 0])
        return estimator.mic()

    return np.array([_mic_for(column_name) for column_name in xtrain_validate.columns])

def calculate_crowding_distance(mic_scores):
    """Compute a crowding distance for every feature from its MIC score.

    Features are ordered by score. The lowest- and highest-scoring features
    get an infinite distance; every other feature gets the gap between its
    two neighbours in that ordering, divided by the overall score range.

    Args:
        mic_scores: 1-D array-like of scores, one per feature.

    Returns:
        numpy.ndarray: Float array of the same length as ``mic_scores``,
        indexed like the input. Empty input yields an empty array. When all
        scores are identical the interior distances are 0 rather than NaN.

    Raises:
        ValueError: If ``mic_scores`` is not one-dimensional.
    """
    scores = np.asarray(mic_scores, dtype=float)
    if scores.ndim != 1:
        raise ValueError("mic_scores must be one-dimensional")

    num_features = scores.shape[0]
    crowding_distance = np.zeros(num_features)
    if num_features == 0:
        # Nothing to rank; avoids indexing into an empty ordering.
        return crowding_distance

    order = np.argsort(scores)
    ranked = scores[order]

    # Boundary features are always kept maximally "uncrowded".
    crowding_distance[order[0]] = np.inf
    crowding_distance[order[-1]] = np.inf

    score_range = ranked[-1] - ranked[0]
    # With a zero range every neighbour gap is also zero, so the interior
    # distances stay at 0 instead of becoming 0/0 = NaN.
    if num_features > 2 and score_range > 0:
        crowding_distance[order[1:-1]] = (ranked[2:] - ranked[:-2]) / score_range

    return crowding_distance

def determine_ranks(all_Data, selection_fraction=0.5):
    """Select features for each dataset by MIC score and crowding distance.

    For every dataset the data is split 70/30 with a fixed seed, MIC scores
    are computed on the 70% train+validate portion, crowding distances are
    derived from those scores, and the features with the lowest crowding
    distances are kept.

    Args:
        all_Data: Iterable of ``(name, data_matrix, feature_names, labels)``
            records. ``feature_names`` and ``labels`` are sequences of column
            names; together they name the columns of ``data_matrix`` with the
            feature columns first.
        selection_fraction: Fraction of features to keep. At least one
            feature is always requested. Defaults to 0.5.

    Returns:
        list[tuple]: One ``(name, selected_features_indices)`` pair per
        dataset, where ``selected_features_indices`` is an array of
        positional feature indices ordered by ascending crowding distance.
    """
    sorted_data = []

    for name, data_matrix, feature_names, labels in all_Data:
        print("The name of the data set is: ", name)
        print("Feature names:", feature_names)
        print("Categories:", labels)
        print("Data matrix shape:", data_matrix.shape)

        feature_columns = list(feature_names)
        label_columns = list(labels)
        df = pd.DataFrame(data_matrix, columns=feature_columns + label_columns)

        y = df[label_columns]
        X = df.drop(columns=label_columns)

        # Only the train+validate portion is scored; the held-out test
        # portion is not used by the selection.
        xtrain_validate, _xtest, ytrain_validate, _ytest = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # NOTE(review): all-zero columns are only reported here; they are
        # still scored below and remain eligible for selection.
        zero_column_indices = find_all_zero_columns(xtrain_validate)
        print("Indices of columns that are all zeros:", zero_column_indices)
        print(f"There are {len(zero_column_indices)} features with no presence in the train validate sets")
        print(f"The shape of this validate data matrix is: {xtrain_validate.shape}")

        mic_scores = calculate_mic(xtrain_validate, ytrain_validate)
        crowding_distances = calculate_crowding_distance(mic_scores)

        # Keep the requested fraction of features with the lowest crowding
        # distances (ascending argsort, then truncate).
        num_selected_features = max(1, int(len(crowding_distances) * selection_fraction))
        selected_features_indices = np.argsort(crowding_distances)[:num_selected_features]

        print(f"Selected {num_selected_features} features out of {len(crowding_distances)} based on crowding distance.")
        print(f"The indices of selected features are: {selected_features_indices}")

        print("\n")

        sorted_data.append((name, selected_features_indices))

    return sorted_data

In [None]:
# Load the dataset
# Each entry appended to all_Data is a [name, data_matrix, feature_names, labels]
# record, which is the shape determine_ranks unpacks.
all_Data = []

# replace 'name' with the dataset's name
# NOTE(review): parse_arff_data_name is not defined in the visible part of this
# notebook; it is presumably expected to return (data_matrix, feature_names,
# labels) for the given ARFF file — confirm against its definition.
data_matrix_name, feature_names_name, labels_name = parse_arff_data_name('name.arff')
name = ['name', data_matrix_name, feature_names_name, labels_name]
all_Data.append(name)

# Run the feature selection on every loaded dataset.
sorted_data = determine_ranks(all_Data)

In [None]:
import multiprocessing
import pickle
from multiprocessing import Manager
from google.colab import files

def process_dataset(dataset):
    """Run ``determine_ranks`` on a single dataset and pair the result with its name.

    Args:
        dataset: A ``(name, data_matrix, feature_names, labels)`` record.

    Returns:
        tuple: ``(name, result)`` where ``result`` is the list returned by
        ``determine_ranks`` for this one dataset.
    """
    # Only the name is needed here; the other three fields are unpacked again
    # inside determine_ranks.
    dataset_name, _matrix, _features, _label_names = dataset

    print(f"Processing dataset: {dataset_name}")

    # determine_ranks expects a collection of datasets, so wrap the record.
    return (dataset_name, determine_ranks([dataset]))

if __name__ == '__main__':
    # Process every dataset in all_Data in a worker pool; pool.map returns
    # the results in input order.
    with multiprocessing.Pool() as pool:
        sorted_datasets = pool.map(process_dataset, all_Data)

    # Gather the per-dataset (name, result) pairs into a fresh list.
    sorted_data = list(sorted_datasets)

    # Persist the results as a pickle file.
    with open('sorted_data.pkl', 'wb') as f:
        pickle.dump(sorted_data, f)

    # Trigger a browser download of the pickle (Google Colab specific).
    files.download('sorted_data.pkl')