In [None]:
pip install pandas numpy scikit-multilearn scipy pymoo matplotlib scikit-learn minepy

In [None]:
# Imports
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from scipy.sparse import lil_matrix
from pymoo.optimize import minimize
from pymoo.config import Config
Config.warnings['not_compiled'] = False
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting
from minepy import pstats, cstats
from pymoo.optimize import minimize
from multiprocessing import Pool
import pickle
from google.colab import files

In [None]:
def find_all_zero_columns(X):
    """Return the positions of the columns of ``X`` that contain only zeros.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list of int
        Zero-based column positions, in column order, of every column in
        which all values compare equal to ``0``.
    """
    # One boolean per column, in column order.
    is_all_zero = (X == 0).all()

    # Read the positions straight off the mask instead of mapping column
    # labels back to positions with ``Index.get_loc``: when column names are
    # duplicated, ``get_loc`` returns a slice or boolean mask, not an integer.
    return [position for position, flag in enumerate(is_all_zero) if flag]


def determine_ranks(all_Data):
  """Group each data set's features into Pareto ranks of their MIC scores.

  For every data set the rows are split 70/30 (``random_state=42``) into a
  train+validate part and a test part. The MIC_e statistic between every
  feature column and every label column is estimated on the train+validate
  part only, and the features are then partitioned by non-dominated sorting,
  treating a higher MIC towards each label as better.

  Parameters
  ----------
  all_Data : iterable of (name, data_matrix, feature_names, labels)
      ``data_matrix`` holds the feature columns followed by the label
      columns, in the order given by ``feature_names`` and ``labels``.

  Returns
  -------
  list of (name, fronts)
      One entry per data set. ``fronts`` is the sorter's output: a sequence
      of arrays of positional feature indices, with ``fronts[0]`` holding the
      features that no other feature dominates.
  """
  sorted_data = []

  for name, data_matrix, feature_names, labels in all_Data:
    # Plain lists so that ``+`` concatenates and ``df[...]`` selects columns
    # even if the caller passed tuples or arrays of names.
    feature_cols = list(feature_names)
    label_cols = list(labels)

    print("The name of the data set is: ", name)
    print("Feature names:", feature_names)
    print("Categories:", labels)
    print("Data matrix shape:", data_matrix.shape)

    # Convert the data matrix into a DataFrame and separate labels from features
    df = pd.DataFrame(data_matrix, columns=feature_cols + label_cols)
    y = df[label_cols]
    X = df.drop(columns=label_cols)

    # Only the train+validate part is used below; the test part is held out
    # so it cannot influence the ranking.
    xtrain_validate, _xtest, ytrain_validate, _ytest = train_test_split(
        X, y, test_size=0.3, random_state=42)

    zero_column_indices = find_all_zero_columns(xtrain_validate)
    print("Indices of columns that are all zeros:", zero_column_indices)
    print(f"There are {len(zero_column_indices)} features with no presence in the train validate sets")
    print(f"The shape of this validate data matrix is: {xtrain_validate.shape}")

    # cstats expects one variable per row, hence the transposes. The result
    # has one row per feature and one column per label.
    mic_matrix, _tic = cstats(
        xtrain_validate.transpose().values,
        ytrain_validate.transpose().values,
        alpha=9, c=5, est="mic_e")
    mic_matrix = np.asarray(mic_matrix, dtype=float)

    # pymoo's sorter has no "maximize" option and treats lower objective
    # values as better, so the scores are negated to rank by *higher* MIC.
    nds = NonDominatedSorting()
    sorted_indices = nds.do(-mic_matrix, only_non_dominated_front=False)

    for rank, front in enumerate(sorted_indices):
      print(f'Rank {rank} in {name} has {len(front)} features: ', front)

    print(f"The are {xtrain_validate.shape[1]} features total")

    print("\n#######################################################################################\n")

    sorted_data.append((name, sorted_indices))

  return sorted_data

In [None]:
# Load the dataset
# Each entry of all_Data is a [name, data_matrix, feature_names, labels] record,
# which is the layout determine_ranks and process_dataset unpack.
all_Data = []

# NOTE(review): parse_arff_data_name is neither defined nor imported in the cells
# shown here — confirm it is defined before this cell runs on a fresh kernel.
data_matrix_name, feature_names_name, labels_name = parse_arff_data_name('name.arff')
# The leading string is the data set's display name (printed during processing
# and stored alongside the ranks).
name = ['name', data_matrix_name, feature_names_name, labels_name]
all_Data.append(name)

In [None]:
def process_dataset(dataset):
    """Announce and rank a single data set.

    Parameters
    ----------
    dataset : sequence of four items
        ``(name, data_matrix, feature_names, labels)``.

    Returns
    -------
    tuple
        ``(name, ranks)``, where ``ranks`` is the value ``determine_ranks``
        returns for a one-element list containing ``dataset``.
    """
    # All four fields are unpacked, although only the name is needed here,
    # so a record with the wrong number of items still fails at this point.
    dataset_name, _matrix, _features, _labels = dataset

    print(f"Processing dataset: {dataset_name}")

    # determine_ranks works on a list of data sets, so wrap the single record.
    return (dataset_name, determine_ranks([dataset]))

if __name__ == '__main__':
    output_path = 'sorted_data.pkl'

    # Rank the data sets one after another, in the order they were loaded
    sorted_datasets = []
    for entry in all_Data:
        sorted_datasets.append(process_dataset(entry))

    # Persist the collected (name, ranks) results as a pickle file
    with open(output_path, 'wb') as pickle_file:
        pickle.dump(sorted_datasets, pickle_file)

    # Google Colab specific: hand the saved file to the browser for download
    files.download(output_path)