In [1]:
import pandas as pd

In [2]:
from scipy.stats import mannwhitneyu

def mann_whitney_selection(df, k, output_csv_path):
    """
    Perform Mann-Whitney U test on features (excluding 'data_group' and 'event_type') 
    to select the top k features based on p-values.
    
    Parameters:
    - df (pd.DataFrame): Input DataFrame.
    - k (int): Number of top features to select.
    - output_csv_path (str): Path to save the selected features CSV file.
    
    """
    target_variable = 'has_parkinson'

    # Exclude 'data_group' and 'event_type' columns
    columns_to_exclude = ['data_group', 'event_type']
    columns_to_test = [col for col in df.columns if col not in columns_to_exclude]

    # Separate data into two groups based on the target variable
    group_0 = df[df[target_variable] == 0]
    group_1 = df[df[target_variable] == 1]

    # Dictionary to store Mann-Whitney U test results for each feature
    u_test_results = {}

    # Perform Mann-Whitney U test for each feature
    for column in columns_to_test:
        stat, p_value = mannwhitneyu(group_0[column], group_1[column])
        u_test_results[column] = {'Mann-Whitney U Statistic': stat, 'P-value': p_value}

    # Convert results to a DataFrame for easier analysis
    u_test_df = pd.DataFrame.from_dict(u_test_results, orient='index')

    # Sort features based on p-values in ascending order
    u_test_df = u_test_df.sort_values(by='P-value')

    # Select the top k features
    selected_features = u_test_df.index[:k]

    # Subset the DataFrame with selected features
    df_selected = df[selected_features]

    # Save the selected features to a CSV file
    df_selected.to_csv(output_csv_path, index=False)

    print(f"Selected features have been saved to {output_csv_path}")


In [4]:
from scipy.stats import kruskal

def kruskal_wallis_selection(df, k, output_csv_path):
    """
    Perform Kruskal-Wallis H test on features (excluding 'has_parkinson' and 'event_type') 
    to select the top k features based on p-values.
    
    Parameters:
    - df (pd.DataFrame): Input DataFrame.
    - k (int): Number of top features to select.
    - output_csv_path (str): Path to save the selected features CSV file.

    """

    target_variable = 'data_group'
    
    # Exclude 'has_parkinson' and 'event_type' columns
    columns_to_exclude = ['has_parkinson', 'event_type']
    columns_to_test = [col for col in df.columns if col not in columns_to_exclude]

    # List of unique values in the target variable column
    target_values = df[target_variable].unique()

    # Dictionary to store Kruskal-Wallis H test results for each feature
    kruskal_results = {}

    # Perform Kruskal-Wallis H test for each feature and target value
    for column in columns_to_test:
        feature_values = [df[df[target_variable] == value][column] for value in target_values]
        
        # Check if there is variability in each group
        if all(value.std() == 0 for value in feature_values):
            print(f"Skipping '{column}' because all numbers are identical in at least one group.")
            continue
        
        stat, p_value = kruskal(*feature_values)
        kruskal_results[column] = {'Kruskal-Wallis H Statistic': stat, 'P-value': p_value}

    # Convert results to a DataFrame for easier analysis
    kruskal_df = pd.DataFrame.from_dict(kruskal_results, orient='index')

    # Sort features based on p-values in ascending order
    kruskal_df = kruskal_df.sort_values(by='P-value')

    # Select the top k features
    selected_features = kruskal_df.index[:k]

    # Subset the DataFrame with selected features
    df_selected = df[selected_features].copy()

    df_selected['data_group'] = df['data_group']

    # Save the selected features to a CSV file
    df_selected.to_csv(output_csv_path, index=False)

    print(f"Selected features have been saved to {output_csv_path}")


In [5]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def perform_rfe_and_save(df, target_variable,
                          output_csv_path):
    """
    Perform Recursive Feature Elimination (RFE) for feature selection and save the result to a CSV file.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with features and target variable.
    - target_variable (str): The target variable column name.
    - output_csv_path (str): Path to save the selected features DataFrame.

    Returns:
    - pd.Index: Index of selected features.
    - float: Accuracy of the model on the test set using the selected features.
    """

    # Exclude specified columns from feature selection
    if target_variable == 'has_parkinson':
        X = df.drop(columns=[target_variable, 'data_group', 'event_type'])
    else:
        X = df.drop(columns=[target_variable, 'has_parkinson', 'event_type'])

    y = df[target_variable]

    test_size=0.2
    random_state=42

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Choose a classification model (e.g., RandomForestClassifier)
    model = RandomForestClassifier()

    # Initialize RFE with the chosen model and desired number of features
    rfe = RFE(model, n_features_to_select=20)

    # Fit RFE to the training data
    rfe.fit(X_train, y_train)

    # Get the selected features
    selected_features = X.columns[rfe.support_]

    # Evaluate the model with the selected features on the test set
    y_pred = rfe.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Create a DataFrame with selected features and target variable
    df_selected = pd.concat([X[selected_features], y], axis=1)

    # Save the selected features DataFrame to a CSV file
    df_selected.to_csv(output_csv_path, index=False)

    # Print the selected features and accuracy
    print("Selected Features:", selected_features)
    print(f"Selected features DataFrame has been saved to {output_csv_path}")

    return selected_features, accuracy

## Math Features

In [6]:
df = pd.read_csv('csv_data/all/math_features_data.csv')

### Mann-Whitney U test (Binary) and Kruskal-Wallis test (Ternary)

In [7]:
mann_whitney_selection(df, 15, 'csv_data/15_most_important/bi_math_mw.csv')

Selected features have been saved to csv_data/20_most_important/bi_math_mw.csv


In [8]:
kruskal_wallis_selection(df, 15, 'csv_data/15_most_important/tri_math_kw.csv')

Skipping 'data_group' because all numbers are identical in at least one group.
Selected features have been saved to csv_data/20_most_important/tri_math_kw.csv


### Recursive Feature Eliminations

In [7]:
perform_rfe_and_save(df,'has_parkinson','csv_data/20_most_important/bi_math_rfe.csv')

KeyboardInterrupt: 

In [None]:
perform_rfe_and_save(df,'data_group','csv_data/15_most_important/tri_math_rfe.csv')

## Medic Features

In [9]:
df = pd.read_csv('csv_data/all/medic_features_data.csv')

### Mann-Whitney U test (Binary) and Kruskal-Wallis test (Ternary)

In [10]:
mann_whitney_selection(df, 15, 'csv_data/15_most_important/bi_med_mw.csv')

Selected features have been saved to csv_data/20_most_important/bi_med_mw.csv


In [11]:
kruskal_wallis_selection(df, 15, 'csv_data/15_most_important/tri_med_kw.csv')

Skipping 'data_group' because all numbers are identical in at least one group.
Skipping 'velocity_minimum' because all numbers are identical in at least one group.
Skipping 'velocity_x_minimum' because all numbers are identical in at least one group.
Skipping 'acceleration_x_minimum' because all numbers are identical in at least one group.
Skipping 'velocity_y_minimum' because all numbers are identical in at least one group.
Skipping 'acceleration_y_minimum' because all numbers are identical in at least one group.
Selected features have been saved to csv_data/20_most_important/tri_med_kw.csv


### Recursive Feature Elimination

In [None]:
perform_rfe_and_save(df,'has_parkinson','csv_data/15_most_important/bi_med_rfe.csv')

In [None]:
perform_rfe_and_save(df,'data_group','csv_data/15_most_important/tri_med_rfe.csv')

## All features

In [14]:
df = pd.read_csv('csv_data/all/result_data.csv')

### Mann-Whitney U test (Binary) and Kruskal-Wallis test (Ternary)

In [15]:
mann_whitney_selection(df, 15, 'csv_data/15_most_important/bi_all_mw.csv')

Selected features have been saved to csv_data/20_most_important/bi_all_mw.csv


In [16]:
kruskal_wallis_selection(df, 15, 'csv_data/15_most_important/tri_all_kw.csv')

Skipping 'data_group' because all numbers are identical in at least one group.
Skipping 'velocity_minimum' because all numbers are identical in at least one group.
Skipping 'velocity_x_minimum' because all numbers are identical in at least one group.
Skipping 'acceleration_x_minimum' because all numbers are identical in at least one group.
Skipping 'velocity_y_minimum' because all numbers are identical in at least one group.
Skipping 'acceleration_y_minimum' because all numbers are identical in at least one group.
Selected features have been saved to csv_data/20_most_important/tri_all_kw.csv


### Recursive Feature Elimination

In [None]:
perform_rfe_and_save(df,'has_parkinson','csv_data/15_most_important/bi_all_rfe.csv')

In [None]:
perform_rfe_and_save(df,'data_group','csv_data/15_most_important/tri_all_rfe.csv')