# Alzheimer's disease, Frontotemporal dementia and Healthy subjects dataset #

In [15]:
import mne
import pandas as pd
import numpy as np
import os
from mne.preprocessing import ICA
import time

from scipy.signal import welch
from scipy.stats import kurtosis, skew
import pywt
import antropy as ant

First, two EEG files are loaded to inspect their contents and to check whether anything differs unexpectedly between files. Note that these data files have already been preprocessed by the original researchers.

Everything appears to be within normal ranges.

In [16]:
# Load the eyes-closed recording of subject 001 fully into memory,
# then print its summary and plot the signals for a visual sanity check.
sub_001_set_file = "Alzheimer-Frontotemporal-healthy/derivatives/sub-001/eeg/sub-001_task-eyesclosed_eeg.set"
data = mne.io.read_raw_eeglab(sub_001_set_file, preload=True)
data.describe()
data.plot()

In [17]:
# Same inspection for subject 002, so the two recordings can be compared side by side.
sub_002_set_file = "Alzheimer-Frontotemporal-healthy/derivatives/sub-002/eeg/sub-002_task-eyesclosed_eeg.set"
raw = mne.io.read_raw_eeglab(sub_002_set_file, preload=True)
raw.describe()
raw.plot()

Here the feature extraction functions for the EEG data are defined.

In [18]:
def band_powers_ratios(raw_frame, explicit=True, sfreq=250): # raw_frame is (n_channels, n_samples)
    """Compute global band powers (in dB) and band-power ratios for one EEG segment.

    The power spectral density (PSD) of every channel is estimated with Welch's
    method, averaged inside each frequency band and then averaged across
    channels, which gives one global value per band.

    Parameters
    ----------
    raw_frame : pandas.DataFrame or array-like, shape (n_channels, n_samples)
        EEG segment with one row per channel.
    explicit : bool, default True
        If True, print the shape and the content of the resulting frame.
    sfreq : float, default 250
        Sampling frequency of ``raw_frame`` in Hz.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Delta``, ``Theta``, ``Alpha``, ``Beta``
        and ``Gamma`` (10 * log10 of the mean PSD, i.e. dB relative to the
        squared input unit per Hz), followed by ``Theta/Alpha``, ``Theta/Beta``
        and ``Delta/Alpha`` (ratios of the linear band powers).
    """
    # welch() operates on plain arrays; the dtype of the input is left untouched
    samples = np.asarray(raw_frame)
    freqs, psd = welch(samples, fs=sfreq, nperseg=min(1000, samples.shape[1]), axis=1)

    # Frequency ranges (Hz) of the five main brain-wave bands
    freq_bands = {
        "Delta": [1, 4],
        "Theta": [4, 8],
        "Alpha": [8, 13],
        "Beta":  [13, 30],
        "Gamma": [30, 45]
        }

    # Global average band power for the entire segment
    powers = {}
    for band, (low, high) in freq_bands.items():
        in_band = (freqs >= low) & (freqs <= high)
        # Average over the frequency bins of the band -> shape (n_channels,)
        band_psd = np.mean(psd[:, in_band], axis=1)
        # ... then over channels to obtain one global value (linear power, input units squared per Hz).
        # Kept as a NumPy scalar so that a zero denominator below yields inf/nan instead of raising.
        powers[band] = np.float64(np.mean(band_psd))

    # Ratios between the global linear band powers
    ratios = {
        "Theta/Alpha": powers["Theta"] / powers["Alpha"], # Classic Dementia biomarker
        "Theta/Beta":  powers["Theta"] / powers["Beta"], # Correlates with cognitive decline
        "Delta/Alpha": powers["Delta"] / powers["Alpha"] # Also considered a good indicator of cognitive decline
    }

    # Convert the band powers to decibels.
    # The powers are only floored at the smallest positive float to avoid log10(0).
    # An additive offset such as 1e-12 must not be used here: when the signal is
    # expressed in volts (presumably the case for data loaded through MNE - verify
    # for other loaders) EEG band powers are of the order of 1e-12 or smaller, so
    # such an offset pins weak bands (e.g. Gamma) at roughly -120 dB.
    floor = np.finfo(np.float64).tiny
    powers_db = {band: 10 * np.log10(np.maximum(val, floor)) for band, val in powers.items()}

    merged = {**powers_db, **ratios}
    merged_dataframe = pd.DataFrame([merged])

    if explicit:
        print(merged_dataframe.shape)
        print(merged_dataframe)

    return merged_dataframe


# Discrete Wavelet Transform - decompose a signal into different frequency components at different time scales
# Unlike Fourier Transform, it can tell you at what time a frequency exists in the signal
def db_wavelet_features(raw_frame, wavelet="db4", level=4, explicit=True):
    """Compute channel-averaged wavelet RMS features and wavelet entropy.

    Every channel is decomposed with a Discrete Wavelet Transform
    (``pywt.wavedec``). For each coefficient array the Root Mean Square (RMS)
    is taken, and the Shannon entropy of the relative energy distribution over
    the coefficient arrays is added. The per-channel feature vectors are then
    averaged across channels.

    Parameters
    ----------
    raw_frame : pandas.DataFrame or array-like, shape (n_channels, n_samples)
        EEG segment with one row per channel.
    wavelet : str, default "db4"
        Wavelet passed to ``pywt.wavedec``. The 4th-order Daubechies wavelet is
        used by default as a balance between smoothness, localization and
        computational cost.
    level : int, default 4
        Decomposition level passed to ``pywt.wavedec``.
    explicit : bool, default True
        If True, print the shape and the content of the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``cA<n>_RMS``, ``cD<n>_RMS`` ... ``cD1_RMS``
        and ``wavelet_entropy``, where ``n`` is the number of detail levels
        actually returned by the decomposition (4 with the defaults).

    Raises
    ------
    ValueError
        If ``raw_frame`` contains no channels.
    """
    samples = np.asarray(raw_frame, dtype=np.float32)
    if samples.shape[0] == 0:
        raise ValueError("raw_frame must contain at least one channel")

    # The transform works on 1D data, so features are collected channel by channel
    all_channel_features = []
    n_levels = 0

    for channel_data in samples:
        # wavedec returns [cA_n, cD_n, ..., cD_1]: the approximation (slow,
        # overall trend) first, then the details from the slowest to the
        # fastest band. This is exactly the order of the labels below, so the
        # list must NOT be reversed.
        coefs = pywt.wavedec(channel_data, wavelet=wavelet, level=level)
        n_levels = len(coefs) - 1

        # RMS is directly related to signal power: it measures how much
        # power/energy is carried by each coefficient array.
        channel_features = [np.sqrt(np.mean(np.square(c))) for c in coefs]

        # Wavelet entropy: how evenly the energy is distributed across the
        # coefficient arrays (i.e. across frequency bands).
        energies = np.array([np.sum(np.square(c)) for c in coefs], dtype=np.float64)
        total_energy = energies.sum()
        if total_energy > 0:
            distribution = energies / total_energy
            # Zero-energy bands contribute nothing (p * log2(p) -> 0 as p -> 0);
            # excluding them avoids 0 * -inf = NaN.
            nonzero = distribution[distribution > 0]
            entropy = -np.sum(nonzero * np.log2(nonzero))
        else:
            # Completely flat signal: no energy to distribute
            entropy = 0.0
        channel_features.append(entropy)

        all_channel_features.append(channel_features)

    # Average features across all channels
    avg_features = np.mean(all_channel_features, axis=0)

    # Feature labels follow the wavedec ordering and the actual number of levels
    labels = [f"cA{n_levels}_RMS"]
    labels.extend(f"cD{i}_RMS" for i in range(n_levels, 0, -1))
    labels.append("wavelet_entropy")

    # Combine the data with the labels to create a dataframe
    final = pd.DataFrame(data=[avg_features], columns=labels)

    if explicit:
        print(final.shape)
        print(final)

    return final

def other_metrics(raw_frame, explicit=True, sfreq=250):
    """Compute entropy, fractal and Hjorth features, globally and per head section.

    Parameters
    ----------
    raw_frame : pandas.DataFrame, shape (n_channels, n_samples)
        EEG segment with one row per channel. The row positions must match the
        electrode indices listed in ``head_sections`` below.
    explicit : bool, default True
        If True, print the shape and the content of the resulting frame.
    sfreq : float, default 250
        Sampling frequency in Hz, passed to ``ant.spectral_entropy``.

    Returns
    -------
    pandas.DataFrame
        One row with 4 global columns (``global_spectral_entropy``,
        ``global_permutation_entropy``, ``global_hjorth_mobility``,
        ``global_hjorth_complexity``) followed by 7 columns per head section
        (``<section>_sample_entropy``, ``_higuchi``, ``_DFA``,
        ``_spectral_entropy``, ``_permutation_entropy``, ``_hjorth_mobility``,
        ``_hjorth_complexity``). A metric that could not be computed for any
        electrode of its group is reported as 0.

    Raises
    ------
    IndexError
        If ``raw_frame`` has fewer rows than the electrode indices require.
    """
    import warnings  # local import: only needed by this function

    # Assign all of the electrodes to five main head areas
    head_sections = {
        'frontal': [0, 1, 2, 3, 10, 11, 16],      # Fp1, Fp2, F3, F4, F7, F8, Fz
        'central': [4, 5, 17],                     # C3, C4, Cz
        'temporal': [12, 13, 14, 15],              # T3, T4, T5, T6
        'parietal': [6, 7, 18],                    # P3, P4, Pz
        'occipital': [8, 9]                        # O1, O2
    }

    def _attempt(metric_name, func, *args, **kwargs):
        """Run one metric; on failure emit a warning and return None so it is skipped."""
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            # Deliberately not a bare `except`: KeyboardInterrupt must still stop
            # a long-running extraction, and failures should be visible.
            warnings.warn(f"{metric_name} could not be computed: {exc!r}", RuntimeWarning)
            return None

    def _hjorth(signal):
        """Return (mobility, complexity) of one signal."""
        mobility, complexity = ant.hjorth_params(signal)
        return mobility, complexity

    def _signal(idx):
        """Return the samples of electrode `idx` as a contiguous float32 array."""
        return np.ascontiguousarray(raw_frame.iloc[idx, :].values.astype(np.float32))

    def _mean_or_nan(values):
        """Mean of the successfully computed values, NaN when there are none."""
        kept = [v for v in values if v is not None]
        return np.mean(kept) if kept else np.nan

    # Metrics needed both globally and per section are computed once per channel:
    # - spectral entropy: how evenly power is distributed across frequencies
    # - permutation entropy: complexity of the ordering patterns of consecutive samples
    # - Hjorth mobility: a measure of the "mean frequency" of the signal
    # - Hjorth complexity: how much the signal deviates from a simple sine wave
    per_channel = []
    for ch_idx in range(raw_frame.shape[0]):
        signal = _signal(ch_idx)
        hjorth = _attempt("hjorth_params", _hjorth, signal)
        per_channel.append({
            'spectral_entropy': _attempt("spectral_entropy", ant.spectral_entropy, signal, sf=sfreq),
            'permutation_entropy': _attempt("perm_entropy", ant.perm_entropy, signal),
            'mobility': None if hjorth is None else hjorth[0],
            'complexity': None if hjorth is None else hjorth[1],
        })

    shared_metrics = ['spectral_entropy', 'permutation_entropy', 'mobility', 'complexity']

    # Global averages across all channels
    features = [_mean_or_nan(ch[name] for ch in per_channel) for name in shared_metrics]
    labels = [
        "global_spectral_entropy",
        "global_permutation_entropy",
        "global_hjorth_mobility",
        "global_hjorth_complexity"
    ]

    # Features per head section
    for section, indices in head_sections.items():
        # Sample entropy, Higuchi fractal dimension and DFA are only computed per
        # section: they are mainly sensitive to localized behavior, and averaging
        # them across the entire scalp would obscure regional variations.
        regional = {
            'sample_entropy': [],  # how often similar patterns repeat in the signal
            'higuchi': [],         # Higuchi Fractal Dimension: roughness / self-similarity across time scales
            'dfa': [],             # Detrended Fluctuation Analysis: long-range correlations
        }

        for idx in indices:
            # Raises IndexError when the electrode is missing from raw_frame
            signal = _signal(idx)
            regional['sample_entropy'].append(_attempt("sample_entropy", ant.sample_entropy, signal))
            regional['higuchi'].append(_attempt("higuchi_fd", ant.higuchi_fd, signal))
            regional['dfa'].append(_attempt("detrended_fluctuation", ant.detrended_fluctuation, signal))

        # Section averages, in the same order as the labels below
        features.extend([
            _mean_or_nan(regional['sample_entropy']),
            _mean_or_nan(regional['higuchi']),
            _mean_or_nan(regional['dfa']),
        ])
        features.extend(
            _mean_or_nan(per_channel[idx][name] for idx in indices) for name in shared_metrics
        )

        labels.extend([
            f"{section}_sample_entropy",
            f"{section}_higuchi",
            f"{section}_DFA",
            f"{section}_spectral_entropy",
            f"{section}_permutation_entropy",
            f"{section}_hjorth_mobility",
            f"{section}_hjorth_complexity"
        ])

    # Replace NaN with 0
    # NOTE(review): this makes a failed metric indistinguishable from a true 0 - confirm this is intended.
    features = [0 if np.isnan(x) else x for x in features]

    final = pd.DataFrame([features], columns=labels)

    if explicit:
        print(f"Other metrics shape: {final.shape}")
        print(final)

    return final
    
# Combines the three feature-extraction functions above and returns the complete feature set for one data sample
def overall_features(raw_frame, explicit=False):
    """Run all feature extractors on one segment and join their columns.

    Parameters
    ----------
    raw_frame : pandas.DataFrame
        EEG segment handed unchanged to every extractor.
    explicit : bool, default False
        Forwarded to the extractors; when True the merged shape and the list of
        column names are printed as well.

    Returns
    -------
    pandas.DataFrame
        The extractor outputs concatenated column-wise, in the order band
        powers/ratios, wavelet features, other metrics.
    """
    extractors = (band_powers_ratios, db_wavelet_features, other_metrics)
    parts = [extract(raw_frame, explicit=explicit) for extract in extractors]
    merged = pd.concat(parts, axis=1)

    if not explicit:
        return merged

    print(merged.shape)
    print()
    print(list(merged.columns))
    return merged

Next, a function is defined that walks through the subject subdirectories, loads each EEG file and converts it to a pd.DataFrame. Each subject's recording is then split into 4 s chunks and the features are computed on each chunk. The chunk-level features are then averaged per subject, which gives the final feature vector for that subject. This is done because EEG is highly dynamic, so features computed over an entire long recording would not capture anything relevant. 
The final result is the concatenation of the feature vectors of all subjects. 

In [24]:
def processing(data_root="Alzheimer-Frontotemporal-healthy/derivatives/", chunk_samples=1000, expected_sfreq=250):
    """Extract one averaged feature row per subject from all EEG recordings.

    For every ``sub-*`` entry below ``data_root`` the eyes-closed ``.set`` file
    is loaded, cut into consecutive chunks of ``chunk_samples`` samples, the
    features of every full chunk are computed with ``overall_features`` and the
    chunk-level features are averaged.

    Parameters
    ----------
    data_root : str, default "Alzheimer-Frontotemporal-healthy/derivatives/"
        Directory that contains the subject folders, relative to the current
        working directory (or absolute).
    chunk_samples : int, default 1000
        Number of samples per chunk (4 s at 250 Hz).
    expected_sfreq : float, default 250
        Sampling frequency the feature functions assume. A recording with a
        different rate is resampled to this rate before chunking, because the
        frequency bands would otherwise be mislabelled.

    Returns
    -------
    pandas.DataFrame
        One row per processed subject, in sorted folder order, indexed by the
        subject folder name so each row can be traced back to its subject.

    Raises
    ------
    ValueError
        If no subject produced any features.
    """
    power_cols = ["Delta", "Theta", "Alpha", "Beta", "Gamma"]
    # Only guards against log10(0). An additive 1e-12 would bias the result when
    # the linear band powers are themselves of that order of magnitude.
    db_floor = np.finfo(np.float64).tiny

    # Create a sorted list of the paths to each subject folder
    sub_folders = sorted(
        os.path.join(os.getcwd(), data_root, sub_id)
        for sub_id in os.listdir(data_root)
        if "sub-" in sub_id
    )

    all_data = []

    for subject in sub_folders:
        # Take the time to understand how fast processing one subject is
        start_time = time.time()

        subject_id = os.path.basename(subject)
        set_path = os.path.join(subject, "eeg", f"{subject_id}_task-eyesclosed_eeg.set")
        print(set_path)

        # If there is a missing file, skip this subject
        if not os.path.exists(set_path):
            print(f"MISSING FILE ----------------------: {subject_id}")
            continue

        # Load the EEG file
        raw = mne.io.read_raw_eeglab(set_path, preload=True)

        # The feature functions are written for `expected_sfreq`; enforce it
        recorded_sfreq = raw.info["sfreq"]
        if recorded_sfreq != expected_sfreq:
            print(f"{subject_id}: resampling from {recorded_sfreq} Hz to {expected_sfreq} Hz")
            raw.resample(expected_sfreq)

        # Convert to a Pandas dataframe, shape (n_channels, n_samples)
        pd_frame = pd.DataFrame(raw.get_data(picks="eeg"))

        # Only full chunks are used; a shorter remainder at the end of the recording is dropped
        n_samples = pd_frame.shape[1]
        n_full_chunks = n_samples // chunk_samples
        skipped_chunks = 1 if n_samples % chunk_samples else 0

        all_data_one_subject = [
            overall_features(
                pd_frame.iloc[:, i * chunk_samples:(i + 1) * chunk_samples].astype(np.float32)
            )
            for i in range(n_full_chunks)
        ]

        # Recording shorter than one chunk
        if len(all_data_one_subject) == 0:
            print(f"No valid chunks found for {subject_id}, skipping.")
            continue

        # Concatenate all chunk-level features (chunks, features)
        subject_features = pd.concat(all_data_one_subject, axis=0)

        # Mean feature values across all chunks for this subject, shape (1, num_features)
        final_one_subject = subject_features.mean(axis=0).to_frame().T

        # Band powers are stored in dB: average them in linear space, then convert back to dB
        for col in power_cols:
            mean_linear = (10 ** (subject_features[col] / 10)).mean()
            final_one_subject[col] = 10 * np.log10(np.maximum(mean_linear, db_floor))

        # Keep track of which subject the row belongs to
        final_one_subject.index = [subject_id]

        # Add to list of data for all subjects
        all_data.append(final_one_subject)

        end_time = time.time()
        print(f"Finished processing: {subject_id}")
        print(f"Skipped {skipped_chunks} segments that were shorter than {chunk_samples / expected_sfreq:g}s at the end of the recordings.")
        print(f"{subject_id} processed in {end_time-start_time} seconds\n\n\n\n")
        print("\n\n\n\n-----------------------------------------------------------------------------------------------------------------------------------------------------\n\n\n\n")

    if not all_data:
        raise ValueError(f"No subject could be processed below {data_root!r}")

    return pd.concat(all_data, axis=0)

# If suppression of output is desired, uncomment the below line. 
# mne.set_log_level('WARNING')  # Or 'ERROR' to suppress even more output 

Compute features from preprocessed data. 

In [25]:
# Run the feature extraction for all subjects and store the result without the index column
complete_set = processing()
complete_set.to_csv(path_or_buf="all_features_new.csv", index=False, header=True)

Perform some analysis of the newly-computed features. 

In [26]:
# Reload the file written by the extraction step above ("all_features_new.csv"),
# so that the analysis below always describes the features that were just computed.
all_features = pd.read_csv("all_features_new.csv")
all_features.describe()

In [29]:
# Print every feature of the row at position 1 as "name -> Value: value"
my_row = all_features.iloc[1]
my_row_dict = my_row.to_dict()

for feature_name in my_row_dict:
    print(f"{feature_name} -> Value: {my_row_dict[feature_name]}")

In [31]:
# Standard deviation of every feature across subjects (column-wise)
all_features.std(axis=0)

In [32]:
# Check the coefficient of variation (CV = std / |mean|) of the features.
#   CV > 0.2   : good variation, keep feature
#   CV 0.1-0.2 : low variation, borderline useful
#   CV < 0.1   : very low variation, likely not useful
# NOTE(review): the upper threshold is assumed to be 0.2 so that the three ranges are contiguous - confirm.
feature_std = all_features.std()
feature_abs_mean = all_features.mean().abs()
cv = feature_std / feature_abs_mean
print(cv[['cA4_RMS', 'cD4_RMS', 'cD3_RMS', 'cD2_RMS', 'cD1_RMS']])

In [33]:
# Drop cD1_RMS from the reloaded frame AND from the in-memory feature set:
# the cleaned CSV further below is written from `complete_set`, so dropping the
# column only from `all_features` would never reach the saved file.
all_features = all_features.drop(columns="cD1_RMS")
complete_set = complete_set.drop(columns="cD1_RMS")

In [35]:
# Gamma shows very low variance (a practically constant value), so inspect its range before removing it
gamma_values = complete_set['Gamma']
print(f"Gamma min: {gamma_values.min()}")
print(f"Gamma max: {gamma_values.max()}")

# Remove the Gamma column from the feature set
complete_set = complete_set.drop(columns=["Gamma"])

In [36]:
# Save the cleaned feature set; the DataFrame index is written as an unnamed first column
complete_set.to_csv(path_or_buf="Alzheimer-Frontotemporal-healthy/all_features_cleaned.csv", index=True)

# Finally, combine the individual subject information gathered in the study with the features computed.

In [38]:
all_features_cleaned = pd.read_csv("Alzheimer-Frontotemporal-healthy/all_features_cleaned.csv")
sub_info = pd.read_csv("Alzheimer-Frontotemporal-healthy/participants.tsv", sep="\t")

# The cleaned CSV was saved together with the DataFrame index, which is read back
# as the column "Unnamed: 0"; remove it if it is present.
all_features_cleaned = all_features_cleaned.drop(columns="Unnamed: 0", errors="ignore")

# The two tables are joined purely by row position, so they must describe the same
# subjects in the same order. A differing row count (e.g. a subject that was skipped
# during feature extraction) would silently attach features to the wrong participant.
if len(sub_info) != len(all_features_cleaned):
    raise ValueError(
        f"participants.tsv has {len(sub_info)} rows but the feature file has "
        f"{len(all_features_cleaned)}; the rows cannot be matched by position"
    )

final = pd.concat([sub_info, all_features_cleaned], axis=1)
final.to_csv("Alzheimer-Frontotemporal-healthy/sub_info_and_features.csv", index=False)