In [120]:
import mne
import pandas as pd
import numpy as np
import os
from mne.preprocessing import ICA
import time

from scipy.signal import welch
from scipy.stats import kurtosis, skew
import pywt
import antropy as ant

Firstly, a single EEG is imported to explore its data.

In [None]:
# Trying out the Alzheimer's disease, Frontotemporal dementia and Healthy subjects dataset
import mne

data = mne.io.read_raw_eeglab("Alzheimer-Frontotemporal-healthy/derivatives/sub-001/eeg/sub-001_task-eyesclosed_eeg.set")
print(raw._orig_units)
# data.describe()
# data.plot()

In [None]:
# Trying out the Alzheimer's disease, Frontotemporal dementia and Healthy subjects dataset
import mne
import os

raw = mne.io.read_raw_eeglab("Alzheimer-Frontotemporal-healthy/derivatives/sub-002/eeg/sub-002_task-eyesclosed_eeg.set")
raw.describe()
raw.plot()

Here the feature extraction functions for the raw EEG data are defined.

In [138]:
def band_powers_ratios(raw_frame, explicit=True): # raw frame is 128 x 75250
    # rescale from volts to microvolts, so that final measures are more easily interpretable
    freqs, psd = welch(raw_frame, fs=250, nperseg=min(1000, raw_frame.shape[1]), axis=1)

    freq_bands = {
        "Delta": [1, 4], 
        "Theta": [4, 8],
        "Alpha": [8, 13],
        "Beta":  [13, 30],
        "Gamma": [30, 45]
        }

    # Calculate global average band power for entire sample
    powers = {}
    for band, (l, h) in freq_bands.items():
        mask = (freqs >= l) & (freqs <= h)
        # Take average across frequency bands, and then aggregate across channels to obtain a global metric
        band_psd = np.mean(psd[:, mask], axis=1) # Shape (n_channels,)
        powers[band] = 10 * np.log10(np.mean(band_psd) + 1e-12)  # Avoid log(0) and convert to decibel (standard for PSD)
    
    # Calculate ratios between global average band powers
    powers_linear = {band: 10 ** (val / 10) for band, val in powers.items()}
    ratios = {
        "Theta/Alpha": powers_linear["Theta"] / powers_linear["Alpha"],
        "Theta/Beta":  powers_linear["Theta"] / powers_linear["Beta"],
        "Delta/Alpha": powers_linear["Delta"] / powers_linear["Alpha"]
    }
    
    merged = {**powers, **ratios}
    merged_dataframe = pd.DataFrame([merged])
    
    if explicit:
        print(merged_dataframe.shape)
        print(merged_dataframe)
        
    return merged_dataframe


# Discrete Wavelet Transform
def db_wavelet_features(raw_frame, wavelet="db4", level=4, explicit=True):
    # Store features for each channel, as it works on 1D data
    all_channel_features = []
    
    for ch_idx in range(raw_frame.shape[0]):
        channel_features = []
        channel_data = raw_frame.iloc[ch_idx, :].values.astype(np.float32)
        
        # Perform Discrete Wavelet Transform using Daubechies
        coefs = pywt.wavedec(channel_data, wavelet=wavelet, level=level)
        for i, c in reversed(list(enumerate(coefs))):
            channel_features.extend([np.sqrt(np.mean(np.square(c)))])

        # Extracting global wavelet entropy
        # Calcuate energy per wavelet
        energies = np.array([np.sum(np.square(c)) for c in coefs])
        # See how much they contribute to the overall signal
        distribution = energies / (np.sum(energies)+1e-12)
        # Calculate Shannon entropy on the distribution
        entropy = -np.sum(distribution * (np.log2(distribution)+1e-12))
        channel_features.append(entropy) 
        
        all_channel_features.append(channel_features)

    # Average features across all channels
    avg_features = np.mean(all_channel_features, axis=0)
    
    # Generate feature labels
    labels = []
    labels.extend(["cA4_RMS"])
    labels.extend([f"cD{i}_{'RMS'}" for i in range(4,0,-1)])
    labels.append("wavelet_entropy")

    # Combine the data with the labels to create a dataframe
    final = pd.DataFrame(data=[avg_features], columns=labels)

    if explicit:
        print(final.shape)
        print(final)
    
    return final

def other_metrics(raw_frame, explicit=True):  
    head_sections = {
        'frontal': [0, 1, 2, 3, 10, 11, 16],      # Fp1, Fp2, F3, F4, F7, F8, Fz
        'central': [4, 5, 17],                     # C3, C4, Cz
        'temporal': [12, 13, 14, 15],              # T3, T4, T5, T6
        'parietal': [6, 7, 18],                    # P3, P4, Pz
        'occipital': [8, 9]                        # O1, O2
    }
    
    features = []
    labels = []
    
    # Global metrics (across all electrodes) - averaged across all channels
    spectral_entropies = []
    perm_entropies = []
    mobility_values = []
    complexity_values = []
    
    for ch_idx in range(raw_frame.shape[0]):
        channel_data = raw_frame.iloc[ch_idx,:].values.astype(np.float32)
        
        try:
            spectral_entropies.append(ant.spectral_entropy(channel_data, sf=250))
        except:
            pass
        
        try:
            perm_entropies.append(ant.perm_entropy(channel_data))
        except:
            pass
        
        try:
            m, c = ant.hjorth_params(channel_data)
            mobility_values.append(m)
            complexity_values.append(c)
        except:
            pass
    
    # Add global averages
    features.extend([
        np.mean(spectral_entropies) if spectral_entropies else np.nan,
        np.mean(perm_entropies) if perm_entropies else np.nan,
        np.mean(mobility_values) if mobility_values else np.nan,
        np.mean(complexity_values) if complexity_values else np.nan
    ])
    
    labels.extend([
        "global_spectral_entropy",
        "global_permutation_entropy",
        "global_hjorth_mobility",
        "global_hjorth_complexity"
    ])
    
    # Process each head section
    for section, indices in head_sections.items():
        section_metrics = {
            'sample_entropy': [],
            'higuchi': [],
            'dfa': [],
            'spectral_entropy': [],
            'permutation_entropy': [],
            'mobility': [],
            'complexity': []
        }
        
        for idx in indices:
            electrode_data = raw_frame.iloc[idx, :].values.astype(np.float32)
            electrode_data = np.ascontiguousarray(electrode_data)
            
            try:
                section_metrics['sample_entropy'].append(ant.sample_entropy(electrode_data))
            except:
                pass
            
            try:
                section_metrics['higuchi'].append(ant.higuchi_fd(electrode_data))
            except:
                pass
            
            try:
                section_metrics['dfa'].append(ant.detrended_fluctuation(electrode_data))
            except:
                pass
            
            try:
                section_metrics['spectral_entropy'].append(
                    ant.spectral_entropy(electrode_data, sf=250))
            except:
                pass
            
            try:
                section_metrics['permutation_entropy'].append(
                    ant.perm_entropy(electrode_data))
            except:
                pass
            
            try:
                m, c = ant.hjorth_params(electrode_data)
                section_metrics['mobility'].append(m)
                section_metrics['complexity'].append(c)
            except:
                pass
        
        # Add section averages
        features.extend([
            np.mean(section_metrics['sample_entropy']) 
                if section_metrics['sample_entropy'] else np.nan,
            np.mean(section_metrics['higuchi']) 
                if section_metrics['higuchi'] else np.nan,
            np.mean(section_metrics['dfa']) 
                if section_metrics['dfa'] else np.nan,
            np.mean(section_metrics['spectral_entropy']) 
                if section_metrics['spectral_entropy'] else np.nan,
            np.mean(section_metrics['permutation_entropy']) 
                if section_metrics['permutation_entropy'] else np.nan,
            np.mean(section_metrics['mobility']) 
                if section_metrics['mobility'] else np.nan,
            np.mean(section_metrics['complexity']) 
                if section_metrics['complexity'] else np.nan
        ])
        
        labels.extend([
            f"{section}_sample_entropy",
            f"{section}_higuchi",
            f"{section}_DFA",
            f"{section}_spectral_entropy",
            f"{section}_permutation_entropy",
            f"{section}_hjorth_mobility",
            f"{section}_hjorth_complexity"
        ])
    
    # Replace NaN with 0
    features = [0 if np.isnan(x) else x for x in features]
    
    final = pd.DataFrame([features], columns=labels)
    
    if explicit:
        print(f"Other metrics shape: {final.shape}")
        print(final)
    
    return final
    
def overall_features(raw_frame, explicit=False): # This function combines the prior three feature extaction functions and returns a complete feature set for each data sample
    df1 = band_powers_ratios(raw_frame, explicit=explicit)
    df2 = db_wavelet_features(raw_frame, explicit=explicit)
    df3 = other_metrics(raw_frame, explicit=explicit)

    merged = pd.concat([df1, df2, df3], axis=1)
    
    if explicit:
        print(merged.shape)
        print()
        print(list(merged.columns))
        
    return merged

In [139]:
def processing():
    
    all_data = []
    
    # Create a list of the paths to each subject folder
    sub_folders = []
    for sub_id in os.listdir("Alzheimer-Frontotemporal-healthy/derivatives/"):
        if "sub-" in sub_id:
            sub_folders.append(os.path.join(os.getcwd(), "Alzheimer-Frontotemporal-healthy/derivatives/", sub_id))
    
    # For each subject folder 
    for subject in (sorted(sub_folders)):
        start_time = time.time()
        
        subject_id = os.path.basename(subject)
        edf_path = os.path.join(subject, "eeg", f"{subject_id}_task-eyesclosed_eeg.set")
        print(edf_path)
        
        # If there is a missing file, skip this session
        if not os.path.exists(edf_path):
            print(f"MISSING FILE ----------------------: {subject_id}")
            continue           

        raw = mne.io.read_raw_eeglab(edf_path, preload=True)
        raw_frame = raw.get_data(picks="eeg") # Shape: (19, n_samples)
        pd_frame = pd.DataFrame(raw_frame)
        
        skipped_chunks = 0

            
        # Create 4s segments of data
        all_data_one_subject = []
        n_samples = pd_frame.shape[1]
        for start in range(0, n_samples, 1000):
            end = start + 1000
            chunk = pd_frame.iloc[:, start:end].astype(np.float32)
            if chunk.shape[1] == 1000:
                all_data_one_subject.append(overall_features(chunk))
            else:
                skipped_chunks += 1

        if len(all_data_one_subject) == 0:
            print(f"No valid chunks found for {subject_id}, skipping.")
            continue

        # Concatenate all chunk-level features (chunks, features)
        subject_features = pd.concat(all_data_one_subject, axis=0)
        # Compute mean feature values across all chunks for this subject
        final_one_subject = subject_features.mean(axis=0).to_frame().T  # (1, num_features)
        # Add to list
        all_data.append(final_one_subject)
        
        end_time = time.time()
        print(f"Finished processing: {subject_id}")
        print(f"Skipped {skipped_chunks} segments that were shorter than 4s at the end of the recordings.")
        print(f"{subject_id} processed in {end_time-start_time} seconds\n\n\n\n")
        print("\n\n\n\n-----------------------------------------------------------------------------------------------------------------------------------------------------\n\n\n\n")
        
    return pd.concat(all_data, axis=0)

# mne.set_log_level('WARNING')  # Or 'ERROR' to suppress even more output 

Compute features from preprocessed data. 

In [194]:
complete_set = processing()
complete_set.to_csv("all_features.csv", header=True, index=False)

In [195]:
all_features = pd.read_csv("all_features.csv")
all_features.describe()

Unnamed: 0,Delta,Theta,Alpha,Beta,Theta/Alpha,Theta/Beta,Delta/Alpha,cA4_RMS,cD4_RMS,cD3_RMS,...,parietal_permutation_entropy,parietal_hjorth_mobility,parietal_hjorth_complexity,occipital_sample_entropy,occipital_higuchi,occipital_DFA,occipital_spectral_entropy,occipital_permutation_entropy,occipital_hjorth_mobility,occipital_hjorth_complexity
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,...,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,-103.129312,-111.162507,-115.61976,-118.371847,3.146488,6.275635,20.964946,7.77488e-08,9.773483e-07,6e-06,...,1.549022,0.052425,6.441718,0.20223,1.138733,1.645152,2.737385,1.554504,0.060304,5.762769
std,0.649095,1.76138,1.114195,0.953315,1.212179,3.454211,4.375697,5.15087e-08,3.57945e-07,2e-06,...,0.033278,0.007514,0.824013,0.042561,0.037948,0.056845,0.19475,0.051057,0.01122,1.034896
min,-104.36597,-113.513469,-117.18312,-119.260657,1.153251,2.406482,5.351467,4.136917e-08,6.378793e-07,4e-06,...,1.47679,0.04235,4.212693,0.135519,1.071769,1.416028,2.40099,1.430122,0.04417,3.207159
25%,-103.467207,-112.602107,-116.434366,-118.898763,2.336695,4.106344,18.161454,5.510485e-08,7.730037e-07,5e-06,...,1.531042,0.046973,5.951293,0.170763,1.115019,1.626078,2.577708,1.523003,0.051954,4.961396
50%,-103.255175,-111.590079,-115.749031,-118.6638,2.772265,5.057567,20.915231,6.522598e-08,8.901056e-07,6e-06,...,1.545894,0.05099,6.574165,0.198331,1.133567,1.652868,2.703931,1.555053,0.058367,5.838881
75%,-102.943712,-110.022473,-115.093136,-118.148545,3.599146,7.356218,24.156296,8.230835e-08,1.062063e-06,7e-06,...,1.571056,0.056887,7.089277,0.223833,1.153332,1.68699,2.878192,1.58587,0.066497,6.547607
max,-100.16983,-105.252137,-110.321474,-113.485023,7.611849,24.455624,30.528468,4.631469e-07,2.970969e-06,1.7e-05,...,1.629001,0.080672,7.946976,0.349816,1.259677,1.726333,3.298351,1.68096,0.098988,7.522396


In [196]:
my_row = all_features.iloc[1] 

my_row_dict = my_row.to_dict()

for column_name, value in my_row_dict.items():
    print(f"{column_name} -> Value: {value}")

Delta -> Value: -103.579271494289
Theta -> Value: -110.6226264453298
Alpha -> Value: -116.20858785997082
Beta -> Value: -119.026781491099
Theta/Alpha -> Value: 4.050791410081116
Theta/Beta -> Value: 7.902627690769325
Delta/Alpha -> Value: 20.56709016322568
cA4_RMS -> Value: 4.39660219342386e-08
cD4_RMS -> Value: 7.187799724306387e-07
cD3_RMS -> Value: 4.7548196562274825e-06
cD2_RMS -> Value: 1.201431132358266e-05
wavelet_entropy -> Value: 0.1264133900403976
global_spectral_entropy -> Value: 2.647349549385849
global_permutation_entropy -> Value: 1.537454524203489
global_hjorth_mobility -> Value: 0.0533972755074501
global_hjorth_complexity -> Value: 5.984928131103516
frontal_sample_entropy -> Value: 0.187453946090347
frontal_higuchi -> Value: 1.1410155665397637
frontal_DFA -> Value: 1.6544103327901136
frontal_spectral_entropy -> Value: 2.658856214668956
frontal_permutation_entropy -> Value: 1.558284648267642
frontal_hjorth_mobility -> Value: 0.0564203262329101
frontal_hjorth_complexity -

In [197]:
all_features.std()

Delta                            6.490947e-01
Theta                            1.761380e+00
Alpha                            1.114195e+00
Beta                             9.533148e-01
Theta/Alpha                      1.212179e+00
Theta/Beta                       3.454211e+00
Delta/Alpha                      4.375697e+00
cA4_RMS                          5.150870e-08
cD4_RMS                          3.579450e-07
cD3_RMS                          2.014061e-06
cD2_RMS                          3.014804e-06
wavelet_entropy                  5.095408e-02
global_spectral_entropy          1.491814e-01
global_permutation_entropy       3.322473e-02
global_hjorth_mobility           1.009485e-02
global_hjorth_complexity         7.150698e-01
frontal_sample_entropy           4.321709e-02
frontal_higuchi                  3.336070e-02
frontal_DFA                      6.558983e-02
frontal_spectral_entropy         1.682291e-01
frontal_permutation_entropy      3.557743e-02
frontal_hjorth_mobility          1

In [None]:
"""
Check the coefficient of varation for the data
CV > 2: Good variation, keep feature
CV 0.1-0.2: Low variation, borderline useful
CV < 0.1: Very low variation, likely not useful
"""
cv = all_features.std() / all_features.mean().abs()
print(cv[['cA4_RMS', 'cD4_RMS', 'cD3_RMS', 'cD2_RMS', 'cD1_RMS']])

In [None]:
# Check gamma because of low variance, it has a constant value, need to drop this feature
print(f"Gamma min: {complete_set['Gamma'].min()}")
print(f"Gamma max: {complete_set['Gamma'].max()}")

# Drop Gamma
complete_set = complete_set.drop(columns="Gamma")

In [None]:
all_features = all_features.drop(columns="cD1_RMS")
complete_set.to_csv("Alzheimer-Frontotemporal-healthy/all_features_cleaned.csv")

In [203]:
all_features_cleaned = pd.read_csv("Alzheimer-Frontotemporal-healthy/all_features_cleaned.csv")
sub_info = pd.read_csv("Alzheimer-Frontotemporal-healthy/participants.tsv", sep="\t")
final = pd.concat([sub_info, all_features_cleaned], axis=1)
# final = final.drop(columns='Unnamed: 0') # Random column that appeared
final.to_csv("Alzheimer-Frontotemporal-healthy/sub_info_and_features.csv", index=False)