In [5]:
import os
import numpy as np
import pandas as pd
import mne
from mne.preprocessing import ICA
from scipy import signal
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
def preprocess_eeg(vhdr_file):
    """
    Preprocess a BrainVision EEG recording and remove ECG/EOG artifacts via ICA.

    Steps, in order: load the recording, drop the 'Mass' channel, tag the
    auxiliary channels (EOG / ECG / EMG), set the standard 10-20 montage,
    apply a 50 Hz notch and a 0.1-40 Hz band-pass, apply an average-reference
    projection, fit ICA on the EEG + EOG channels, exclude the components
    flagged by the ECG and EOG detectors, and apply the ICA.

    Auxiliary channels are treated as optional everywhere: a channel that is
    absent from the recording is neither re-typed nor used for artifact
    detection.

    Parameters
    ----------
    vhdr_file : str
        Path to BrainVision header file.

    Returns
    -------
    raw : mne.io.Raw
        Preprocessed EEG data with the flagged ICA components removed.
    """
    ecg_channel = 'Erbs'
    eog_channels = ['VPVA', 'VNVB', 'HPHL', 'HNHR']

    # 1. Load the raw data
    raw = mne.io.read_raw_brainvision(vhdr_file, preload=True)

    # 2. Drop unnecessary channels if they exist
    if 'Mass' in raw.ch_names:
        raw.drop_channels(['Mass'])

    # 3. Set non-EEG channel types, restricted to channels present in this recording
    non_eeg_channels = {ch: 'eog' for ch in eog_channels}
    non_eeg_channels[ecg_channel] = 'ecg'
    non_eeg_channels['OrbOcc'] = 'emg'
    channel_types = {ch: typ for ch, typ in non_eeg_channels.items() if ch in raw.ch_names}
    if channel_types:
        raw.set_channel_types(channel_types)

    # 4. Set the montage (on_missing='ignore' avoids errors for auxiliary channels)
    montage = mne.channels.make_standard_montage('standard_1020')
    raw.set_montage(montage, on_missing='ignore')

    # 5. Notch filter (50 Hz) on EEG channels, then band-pass filter (0.1 - 40 Hz)
    raw.notch_filter(freqs=50, picks='eeg')
    raw.filter(l_freq=0.1, h_freq=40.0)

    # 6. Set average reference as a projection and apply the projection
    raw.set_eeg_reference('average', projection=True)
    raw = raw.apply_proj()

    # 7. Fit ICA on EEG and EOG channels only (fixed seed for reproducibility)
    ica = ICA(n_components=20, random_state=42, max_iter=800)
    eeg_picks = mne.pick_types(raw.info, eeg=True, eog=True, exclude='bads')
    ica.fit(raw, picks=eeg_picks)

    # 8. Collect artifact components. The detectors are only run against
    #    reference channels that exist, and each component is listed once.
    excluded = []
    if ecg_channel in raw.ch_names:
        ecg_indices, _ecg_scores = ica.find_bads_ecg(raw, ch_name=ecg_channel)
        excluded.extend(ecg_indices)

    present_eog = [ch for ch in eog_channels if ch in raw.ch_names]
    if present_eog:
        eog_indices, _eog_scores = ica.find_bads_eog(raw, ch_name=present_eog)
        excluded.extend(idx for idx in eog_indices if idx not in excluded)

    ica.exclude = excluded

    # 9. Apply ICA to remove the identified artifact components (modifies raw in place)
    raw = ica.apply(raw)
    return raw


In [30]:
# Load a single pre-split FIF segment into memory for interactive inspection.
raw = mne.io.read_raw_fif(
    '/mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88038069_ses-1_task-restEO_eeg_1_eeg.fif',
    preload=True,
)

Opening raw data file /mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88038069_ses-1_task-restEO_eeg_1_eeg.fif...
    Read a total of 1 projection items:
        Average EEG reference (1 x 26) active
    Range : 0 ... 29999 =      0.000 ...    59.998 secs
Ready.
Reading 0 ... 29999  =      0.000 ...    59.998 secs...


In [22]:
import numpy as np
import pandas as pd
import mne
from scipy import signal
from scipy.stats import skew, kurtosis
import os
import bisect

# Participant metadata (tab-separated); read once and shared by get_gender / get_age.
participants_df = pd.read_csv("TDBRAIN_participants_V2_cleaned.tsv", sep="\t")

# Age bins: bin_edges holds the left edge of every bin, and each bin is
# labelled by that same left edge, so labels is simply a copy of bin_edges.
bin_edges = [0, 6, 13, 18, 26, 41, 61]
labels = list(bin_edges)


def get_gender(subject_id, participants=None):
    """
    Look up a subject's gender code in the participants table.

    Parameters
    ----------
    subject_id : str
        Value to match against the "participants_ID" column.
    participants : pd.DataFrame or None
        Table with "participants_ID" and "gender" columns. Defaults to the
        module-level ``participants_df``.

    Returns
    -------
    int
        The stored gender code converted to int (1 = Male, 0 = Female), or -1
        when the subject is not listed or its gender value is missing.
        If a subject appears more than once, the first row is used.
    """
    if participants is None:
        participants = participants_df

    genders = participants.loc[participants["participants_ID"] == subject_id, "gender"]
    if genders.empty:
        return -1  # Subject not present in the table

    value = genders.iloc[0]
    if pd.isna(value):
        return -1  # Listed, but gender was not recorded; int(NaN) would raise
    return int(value)
def get_age(subject_id, participants=None, edges=None, bin_labels=None):
    """
    Look up a subject's age and map it to an age-bin label.

    The age is truncated to an integer and assigned to the last bin whose
    left edge is less than or equal to it.

    Parameters
    ----------
    subject_id : str
        Value to match against the "participants_ID" column.
    participants : pd.DataFrame or None
        Table with "participants_ID" and "age" columns. Defaults to the
        module-level ``participants_df``.
    edges : sequence of numbers or None
        Sorted left edges of the age bins. Defaults to the module-level
        ``bin_edges``.
    bin_labels : sequence or None
        Label for each bin, aligned with ``edges``. Defaults to the
        module-level ``labels``.

    Returns
    -------
    int
        The label of the matching bin, or -1 when the subject is not listed,
        its age is missing, or the age lies below the first bin edge.
    """
    if participants is None:
        participants = participants_df
    if edges is None:
        edges = bin_edges
    if bin_labels is None:
        bin_labels = labels

    ages = participants.loc[participants["participants_ID"] == subject_id, "age"]
    # Previously an unknown subject left `age` unassigned and crashed below.
    if ages.empty or pd.isna(ages.iloc[0]):
        return -1

    age = int(ages.iloc[0])
    idx = bisect.bisect_right(edges, age) - 1
    if idx < 0:
        # Below the first edge; an index of -1 would silently select the last bin.
        return -1
    return bin_labels[idx]



def extract_features(vhdr_path, condition):
    """
    Extract demographic and per-channel EEG features from one preprocessed file.

    Parameters
    ----------
    vhdr_path : str
        Path to the preprocessed recording. Despite the parameter name, the
        file is read with ``mne.io.read_raw_fif``. The subject ID is taken
        from the file name (the text before its first underscore).
    condition : str
        Condition of the EEG recording (e.g. "EO" or "EC"); used as the prefix
        of every per-channel feature name.

    Returns
    -------
    features_df : pd.DataFrame
        Single-row DataFrame with 'gender', 'age', and for every EEG channel:
        mean, std, skew, kurtosis, mean Welch PSD, and per-band FFT and Morlet
        wavelet features.

    Notes
    -----
    * ``*_fft_avg_power`` is the mean FFT *magnitude* in the band (values are
      not squared); it is left unchanged so existing feature files stay
      comparable.
    * ``*_mwt_avg_power`` is the mean Morlet power over all analysed
      frequencies inside the band and over time. Band limits are inclusive,
      so a frequency on a shared boundary contributes to both bands.
    """
    raw = mne.io.read_raw_fif(vhdr_path, preload=True)
    raw.pick('eeg')  # Keep only EEG channels
    subject_id = os.path.basename(vhdr_path).split('_')[0]

    data = raw.get_data()
    sfreq = raw.info['sfreq']
    ch_names = raw.ch_names

    features = {
        'gender': get_gender(subject_id),
        'age': get_age(subject_id),
    }

    # Frequency bands in Hz, inclusive on both ends
    bands = {
        'delta': (0.5, 4),
        'theta': (4, 8),
        'alpha': (8, 13),
        'beta': (13, 30),
        'gamma': (30, 40)
    }
    lowest = min(fmin for fmin, _ in bands.values())
    highest = max(fmax for _, fmax in bands.values())

    # The FFT frequency grid depends only on the recording length and sampling
    # rate, so the band masks are built once rather than once per channel.
    fft_freqs = np.fft.fftfreq(data.shape[1], d=1 / sfreq)
    fft_band_masks = {
        band: np.logical_and(fft_freqs >= fmin, fft_freqs <= fmax)
        for band, (fmin, fmax) in bands.items()
    }

    for i, ch in enumerate(ch_names):
        ch_data = data[i]
        prefix = f"{condition}_{ch.lower()}"

        # Time domain features
        features[f'{prefix}_mean'] = np.mean(ch_data)
        features[f'{prefix}_std'] = np.std(ch_data)
        features[f'{prefix}_skew'] = skew(ch_data)
        features[f'{prefix}_kurtosis'] = kurtosis(ch_data)

        # Frequency domain features (Welch PSD with 2-second segments)
        freqs, psd = signal.welch(ch_data, fs=sfreq, nperseg=int(sfreq * 2))
        features[f'{prefix}_psd_mean'] = np.mean(psd)

        # Band-specific FFT features
        fft_vals = np.abs(np.fft.fft(ch_data))
        for band, mask in fft_band_masks.items():
            features[f'{prefix}_{band}_fft_avg_power'] = np.mean(fft_vals[mask])

        # Band-specific Morlet Wavelet Transform (MWT).
        # Only frequencies that fall inside some band are analysed; wavelets
        # for the rest of the Welch grid would be computed and never used.
        wavelet_freqs = freqs[(freqs > 0) & (freqs >= lowest) & (freqs <= highest)]
        if wavelet_freqs.size:
            # Output is (n_epochs, n_channels, n_freqs, n_times); the single
            # epoch and channel are indexed away explicitly so the frequency
            # axis survives even when only one frequency is analysed.
            mwt = mne.time_frequency.tfr_array_morlet(
                data[np.newaxis, [i], :], sfreq, freqs=wavelet_freqs, n_cycles=2, output='power'
            )[0, 0]

            for band, (fmin, fmax) in bands.items():
                in_band = (wavelet_freqs >= fmin) & (wavelet_freqs <= fmax)
                # Average over every frequency in the band. The previous loop
                # reassigned the key per frequency, so only the band's highest
                # frequency was kept.
                features[f'{prefix}_{band}_mwt_avg_power'] = (
                    np.mean(mwt[in_band]) if in_band.any() else np.nan
                )

    # One row per recording
    features_df = pd.DataFrame([features])

    return features_df


In [23]:
def process_and_combine(eo_file_path, ec_file_path, output_file):
    """
    Extract EO and EC features for one recording pair and save them as one CSV row.

    Parameters
    ----------
    eo_file_path : str
        Path to the eyes-open recording; its features are prefixed "EO".
    ec_file_path : str
        Path to the eyes-closed recording; its features are prefixed "EC".
    output_file : str
        Destination CSV path.

    Returns
    -------
    None
        On success the CSV is written and a confirmation is printed. If
        either extraction fails, the error is printed and nothing is written.

    Notes
    -----
    Columns present in both feature frames (the 'gender' and 'age'
    demographics) are kept once, from the EO frame, so the CSV has no
    duplicate column names.
    """
    # Guard both extractions: a failure on either side skips this pair
    # instead of aborting a whole batch run.
    try:
        features_eo = extract_features(eo_file_path, "EO")
        features_ec = extract_features(ec_file_path, "EC")
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

    shared_columns = [col for col in features_ec.columns if col in features_eo.columns]
    combined_features = pd.concat(
        [features_eo, features_ec.drop(columns=shared_columns)], axis=1
    )

    # Save combined features to a single CSV file
    combined_features.to_csv(output_file, index=False)
    print(f"Features successfully saved to {output_file}")
    return None

In [None]:
# Argument order is (EO file, EC file, output CSV); the EO recording must come
# first so its features receive the "EO" prefix.
# NOTE(review): extract_features loads its input with mne.io.read_raw_fif, so
# these BrainVision .vhdr paths presumably need to point at FIF files - confirm.
process_and_combine(
    "../dataset_s/healthy/sub-87974621/ses-1/eeg/sub-87974621_ses-1_task-restEO_eeg.vhdr",
    "../dataset_s/healthy/sub-87974621/ses-1/eeg/sub-87974621_ses-1_task-restEC_eeg.vhdr",
    "preprocessed.csv",
)

In [27]:
import os

def process_folder(source_folder, destination_folder):
    """
    Processes EO and EC files for all subjects and sessions, saving the features to CSV files.

    Args:
        source_folder (str): Path to the root folder containing subject EEG files.
        destination_folder (str): Path to the folder where CSV files will be saved.
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    eo_path, ec_path = None, None
    ec_file = None  # Initialize ec_file to avoid the UnboundLocalError

    # Iterate through files in the source folder
    for file in os.listdir(source_folder):
        file_path = os.path.join(source_folder, file)

        if file.endswith("EC_eeg_1_eeg.fif"):
            ec_path = file_path
            ec_file = file  # Store EC file name for output filename generation
            output_filename = ec_file.replace("EC_eeg_1_eeg.fif", "eeg_combined_1.csv")
        elif file.endswith("EC_eeg_2_eeg.fif"):
            ec_path = file_path
            ec_file = file
            output_filename = ec_file.replace("EC_eeg_2_eeg.fif", "eeg_combined_2.csv")
        elif file.endswith("EO_eeg_1_eeg.fif"):
            eo_path = file_path
            ec_file = file
            output_filename = ec_file.replace("EO_eeg_1_eeg.fif", "eeg_combined_1.csv")
        elif file.endswith("EO_eeg_2_eeg.fif"):
            eo_path = file_path
            ec_file = file
            output_filename = ec_file.replace("EO_eeg_2_eeg.fif", "eeg_combined_2.csv")
        # Process only when both EO and EC files are found
        if eo_path and ec_path and ec_file and ec_file:
            output_filepath = os.path.join(destination_folder, output_filename)

            print(f"Processing: \n  EO: {eo_path} \n  EC: {ec_path} \n  Output: {output_filepath}")
            process_and_combine(eo_path, ec_path, output_filepath)

            # Reset paths after processing
            eo_path, ec_path, ec_file = None, None, None


In [29]:
# Batch run: extract and combine EO/EC features for every segment in the MDD folder.
process_folder(
    source_folder="/mnt/data/saikrishna/Team_4/split_fif_new/mdd",
    destination_folder="preprocessed_data_new/mdd",
)

Processing: 
  EO: /mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88073433_ses-1_task-restEO_eeg_2_eeg.fif 
  EC: /mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88071949_ses-1_task-restEC_eeg_2_eeg.fif 
  Output: preprocessed_data_new/mdd/sub-88073433_ses-1_task-resteeg_combined_2.csv
Opening raw data file /mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88073433_ses-1_task-restEO_eeg_2_eeg.fif...
    Read a total of 1 projection items:
        Average EEG reference (1 x 26) active
    Range : 30000 ... 59999 =     60.000 ...   119.998 secs
Ready.
Reading 0 ... 29999  =      0.000 ...    59.998 secs...
Opening raw data file /mnt/data/saikrishna/Team_4/split_fif_new/mdd/sub-88071949_ses-1_task-restEC_eeg_2_eeg.fif...
    Read a total of 1 projection items:
        Average EEG reference (1 x 26) active
    Range : 30000 ... 59999 =     60.000 ...   119.998 secs
Ready.
Reading 0 ... 29999  =      0.000 ...    59.998 secs...
Features successfully saved to preprocessed_data_new/mdd/s

KeyboardInterrupt: 