In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.signal import butter, filtfilt, sosfiltfilt

In [2]:
# === PARAMETERS ===

# Sampling rate of the recordings.
# One data point is recorded every 2 milliseconds, i.e. 500 samples per second.
fs = 500  # Sampling rate in Hz

In [3]:
# Both the low-cut and high-cut band-pass values are taken from the paper
# "Using Machine Learning to assist auditory processing evaluation".

# Low cutoff frequency of the band-pass filter, in Hz
lowcut = 1

# High cutoff frequency of the band-pass filter, in Hz
highcut = 30



In [4]:
# The amplification factor is also taken from the paper
# "Using Machine Learning to assist auditory processing evaluation".

# 30k amplification: every raw sample is multiplied by this value before filtering
amplification_factor = 30000  

In [5]:
# The artifact_threshold value is also taken from the paper
# "Using Machine Learning to assist auditory processing evaluation".
# NOTE(review): this constant is not referenced by the cells visible here;
# confirm whether artifact rejection is applied elsewhere.

# in microvolts
artifact_threshold = 79.2  

In [6]:
def preprocess_data_sample(data, gain=None, low_hz=None, high_hz=None,
                           sample_rate_hz=None, order=5):
    """Amplify a 1-D waveform and band-pass filter it with zero phase shift.

    Parameters
    ----------
    data : array-like
        Samples of one recording. Converted to a float array first, so
        object-dtype input (e.g. values read from a spreadsheet) is accepted.
    gain : float, optional
        Multiplier applied before filtering. Defaults to the notebook-level
        ``amplification_factor``.
    low_hz, high_hz : float, optional
        Band-pass edges in Hz. Default to the notebook-level ``lowcut`` and
        ``highcut``.
    sample_rate_hz : float, optional
        Sampling rate in Hz. Defaults to the notebook-level ``fs``.
    order : int, optional
        Butterworth filter order (default 5).

    Returns
    -------
    numpy.ndarray
        The amplified, filtered signal; same length as ``data``.
    """
    # Fall back to the notebook-level parameters only when not overridden,
    # so a plain ``preprocess_data_sample(data)`` call behaves as before.
    if gain is None:
        gain = amplification_factor
    if low_hz is None:
        low_hz = lowcut
    if high_hz is None:
        high_hz = highcut
    if sample_rate_hz is None:
        sample_rate_hz = fs

    # === STEP 1: Amplification ===
    amplified = np.asarray(data, dtype=float) * gain

    # === STEP 2: Bandpass filtering ===
    # Cutoffs are expressed as a fraction of the Nyquist frequency.
    nyquist = 0.5 * sample_rate_hz
    band = [low_hz / nyquist, high_hz / nyquist]

    # Second-order sections are used instead of (b, a) polynomials: a
    # band-pass whose low edge sits very close to 0 is numerically sensitive
    # in transfer-function form.
    sos = butter(order, band, btype='band', output='sos')

    # Forward-backward filtering, so the result has no phase shift.
    return sosfiltfilt(sos, amplified)

In [7]:
def _parse_condition(label):
    """Split a raw condition label into ``(ear, column_name)``.

    The label is the part of a spreadsheet column header that follows the
    three patient tokens, with ".xml" removed. The markers are checked from
    most to least specific so that "L1"/"R1" are not mistaken for "L"/"R":

    * "L1" -> ("Left",  "<rest>dB_2")
    * "L"  -> ("Left",  "<rest>dB_1")
    * "R1" -> ("Right", "<rest>dB_2")
    * "R"  -> ("Right", "<rest>dB_1")

    A label without any marker is returned unchanged with ``ear`` set to None.
    """
    markers = (
        ("L1", "Left", "_2"),
        ("L", "Left", "_1"),
        ("R1", "Right", "_2"),
        ("R", "Right", "_1"),
    )
    for marker, ear_name, suffix in markers:
        if marker in label:
            return ear_name, label.replace(marker, "").strip() + "dB" + suffix
    return None, label


# Directory that receives one CSV per stimulus frequency.
# Created up front so that to_csv does not fail on a fresh checkout.
output_dir = "../processed_dataset_with_smoothing"
os.makedirs(output_dir, exist_ok=True)

# Loop through each frequency of the data (one Excel sheet per frequency)
for current_sheet in tqdm(["500Hz", "1000Hz", "2000Hz", "4000Hz"]):

    # Load the sheet data
    df = pd.read_excel("../raw_dataset/Cortical_waveforms.xlsx", sheet_name=current_sheet, engine='openpyxl')

    # Remove the first 8 rows in each sheet as they are empty
    df2 = df.loc[8:]

    # Remove the first column as it is empty
    df2 = df2.drop(columns=["Unnamed: 0"])

    # Row 8 holds the real column headers; label its first cell "Time"
    df2.loc[8, "Unnamed: 1"] = "Time"

    # Rename all the columns using the header row
    df2.columns = df2.loc[8].values

    # Remove the row in the data with column names
    df2 = df2.loc[9:]

    # Do not read past this row
    df2 = df2.loc[:475]

    # Use the Time column as the index
    df2.index = df2["Time"]
    df2 = df2.drop(columns=["Time"])

    # Create a list of columns (one per patient / condition recording)
    columns = list(df2.columns)

    # Keep only the samples in the time range 0 to 500 (inclusive)
    mask1 = df2.index >= 0
    mask2 = df2.index <= 500
    new_df = df2[(mask1) & (mask2)]

    # Group the recording columns by patient (first three header tokens) and
    # collect the distinct condition labels (remaining tokens, minus ".xml").
    patient_vs_columns = {}
    unique_dbs = []

    for column in columns:
        tokens = column.split(" ")
        current_patient = " ".join(tokens[0:3])
        dB = " ".join(tokens[3:]).replace(".xml", "")

        if dB not in unique_dbs:
            unique_dbs.append(dB)

        patient_vs_columns.setdefault(current_patient, []).append(column)

    # Prepare column names for the wide DataFrame
    col_names = {_parse_condition(label)[1] for label in unique_dbs}

    # The averaging step below walks the rows two at a time and expects every
    # intensity to contribute a "_1" row immediately followed by its "_2" row.
    # Make sure both members of each pair exist, even if the sheet only holds
    # one of them; the missing one stays empty and is filled from its partner.
    for name in list(col_names):
        if name.endswith(("_1", "_2")):
            col_names.add(name[:-2] + "_1")
            col_names.add(name[:-2] + "_2")

    col_names = list(col_names)
    wide_columns = ["Patient_ID", "Time", "Ear"] + sorted(col_names)

    # Build one empty block of rows per patient and concatenate them once.
    # (DataFrame.append was removed in pandas 2.0 and was quadratic here.)
    patient_frames = []

    for patient_ID in tqdm(patient_vs_columns.keys()):
        temp_df_30Hz = pd.DataFrame(columns=wide_columns)
        temp_df_30Hz["Patient_ID"] = [patient_ID for _ in range(new_df.shape[0])]
        temp_df_30Hz["Time"] = new_df.index
        patient_frames.append(temp_df_30Hz)

    if patient_frames:
        final_df_30Hz = pd.concat(patient_frames, ignore_index=True)
    else:
        final_df_30Hz = pd.DataFrame(columns=wide_columns)

    # Fill the wide DataFrame with the amplified / band-pass filtered data
    for patient_ID, columns_list in tqdm(patient_vs_columns.items()):

        mask_30Hz = final_df_30Hz["Patient_ID"] == patient_ID

        for column in columns_list:
            raw_label = " ".join(column.split(" ")[3:]).replace(".xml", "")
            parsed_ear, col_name = _parse_condition(raw_label)

            # A label without an ear marker keeps the previously seen ear,
            # matching the behaviour this cell has always had.
            if parsed_ear is not None:
                ear = parsed_ear

            filtered_1_30Hz = preprocess_data_sample(new_df[column].values)

            # Set ear and filtered values in the wide DataFrame
            final_df_30Hz.loc[mask_30Hz, "Ear"] = ear
            final_df_30Hz.loc[mask_30Hz, col_name] = filtered_1_30Hz

    # Conditions a patient was never recorded at become all-zero columns;
    # the steps below use "sums to zero" as the marker for "no recording".
    final_df_30Hz.fillna(0, inplace=True)

    # Reshape to one row per (patient, condition) with one column per time
    # point. Rows are collected in a list and turned into a frame once.
    time_cols = [j for j in range(0, 501, 2)]
    long_rows = []

    for patient_id in final_df_30Hz["Patient_ID"].unique():

        mask = final_df_30Hz["Patient_ID"] == patient_id
        ear = final_df_30Hz.loc[mask, "Ear"].unique()[0]

        for column in final_df_30Hz.columns[3:]:
            long_rows.append(
                [patient_id.replace(" ", "_"), column, ear]
                + list(final_df_30Hz.loc[mask, column].values)
            )

    new_df = pd.DataFrame(long_rows, columns=["Patient_ID", "Intensity", "Ear"] + time_cols)

    # Rows come in (first recording, repeat recording) pairs. When only one
    # member of a pair holds data, copy it into the empty one so that the
    # average below equals the single available recording.
    # NOTE(review): assumes an even number of rows; verify if a sheet ever
    # contains condition labels without an L/R marker.
    for i in range(0, len(new_df), 2):

        first_empty = sum(new_df.loc[i, time_cols]) == 0
        second_empty = sum(new_df.loc[i + 1, time_cols]) == 0

        if first_empty and not second_empty:
            new_df.loc[i, time_cols] = new_df.loc[i + 1, time_cols]

        elif second_empty and not first_empty:
            new_df.loc[i + 1, time_cols] = new_df.loc[i, time_cols]

    # Average each pair into a single row; pairs without any data are skipped.
    averaged_rows = []

    for i in range(0, len(new_df), 2):

        first = new_df.loc[i, time_cols]

        if sum(first) == 0:
            continue

        averaged = (first + new_df.loc[i + 1, time_cols]) / 2

        averaged_rows.append(
            [
                new_df.loc[i, "Patient_ID"],
                # Drop the "_1" / "_2" repeat suffix, keeping e.g. "70dB"
                new_df.loc[i, "Intensity"].split("_")[0],
                new_df.loc[i, "Ear"],
            ]
            + list(averaged)
        )

    final_new_df = pd.DataFrame(averaged_rows, columns=["Patient_ID", "Intensity", "Ear"] + time_cols)

    final_new_df.to_csv(f"{output_dir}/{current_sheet}.csv")

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/48 [00:00<?, ?it/s][A
 38%|███▊      | 18/48 [00:00<00:00, 167.30it/s][A
 73%|███████▎  | 35/48 [00:00<00:00, 105.87it/s][A
100%|██████████| 48/48 [00:00<00:00, 89.45it/s] [A

  0%|          | 0/48 [00:00<?, ?it/s][A
  2%|▏         | 1/48 [00:00<00:08,  5.31it/s][A
 21%|██        | 10/48 [00:00<00:00, 40.44it/s][A
 40%|███▉      | 19/48 [00:00<00:00, 56.95it/s][A
 56%|█████▋    | 27/48 [00:00<00:00, 64.34it/s][A
 75%|███████▌  | 36/48 [00:00<00:00, 72.29it/s][A
100%|██████████| 48/48 [00:00<00:00, 63.93it/s][A
 25%|██▌       | 1/4 [00:54<02:42, 54.16s/it]
  0%|          | 0/48 [00:00<?, ?it/s][A
 35%|███▌      | 17/48 [00:00<00:00, 167.97it/s][A
 71%|███████   | 34/48 [00:00<00:00, 116.26it/s][A
100%|██████████| 48/48 [00:00<00:00, 95.41it/s] [A

  0%|          | 0/48 [00:00<?, ?it/s][A
 19%|█▉        | 9/48 [00:00<00:00, 85.72it/s][A
 38%|███▊      | 18/48 [00:00<00:00, 85.89it/s][A
 56%|█████▋    | 27/48 [00:0