In [1]:
import numpy as np
import pandas as pd


# Resulting parquet files with features

You can download the parquet files here:

train:
  https://www.dropbox.com/scl/fi/sutaus5q4q1b79w14kky5/train_features_from_kaggle_spec.parquet?rlkey=eggkhu9d94f9aunrjan3yenlm&dl=0
  
test: 
 https://www.dropbox.com/scl/fi/v48xs7memff4wq6avujqz/test_features_from_kaggle_spec.parquet?rlkey=e7tfd0bc1xdrtdqz212wnmkk9&dl=0 

In [4]:
spec_path='../../hms-harmful-brain-activity-classification/train_spectrograms'

In [5]:
#getting the spectrogram frequencies and zones
def clean(s: str):
    return s.split('_')

spec=pd.read_parquet(f'{spec_path}/353733.parquet')
spec_cols=spec.columns[1:]
zones=['LL', 'RL', 'LP', 'RP']

#for c in spec.columns[1:]:
d={'LL':[], 'RL':[], 'LP':[], 'RP':[]}
for c in spec.columns[1:]:
   area, ampl=clean(c)
   d[area].append(float(ampl))


# Getting the features for the train set

In [8]:
df=pd.read_csv('../data/train_final_less_filtering.csv', index_col=0)
df.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,eeg_50sec_nan_row_count,eeg_10sec_nan_row_count,spectrogram_600sec_nan_row_count,spectrogram_10sec_nan_row_count,total_votes
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0,0,0,0,0,3
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0,0,0,0,0,3
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0,0,0,0,0,3
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0,0,0,0,0,3
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0,0,0,0,0,3


In the following, we get the numpy array of all the features. the spectrograms that have a lot of NaN values raise an exception (the spectrogram entry that caused an exception is printed) 

In [None]:
features=[]
features+=[f'total_pw_{z}' for z in zones] #total power P= \sum s_k^2 , where s_k are the amplitutes of the Fourier transform
                                                       #relative power contained in
features+=[f'spec_band_pw_leq_4hz_{z}' for z in zones] #band delta
features+=[f'spec_band_pw_leq_8hz_{z}' for z in zones] #band theta
features+=[f'spec_band_pw_leq_13hz_{z}' for z in zones] #band alpha
features+=[f'spec_band_pw_leq_30hz_{z}' for z in zones]  #beta
features+=[f'spec_edge_fr_{z}' for z in zones] #min frequency up to which 50% the spectral power up to 40 Hz
features+= [f'{c}_max_10m' for c in spec_cols] #maxs over 10 min 
features+= [f'{c}_max_20s' for c in spec_cols] #maxs over 20 sec
features+=[f'{c}_mean_10m' for c in spec_cols] #means over 10 min as done in Catboost Starter https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-60
features+= [f'{c}_mean_20s' for c in spec_cols] #means over 20 sec as in Catboost starter
features+= [f'{c}_min_10m' for c in spec_cols] #mins over 10 min as in Catboost starter
features+= [f'{c}_min_20s' for c in spec_cols] #mins over 20 sec as in Catboost starter
           # 
           # 'spec_band_pw_leq_100hz':[], #we don't have these in kaggle spectrograms

feature_data=np.zeros((len(df), len(features)))
for i, (index, row) in enumerate(df.iterrows()):
     if i%100==0:
          print(f'processing {i}')
     try:
          spec=pd.read_parquet(f'{spec_path}/{row.spectrogram_id}.parquet')
          offset=row.spectrogram_label_offset_seconds//2
          feature_data[i][0:4]=np.array([spec.iloc[int(offset)+149, j*100+1:j*100+101].sum() 
                                             for j, z in enumerate(zones)]) #total power
          feature_data[i][4:8]=np.array([spec.iloc[int(offset)+149, j*100+1:j*100+19].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_4hz
          feature_data[i][8:12]=np.array([spec.iloc[int(offset)+149, j*100+19:j*100+39].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_8hz
          feature_data[i][12:16]=np.array([spec.iloc[int(offset)+149, j*100+39:j*100+65].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_13hz
          feature_data[i][16:20]=np.array([spec.iloc[int(offset)+149, j*100+65:j*100+101].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_30hz
          feature_data[i][20:24]=np.array([ d[z][np.argwhere(spec.iloc[int(offset)+149, j*100+1:j*100+101].cumsum()>feature_data[i][j]*0.5)[0,0]] 
                                             for j, z in enumerate(zones)]) #spec_edge_fr

          feature_data[i][24:424]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].max(axis='index'))   #maxs over 10 min 
          feature_data[i][424:824]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].max(axis='index'))   #maxs over 20 sec
          feature_data[i][824:1224]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].mean(axis='index')) #means over 10 min
          feature_data[i][1224:1624]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].mean(axis='index'))  #means over 20 sec
          feature_data[i][1624:2024]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].min(axis='index')) #mins over 10 min
          feature_data[i][2024:2424]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].mean(axis='index')) #mins over 20 sec
     except:
          print(f'index {i} \n {row}')

     
     

In [7]:
features_df=pd.DataFrame(index=df.index, columns=features, data=feature_data)

Saving the features

In [9]:
#features_df.to_csv('train_features_from_kaggle_spec.csv')

the dataframe with features

In [None]:

df=pd.read_csv('test_final_less_filtering.csv', index_col=0)


# Extracting features for test dataset

In [None]:
features=[]
features+=[f'total_pw_{z}' for z in zones] #total power P= \sum s_k^2 , where s_k are the amplitutes of the Fourier transform
                                                       #relative power contained in
features+=[f'spec_band_pw_leq_4hz_{z}' for z in zones] #band delta
features+=[f'spec_band_pw_leq_8hz_{z}' for z in zones] #band theta
features+=[f'spec_band_pw_leq_13hz_{z}' for z in zones] #band alpha
features+=[f'spec_band_pw_leq_30hz_{z}' for z in zones]  #beta
features+=[f'spec_edge_fr_{z}' for z in zones] #min frequency up to which 50% the spectral power up to 40 Hz
features+= [f'{c}_max_10m' for c in spec_cols] #maxs over 10 min 
features+= [f'{c}_max_20s' for c in spec_cols] #maxs over 20 sec
features+=[f'{c}_mean_10m' for c in spec_cols] #means over 10 min as done in Catboost Starter https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-60
features+= [f'{c}_mean_20s' for c in spec_cols] #means over 20 sec as in Catboost starter
features+= [f'{c}_min_10m' for c in spec_cols] #mins over 10 min as in Catboost starter
features+= [f'{c}_min_20s' for c in spec_cols] #mins over 20 sec as in Catboost starter
           # 
           # 'spec_band_pw_leq_100hz':[], #we don't have these in kaggle spectrograms

feature_data=np.zeros((len(df), len(features)))
for i, (index, row) in enumerate(df.iterrows()):
     if i%100==0:
          print(f'processing {i}')
     try:
          spec=pd.read_parquet(f'{spec_path}/{row.spectrogram_id}.parquet')
          offset=row.spectrogram_label_offset_seconds//2
          feature_data[i][0:4]=np.array([spec.iloc[int(offset)+149, j*100+1:j*100+101].sum() 
                                             for j, z in enumerate(zones)]) #total power
          feature_data[i][4:8]=np.array([spec.iloc[int(offset)+149, j*100+1:j*100+19].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_4hz
          feature_data[i][8:12]=np.array([spec.iloc[int(offset)+149, j*100+19:j*100+39].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_8hz
          feature_data[i][12:16]=np.array([spec.iloc[int(offset)+149, j*100+39:j*100+65].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_13hz
          feature_data[i][16:20]=np.array([spec.iloc[int(offset)+149, j*100+65:j*100+101].sum() 
                                             for j, z in enumerate(zones)])/feature_data[i][0:4] #spec_band_pw_leq_30hz
          feature_data[i][20:24]=np.array([ d[z][np.argwhere(spec.iloc[int(offset)+149, j*100+1:j*100+101].cumsum()>feature_data[i][j]*0.5)[0,0]] 
                                             for j, z in enumerate(zones)]) #spec_edge_fr

          feature_data[i][24:424]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].max(axis='index'))   #maxs over 10 min 
          feature_data[i][424:824]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].max(axis='index'))   #maxs over 20 sec
          feature_data[i][824:1224]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].mean(axis='index')) #means over 10 min
          feature_data[i][1224:1624]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].mean(axis='index'))  #means over 20 sec
          feature_data[i][1624:2024]= np.array(spec.iloc[int(offset):int(offset)+300, 1:].min(axis='index')) #mins over 10 min
          feature_data[i][2024:2424]= np.array(spec.iloc[int(offset)+145:int(offset)+155, 1:].mean(axis='index')) #mins over 20 sec
     except:
          print(f'iloc {i} \n index {index} \n {row}')

     
     

In [21]:
features_df_test=pd.DataFrame(index=df.index, columns=features, data=feature_data)
features_df_test


Unnamed: 0,total_pw_LL,total_pw_RL,total_pw_LP,total_pw_RP,spec_band_pw_leq_4hz_LL,spec_band_pw_leq_4hz_RL,spec_band_pw_leq_4hz_LP,spec_band_pw_leq_4hz_RP,spec_band_pw_leq_8hz_LL,spec_band_pw_leq_8hz_RL,...,RP_18.16_min_20s,RP_18.36_min_20s,RP_18.55_min_20s,RP_18.75_min_20s,RP_18.95_min_20s,RP_19.14_min_20s,RP_19.34_min_20s,RP_19.53_min_20s,RP_19.73_min_20s,RP_19.92_min_20s
9,99.370000,106.220001,208.720001,229.770000,0.801147,0.880154,0.761403,0.795491,0.149140,0.083129,...,0.032000,0.033000,0.030,0.024000,0.024000,0.023,0.024000,0.023,0.021,0.021000
10,54.379999,43.680000,185.160001,180.080001,0.707245,0.790522,0.742763,0.742559,0.228209,0.148581,...,0.031000,0.031000,0.028,0.023000,0.023000,0.021,0.023000,0.022,0.020,0.021000
35,226.219999,898.660003,286.770000,1731.910014,0.769472,0.657701,0.703072,0.751967,0.204889,0.291289,...,0.326000,0.287000,0.252,0.233000,0.217000,0.196,0.173000,0.146,0.125,0.109000
89,1477.240002,221.280000,1751.819998,208.800000,0.964752,0.937997,0.964688,0.928400,0.018670,0.049440,...,1.414000,1.469000,1.399,1.315000,1.239000,1.126,0.961000,0.942,0.923,0.912000
107,784.920008,866.530004,607.409998,473.359998,0.528245,0.454225,0.621606,0.642154,0.197893,0.190576,...,0.364000,0.313000,0.287,0.249000,0.357000,0.472,0.521000,0.510,0.483,0.433000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106682,57500.600289,16388.359905,25612.360056,32716.539937,0.752971,0.865010,0.617579,0.900406,0.138077,0.073921,...,19.248999,20.468002,19.260,16.709002,16.539001,14.454,14.282001,14.881,16.114,17.563999
106683,247.540001,153.560001,642.170002,141.530000,0.499838,0.365134,0.664388,0.288843,0.348388,0.342407,...,0.097000,0.095000,0.100,0.102000,0.089000,0.087,0.080000,0.116,0.212,0.204000
106684,211.629999,104.580001,534.749997,87.430000,0.485328,0.433639,0.662366,0.487590,0.363795,0.306273,...,0.094000,0.097000,0.100,0.098000,0.089000,0.089,0.083000,0.131,0.226,0.221000
106685,182.320002,118.750000,413.910000,120.070000,0.518978,0.560421,0.677200,0.572666,0.293166,0.212716,...,0.096000,0.098000,0.097,0.091000,0.086000,0.088,0.080000,0.132,0.238,0.228000


In [22]:
features_df_test.to_csv('test_features_from_kaggle_spec.csv')