In [109]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import glob 
import os

# Scikit-Learn
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Classifier Models
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import cv

from scipy import stats
from scipy import signal

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import scipy.stats
from scipy.signal import welch
from scipy.stats import skew, kurtosis

import warnings
warnings.filterwarnings('ignore')


In [110]:
# Read the test metadata table; its preview below lists the spectrogram_id,
# eeg_id and patient_id columns used by the later cells.
test = pd.read_csv('../hms-harmful-brain-activity-classification/test.csv')
print('Test shape', test.shape)
test.head()

Test shape (1, 3)


Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


In [111]:
TEST_SPECTROGRAM_PATH = '../hms-harmful-brain-activity-classification/test_spectrograms'

# Index every .parquet file at or below the spectrogram directory.
# os.path.join supplies the separator that the old string concatenation lacked,
# and recursive=True is required for '**' to descend into sub-directories
# (without it '**' behaves like a plain '*').
SPEC_df = pd.DataFrame({
    'path': glob.glob(os.path.join(TEST_SPECTROGRAM_PATH, '**', '*.parquet'), recursive=True)
})
# test_type: last '_'-separated token of the parent directory name
SPEC_df['test_type'] = SPEC_df['path'].str.split('/').str.get(-2).str.split('_').str.get(-1)
# id: file name up to its first '.'
SPEC_df['id'] = SPEC_df['path'].str.split('/').str.get(-1).str.split('.').str.get(0)

# Preview the spectrogram belonging to the first row of `test` instead of a
# hard-coded id, so this cell keeps working when the test set is replaced.
first_spectrogram_id = test['spectrogram_id'].iloc[0]
df_spectrogram = pd.read_parquet(os.path.join(TEST_SPECTROGRAM_PATH, f'{first_spectrogram_id}.parquet'))
df_spectrogram.head()

Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,1,14.91,17.110001,11.66,11.73,6.08,4.54,4.31,3.38,2.05,...,0.07,0.06,0.05,0.06,0.05,0.05,0.06,0.05,0.04,0.05
1,3,11.13,10.95,10.77,5.07,4.03,3.24,3.61,2.98,1.54,...,0.05,0.04,0.04,0.04,0.04,0.04,0.03,0.03,0.03,0.02
2,5,10.88,10.57,8.79,5.33,2.44,1.48,1.83,0.99,0.89,...,0.04,0.04,0.04,0.03,0.03,0.04,0.04,0.05,0.06,0.06
3,7,19.450001,18.200001,17.719999,13.38,4.17,1.88,1.84,1.22,1.27,...,0.03,0.03,0.05,0.08,0.07,0.07,0.08,0.03,0.03,0.03
4,9,21.65,22.530001,23.16,17.0,7.19,3.89,3.65,2.72,2.35,...,0.04,0.04,0.05,0.05,0.06,0.05,0.05,0.05,0.04,0.03


In [112]:
def PCA_FeatureExtraction(spectrogram_df):
    """Summarise a spectrogram table as band-filtered statistics of its first principal component.

    The input is standardised column-wise, projected onto a single PCA
    component, and that component is band-pass filtered once per entry of
    ``eeg_bands``. For every filtered series the mean, standard deviation,
    maximum and minimum are recorded.

    Parameters
    ----------
    spectrogram_df : pandas.DataFrame or 2-D array-like
        Numeric table; every column is used. Callers in this notebook
        mean-impute missing values before passing the frame in.

    Returns
    -------
    pandas.DataFrame
        One row per PCA component (a single row, since one component is kept)
        with columns named ``<Band>_<stat>``, e.g. ``Delta_mean``.
    """
    # Band edges handed to the Butterworth design, interpreted relative to `fs`.
    eeg_bands = {'Delta': (0.5, 4), 'Theta': (4, 8), 'Alpha': (8, 12), 'Beta': (12, 30), 'Gamma': (30, 45)}
    # NOTE(review): the filters treat consecutive rows of the input as samples
    # taken at 200 Hz - confirm this matches the data the model was trained on.
    fs = 200  # Sample rate
    stat_names = ['mean', 'std', 'max', 'min']

    X_scaled = StandardScaler().fit_transform(spectrogram_df)
    # random_state pins the result in case scikit-learn selects its randomized
    # SVD solver for this input size, keeping re-runs reproducible.
    X_pca = PCA(n_components=1, random_state=0).fit_transform(X_scaled)

    # Design each 3rd-order band-pass once; the coefficients do not depend on
    # the data, so there is no need to rebuild them for every component.
    band_filters = [
        signal.butter(3, [low, high], btype='bandpass', fs=fs, output='sos')
        for low, high in eeg_bands.values()
    ]

    def extract_features(segment):
        """Return [mean, std, max, min] of `segment` after each band filter, in band order."""
        features = []
        for sos in band_filters:
            filtered = signal.sosfilt(sos, segment)
            features.extend([np.mean(filtered), np.std(filtered), np.max(filtered), np.min(filtered)])
        return features

    # X_pca is (rows, components); transpose to walk over the components.
    features_list = [extract_features(component) for component in X_pca.T]
    column_names = [f"{band}_{stat}" for band in eeg_bands for stat in stat_names]

    return pd.DataFrame(features_list, columns=column_names)


In [113]:
def dbscan_pca_feature_extraction(df, eps=0.5, min_samples=5, pca_n_components=1):
    """Build PCA summary statistics plus one DBSCAN label for a table.

    The table is standardised column-wise, clustered with DBSCAN and,
    independently, projected with PCA. For each PCA component the mean,
    standard deviation, maximum and minimum over the rows are reported.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Numeric table; every column is used.
    eps, min_samples
        Forwarded unchanged to ``DBSCAN``.
    pca_n_components : int
        Number of PCA components; the result has one row per component.

    Returns
    -------
    pandas.DataFrame
        Columns ``PCA_mean``, ``PCA_std``, ``PCA_max``, ``PCA_min`` and
        ``DBSCAN_Cluster``. The latter holds the label DBSCAN gave to the
        *first row only* (-1 is scikit-learn's noise label), repeated on
        every output row.
    """
    X_scaled = StandardScaler().fit_transform(df)

    # Only the label of the first sample is used below; the cluster count that
    # used to be computed here was never read and has been removed.
    clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

    # random_state pins the result in case scikit-learn selects its randomized
    # SVD solver for this input size, keeping re-runs reproducible.
    X_pca = PCA(n_components=pca_n_components, random_state=0).fit_transform(X_scaled)

    # axis=0 reduces over rows, leaving one value per component.
    features_df = pd.DataFrame({
        'PCA_mean': np.mean(X_pca, axis=0),
        'PCA_std': np.std(X_pca, axis=0),
        'PCA_max': np.max(X_pca, axis=0),
        'PCA_min': np.min(X_pca, axis=0),
    })

    # Scalar assignment: the first sample's label is broadcast to all rows.
    features_df['DBSCAN_Cluster'] = clusters[0]

    return features_df

In [114]:
# Band-statistics features: one PCA_FeatureExtraction row per spectrogram listed in `test`.
All_Spec_df = []
for spectrogram_id in test['spectrogram_id']:
    df_spectrogram = pd.read_parquet(TEST_SPECTROGRAM_PATH + '/' + str(spectrogram_id) + '.parquet')
    print(spectrogram_id)
    # Mean-impute missing values column by column.
    # NOTE(review): a column that is entirely NaN has a NaN mean and stays
    # unfilled - confirm the data never contains one.
    df_spectrogram = df_spectrogram.fillna(df_spectrogram.mean())
    band_features = PCA_FeatureExtraction(df_spectrogram)
    band_features.insert(0, 'spectrogram_id', spectrogram_id)
    All_Spec_df.append(band_features)

# Stack the per-spectrogram rows into a single table with a fresh index.
combined_df_1 = pd.concat(All_Spec_df, ignore_index=True)

combined_df_1.head()

853520


Unnamed: 0,spectrogram_id,Delta_mean,Delta_std,Delta_max,Delta_min,Theta_mean,Theta_std,Theta_max,Theta_min,Alpha_mean,...,Alpha_max,Alpha_min,Beta_mean,Beta_std,Beta_max,Beta_min,Gamma_mean,Gamma_std,Gamma_max,Gamma_min
0,853520,1.121511,3.615552,7.794637,-5.647541,0.0186,1.434875,3.9607,-4.253749,-0.039071,...,2.21719,-3.346877,0.034391,2.297447,8.699139,-9.945384,-0.003853,1.422188,4.983995,-4.737098


In [115]:
# DBSCAN/PCA summary features: one row per spectrogram listed in `test`.
# The loop walks `test` itself; it previously took its length from
# combined_df_1 while indexing `test`, which only lined up because both
# happened to have the same number of rows.
All_Spec_df_2 = []
for spectrogram_id in test['spectrogram_id']:
    df_spectrogram = pd.read_parquet(TEST_SPECTROGRAM_PATH + '/' + str(spectrogram_id) + '.parquet')
    print(spectrogram_id)
    # Mean-impute missing values column by column.
    df_spectrogram = df_spectrogram.fillna(df_spectrogram.mean())
    cluster_features = dbscan_pca_feature_extraction(df_spectrogram)
    cluster_features.insert(0, 'spectrogram_id', spectrogram_id)
    All_Spec_df_2.append(cluster_features)

# Stack the per-spectrogram rows into a single table with a fresh index.
combined_df_2 = pd.concat(All_Spec_df_2, ignore_index=True)
combined_df_2.head()

853520


Unnamed: 0,spectrogram_id,PCA_mean,PCA_std,PCA_max,PCA_min,DBSCAN_Cluster
0,853520,1.017253e-07,9.322651,36.916191,-12.836059,-1


In [116]:
# Join the two per-spectrogram feature tables, then attach the remaining
# columns of `test`. validate='one_to_one' makes pandas raise MergeError when a
# spectrogram_id repeats on either side instead of silently multiplying rows;
# later cells pair predictions with `test` by row position and therefore need
# exactly one row per test row.
combined_df = (
    pd.merge(combined_df_1, combined_df_2, on='spectrogram_id', validate='one_to_one')
    .merge(test, on='spectrogram_id', validate='one_to_one')
)

In [117]:
# Preview the merged spectrogram feature table.
combined_df.head()

Unnamed: 0,spectrogram_id,Delta_mean,Delta_std,Delta_max,Delta_min,Theta_mean,Theta_std,Theta_max,Theta_min,Alpha_mean,...,Gamma_std,Gamma_max,Gamma_min,PCA_mean,PCA_std,PCA_max,PCA_min,DBSCAN_Cluster,eeg_id,patient_id
0,853520,1.121511,3.615552,7.794637,-5.647541,0.0186,1.434875,3.9607,-4.253749,-0.039071,...,1.422188,4.983995,-4.737098,1.017253e-07,9.322651,36.916191,-12.836059,-1,3911565283,6885


In [118]:
# Function to load EEG data
def load_eeg_data(file_path):
    """Load one EEG recording into a DataFrame.

    Paths whose text ends in ``.parquet`` (case-insensitive) are read with
    ``pd.read_parquet``, matching the EEG files used elsewhere in this
    notebook; anything else is handed to ``pd.read_csv`` exactly as before.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Location of the recording.

    Returns
    -------
    pandas.DataFrame
    """
    # str() lets path-like objects through; file-like objects never end in
    # '.parquet' and so keep taking the CSV route.
    if str(file_path).lower().endswith('.parquet'):
        return pd.read_parquet(file_path)
    return pd.read_csv(file_path)

# Time-domain summary of a single PCA component
def extract_pca_time_domain_features(pca_data):
    """Return mean, std, skewness and kurtosis of `pca_data` as plain Python floats.

    Each statistic is reduced along axis 0 and unwrapped with ``.item()``,
    so the reduction must leave exactly one value (e.g. an ``(n, 1)`` array).

    Returns
    -------
    dict
        Keys ``eeg_mean``, ``eeg_std_dev``, ``eeg_skewness``, ``eeg_kurtosis``
        in that order.
    """
    reducers = (
        ('eeg_mean', np.mean),
        ('eeg_std_dev', np.std),
        ('eeg_skewness', skew),
        ('eeg_kurtosis', kurtosis),
    )
    return {key: reduce_fn(pca_data, axis=0).item() for key, reduce_fn in reducers}

In [119]:
# Display the merged feature table in full (no .head() truncation here).
combined_df

Unnamed: 0,spectrogram_id,Delta_mean,Delta_std,Delta_max,Delta_min,Theta_mean,Theta_std,Theta_max,Theta_min,Alpha_mean,...,Gamma_std,Gamma_max,Gamma_min,PCA_mean,PCA_std,PCA_max,PCA_min,DBSCAN_Cluster,eeg_id,patient_id
0,853520,1.121511,3.615552,7.794637,-5.647541,0.0186,1.434875,3.9607,-4.253749,-0.039071,...,1.422188,4.983995,-4.737098,1.017253e-07,9.322651,36.916191,-12.836059,-1,3911565283,6885


In [120]:
# Preview the first rows of the merged feature table.
combined_df.head()

Unnamed: 0,spectrogram_id,Delta_mean,Delta_std,Delta_max,Delta_min,Theta_mean,Theta_std,Theta_max,Theta_min,Alpha_mean,...,Gamma_std,Gamma_max,Gamma_min,PCA_mean,PCA_std,PCA_max,PCA_min,DBSCAN_Cluster,eeg_id,patient_id
0,853520,1.121511,3.615552,7.794637,-5.647541,0.0186,1.434875,3.9607,-4.253749,-0.039071,...,1.422188,4.983995,-4.737098,1.017253e-07,9.322651,36.916191,-12.836059,-1,3911565283,6885


In [121]:
# Spot-check: the eeg_id stored under index label 0 of the merged table.
combined_df['eeg_id'][0]

3911565283

In [122]:
EEG_PATH = '../hms-harmful-brain-activity-classification/test_eegs/'

# One record of time-domain statistics per EEG recording listed in combined_df.
combined_df_3 = []
for eeg_id in combined_df['eeg_id']:
    df_eeg = pd.read_parquet(EEG_PATH + str(eeg_id) + '.parquet')
    print(eeg_id)
    # Mean-impute missing values column by column (new frame, no in-place edit).
    df_eeg = df_eeg.fillna(df_eeg.mean())
    # 'Unnamed: 0' is dropped when present; every other column is standardised.
    eeg_data_scaled = StandardScaler().fit_transform(
        df_eeg.drop(['Unnamed: 0'], axis=1, errors='ignore')
    )
    # Keep a single principal component. random_state pins the result in case
    # scikit-learn selects its randomized SVD solver for this input size, so
    # re-running the notebook reproduces the same features.
    eeg_data_pca = PCA(n_components=1, random_state=0).fit_transform(eeg_data_scaled)

    # eeg_id goes first so it becomes the leading column of the final table.
    combined_df_3.append({"eeg_id": eeg_id, **extract_pca_time_domain_features(eeg_data_pca)})

3911565283


In [123]:
# Turn the list of per-recording feature dicts into a table; the name
# combined_df_3 is rebound from the list to the resulting DataFrame.
combined_df_3 = pd.DataFrame(combined_df_3)
combined_df_3.head()

Unnamed: 0,eeg_id,eeg_mean,eeg_std_dev,eeg_skewness,eeg_kurtosis
0,3911565283,-2.441406e-08,3.686257,0.792493,1.510437


In [124]:
# Attach the EEG statistics by eeg_id, then discard the eeg_id and patient_id columns.
# NOTE(review): spectrogram_id is not dropped and is therefore passed to the
# model as a feature - confirm the model was trained with that column.
combined_df = (
    pd.merge(combined_df, combined_df_3, on='eeg_id')
    .drop(columns=['eeg_id', 'patient_id'])
)
combined_df.head()

Unnamed: 0,spectrogram_id,Delta_mean,Delta_std,Delta_max,Delta_min,Theta_mean,Theta_std,Theta_max,Theta_min,Alpha_mean,...,Gamma_min,PCA_mean,PCA_std,PCA_max,PCA_min,DBSCAN_Cluster,eeg_mean,eeg_std_dev,eeg_skewness,eeg_kurtosis
0,853520,1.121511,3.615552,7.794637,-5.647541,0.0186,1.434875,3.9607,-4.253749,-0.039071,...,-4.737098,1.017253e-07,9.322651,36.916191,-12.836059,-1,-2.441406e-08,3.686257,0.792493,1.510437


In [125]:
# Start from an empty estimator and restore the saved model from its JSON file.
xgb_clf_loaded = XGBClassifier()
xgb_clf_loaded.load_model('./xgb_model_2.json')

In [126]:
# Predicted class per row; the cells below use predict_proba and never read y_pred.
y_pred = xgb_clf_loaded.predict(combined_df)

In [127]:
# Class probabilities from the loaded classifier, one row per row of combined_df.
predictions = xgb_clf_loaded.predict_proba(combined_df)

# Label the probability columns.
# NOTE(review): this assumes the model's class order is seizure, lpd, gpd,
# lrda, grda, other - confirm against the training label encoding.
df_predictions = pd.DataFrame(
    predictions,
    columns=['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote'],
)

df_predictions.head()

Unnamed: 0,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,0.041207,0.057616,0.233272,0.005922,0.528676,0.133308


In [128]:
# Probability column names of the submission, in output order.
TARGETS = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]

In [129]:
# Pair each row of probabilities with the eeg_id at the same position in `test`;
# eeg_id becomes the first column, followed by the TARGETS columns.
sub = pd.DataFrame(predictions, columns=TARGETS)
sub.insert(0, 'eeg_id', test.eeg_id.values)

In [130]:
# Take the eeg_id list from the sample submission, left-join the predictions
# onto it and write the result as submission.csv.
# NOTE(review): with how="left", an eeg_id absent from `sub` is written with
# empty probability cells - confirm every sample id has a prediction.
sub_df = (
    pd.read_csv('../hms-harmful-brain-activity-classification/sample_submission.csv')[["eeg_id"]]
    .merge(sub, on="eeg_id", how="left")
)
sub_df.to_csv("submission.csv", index=False)
sub_df.head()

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.041207,0.057616,0.233272,0.005922,0.528676,0.133308
