In [1]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from numpy import trapz
from scipy import signal, stats

In [2]:
# Label tables for the train and test splits. Only their `id` column is used
# in this notebook: it supplies the recording ids passed to get_data().
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [3]:
def chunk_eeg_signal(data, chunk_seconds=10, overlap_seconds=5, sampling_rate=250):
    """
    Split a multi-channel recording into fixed-length, overlapping windows.

    Parameters
    ----------
    data : numpy.ndarray
        Array of shape (n_channels, n_samples).
    chunk_seconds : int or float
        Window length in seconds.
    overlap_seconds : int or float
        Overlap between consecutive windows in seconds. Must be shorter than
        ``chunk_seconds`` so the window actually advances.
    sampling_rate : int or float
        Samples per second.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_chunks, n_channels, samples_per_chunk).
        Trailing samples that do not fill a complete window are dropped, and
        ``n_chunks`` is 0 when the recording is shorter than one window.

    Raises
    ------
    ValueError
        If the window is shorter than one sample, or the overlap is not
        smaller than the window.
    """
    # Sample counts must be integers: they are used as array shapes and slice bounds.
    samples_per_chunk = int(round(chunk_seconds * sampling_rate))
    overlap_samples = int(round(overlap_seconds * sampling_rate))
    stride = samples_per_chunk - overlap_samples

    if samples_per_chunk <= 0:
        raise ValueError("chunk_seconds * sampling_rate must be at least one sample")
    if stride <= 0:
        raise ValueError("overlap_seconds must be smaller than chunk_seconds")

    n_channels, n_samples = data.shape

    # Number of complete windows. Clamp at 0 so a too-short recording yields an
    # empty result instead of a negative array dimension.
    n_chunks = max(0, (n_samples - samples_per_chunk) // stride + 1)

    chunks = np.zeros((n_chunks, n_channels, samples_per_chunk))
    for i in range(n_chunks):
        start_idx = i * stride
        chunks[i] = data[:, start_idx:start_idx + samples_per_chunk]

    return chunks

In [4]:
def process_eeg_chunk(chunk_data, sampling_rate=250):
	"""
	Process a single EEG chunk into features.

	Every channel yields the same fixed-position feature vector:

	  0      standard deviation
	  1      zero-crossing rate (sign changes / n_samples)
	  2      line length (sum of absolute first differences)
	  3-7    relative band power: delta, theta, alpha, beta, gamma
	  8      theta/beta relative-power ratio
	  9      alpha/beta relative-power ratio
	  10     95% spectral edge frequency
	  11     kurtosis
	  12     skewness
	  13     Hjorth mobility
	  14     Hjorth complexity
	  15-19  peak frequency within delta, theta, alpha, beta, gamma

	A band that contains no PSD bin (possible for very short signals, where
	the frequency resolution is coarse) keeps 0.0 in its peak-frequency slot,
	so the remaining peaks stay in their documented columns.

	Parameters:
	-----------
	chunk_data : numpy.ndarray
		EEG data of shape (n_channels, n_samples)
	sampling_rate : int
		Sampling rate in Hz

	Returns:
	--------
	numpy.ndarray
		Features array of shape (n_channels, n_features), n_features == 20

	Notes:
	------
	Ratios are plain divisions: a channel with zero variance or zero spectral
	power produces NaN/inf entries rather than raising.
	"""
	# np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever this NumPy provides.
	integrate = getattr(np, 'trapezoid', None) or np.trapz

	# Frequency bands in Hz; dict order fixes the column order of band features.
	bands = {
		'delta': (0.5, 4),
		'theta': (4, 8),
		'alpha': (8, 13),
		'beta': (13, 30),
		'gamma': (30, 45)
	}

	n_channels = chunk_data.shape[0]
	n_features = 10 + 2 * len(bands)  # 10 scalar features + power and peak per band
	features = np.zeros((n_channels, n_features))

	for channel in range(n_channels):
		signal_channel = chunk_data[channel]

		# 1. Power spectral density (Welch, 2-second Hann windows, 50% overlap)
		nperseg = int(min(signal_channel.shape[0], 2 * sampling_rate))
		freqs, psd = signal.welch(signal_channel, sampling_rate, nperseg=nperseg,
					window='hann', noverlap=nperseg // 2)

		# 2. Band powers relative to total power; masks are reused for the peaks
		total_power = integrate(y=psd, x=freqs)
		band_masks = {
			band_name: np.logical_and(freqs >= low, freqs <= high)
			for band_name, (low, high) in bands.items()
		}
		band_powers = {
			band_name: integrate(y=psd[mask], x=freqs[mask]) / total_power
			for band_name, mask in band_masks.items()
		}

		# First/second differences feed line length and the Hjorth parameters
		diff1 = np.diff(signal_channel)
		diff2 = np.diff(diff1)
		var_signal = np.var(signal_channel)
		var_diff1 = np.var(diff1)

		# 3. Time domain features
		row = [
			np.std(signal_channel),
			np.count_nonzero(np.diff(np.signbit(signal_channel))) / len(signal_channel),
			np.sum(np.abs(diff1)),
		]

		# 4. Frequency domain features
		row.extend(band_powers.values())
		row.append(band_powers['theta'] / band_powers['beta'])
		row.append(band_powers['alpha'] / band_powers['beta'])

		# Spectral edge frequency: first bin where cumulative PSD reaches 95%
		cumsum = np.cumsum(psd)
		row.append(freqs[np.where(cumsum >= 0.95 * cumsum[-1])[0][0]])

		# 5. Statistical features
		row.append(stats.kurtosis(signal_channel))
		row.append(stats.skew(signal_channel))

		# 6. Hjorth parameters
		row.append(np.sqrt(var_diff1 / var_signal))
		row.append(np.sqrt(np.var(diff2) * var_signal) / var_diff1)

		# 7. Peak frequency per band; always emit one value per band so an
		# empty band cannot shift the following peaks into the wrong column.
		for mask in band_masks.values():
			if np.any(mask):
				row.append(freqs[mask][np.argmax(psd[mask])])
			else:
				row.append(0.0)

		# Assigning the whole row raises if the feature count ever drifts.
		features[channel] = row

	return features

def process_all_chunks(chunks_data, sampling_rate=250):
	"""
	Run process_eeg_chunk over every chunk and stack the results.

	Parameters:
	-----------
	chunks_data : numpy.ndarray
		EEG chunks of shape (n_chunks, n_channels, n_samples)
	sampling_rate : int
		Sampling rate in Hz (default: 250), forwarded to process_eeg_chunk

	Returns:
	--------
	numpy.ndarray
		Features array of shape (n_chunks, n_channels, 20)
	"""
	n_chunks, n_channels = chunks_data.shape[:2]
	feature_count = 20  # features per channel produced by process_eeg_chunk

	# Pre-allocate the output, then fill one chunk's feature matrix at a time
	feature_stack = np.zeros((n_chunks, n_channels, feature_count))
	for chunk_index, chunk in enumerate(chunks_data):
		feature_stack[chunk_index] = process_eeg_chunk(chunk, sampling_rate)

	return feature_stack

In [5]:
def zero_elim(data, min_zero_run=250):
	"""
	Remove long runs of zeros from a multi-channel recording.

	Runs are detected on channel 0 only; every column belonging to a run of at
	least ``min_zero_run`` consecutive zeros in that channel is removed from
	all channels. Shorter runs are kept.

	Parameters:
	-----------
	data : numpy.ndarray
		Array of shape (n_channels, n_samples)
	min_zero_run : int
		Minimum run length (in samples) that gets removed (default: 250)

	Returns:
	--------
	numpy.ndarray
		``data`` itself when nothing is removed, otherwise a new array with
		the offending columns dropped
	"""
	is_zero = data[0, :] == 0
	if not is_zero.any():
		return data

	# Pad with False on both sides so every run has an explicit start and stop
	# edge, including runs touching the first or last sample.
	padded = np.concatenate(([False], is_zero, [False]))
	edges = np.flatnonzero(padded[1:] != padded[:-1])
	run_starts, run_stops = edges[0::2], edges[1::2]  # stops are exclusive

	long_runs = (run_stops - run_starts) >= min_zero_run
	if not long_runs.any():
		return data

	keep = np.ones(is_zero.size, dtype=bool)
	for start, stop in zip(run_starts[long_runs], run_stops[long_runs]):
		# The whole run is dropped, first sample included, for trailing runs too.
		keep[start:stop] = False

	return data[:, keep]

In [6]:
def get_data(id):
	"""
	Load one recording from 'data_250hz.h5' and strip long zero runs.

	Reads the 52 datasets ``<id>/parcel_0`` .. ``<id>/parcel_51``, squeezes
	each one, stacks them with np.array and passes the result through
	zero_elim.

	Parameters:
	-----------
	id : str
		Group name of the recording inside the HDF5 file

	Returns:
	--------
	numpy.ndarray
		Output of zero_elim; presumably (52, n_samples) if every parcel
		squeezes to a 1-D series of equal length -- TODO confirm
	"""
	with h5py.File('data_250hz.h5', 'r') as h5_file:
		parcels = [
			np.squeeze(h5_file[id + '/parcel_' + str(parcel_no)][:])
			for parcel_no in range(52)
		]
	return zero_elim(np.array(parcels))

In [12]:
# Chunk-level features per training recording:
#   recording id -> array of shape (n_chunks, n_channels, n_features).
# A comprehension keeps the loop variable local, so the builtin `id` is not
# shadowed at notebook scope and no per-iteration temporaries are left behind.
train_data = {
	record_id: process_all_chunks(chunk_eeg_signal(get_data(record_id)))
	for record_id in tqdm(y_train['id'])
}

100%|██████████| 120/120 [27:31<00:00, 13.76s/it]


In [13]:
# Save the dict. np.save stores a non-array object by pickling it, so read it
# back with np.load('train_data.npy', allow_pickle=True).item().
np.save('train_data.npy', train_data)

In [14]:
# Whole-recording features (no chunking): one (n_channels, n_features) matrix
# per training recording, in y_train order.
# The loop variable is deliberately NOT called `train_data`: that name holds
# the chunk-feature dict built earlier, and rebinding it to a raw recording
# would make a re-run of the save cell write the wrong object. The
# comprehension also avoids shadowing the builtin `id`.
train_all_data = [
	process_eeg_chunk(get_data(record_id))
	for record_id in tqdm(y_train['id'])
]

  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [04:32<00:00,  2.27s/it]


In [15]:
# Stack the per-recording feature matrices into a single array; the bare
# `.shape` expression is shown as the cell output.
train_all_data = np.array(train_all_data)
train_all_data.shape

(120, 52, 20)

In [16]:
# Persist the stacked whole-recording training features.
np.save('train_all_data.npy', train_all_data)

In [17]:
# Whole-recording features (no chunking): one (n_channels, n_features) matrix
# per test recording, in y_test order. The comprehension keeps the loop
# variable local so the builtin `id` is not shadowed at notebook scope.
test_all_data = [
	process_eeg_chunk(get_data(record_id))
	for record_id in tqdm(y_test['id'])
]

100%|██████████| 40/40 [01:37<00:00,  2.43s/it]


In [18]:
# Stack the per-recording feature matrices into a single array; the bare
# `.shape` expression is shown as the cell output.
test_all_data = np.array(test_all_data)
test_all_data.shape

(40, 52, 20)

In [19]:
# Save the stacked whole-recording test features. Note the file name:
# the training counterpart is written to 'train_all_data.npy'.
np.save('test_features.npy', test_all_data)

In [10]:
# Chunk-level features per test recording:
#   recording id -> array of shape (n_chunks, n_channels, n_features).
# A comprehension keeps the loop variable local, so the builtin `id` is not
# shadowed at notebook scope and no per-iteration temporaries are left behind.
test_chunk = {
	record_id: process_all_chunks(chunk_eeg_signal(get_data(record_id)))
	for record_id in tqdm(y_test['id'])
}

100%|██████████| 40/40 [08:18<00:00, 12.45s/it]


In [11]:
# Save the chunk-feature dict. np.save stores a non-array object by pickling
# it, so read it back with np.load('test_data.npy', allow_pickle=True).item().
np.save('test_data.npy', test_chunk)