In [1]:
"""
EEG Data Preprocessing Pipeline
Converts raw EEG CSV data to JSON/JSONL format for ML training
"""

import pandas as pd
import numpy as np
import json
import os
from scipy.fft import fft

# ---------------------------------------------------------------------------
# Pipeline configuration
# ---------------------------------------------------------------------------

# Samples per analysis window: 4 seconds of signal at a 250 Hz sampling rate.
WINDOW_SIZE = 4 * 250

# Frequency bands as (low_hz, high_hz). Insertion order matters: it determines
# the order in which per-band features are emitted for each channel.
FREQ_BANDS = {
    'alpha': (8, 12),
    'beta': (12, 30),
    'theta': (4, 8),
    'gamma': (30, 45),
}

# CSV columns holding the EEG channels, in the order features are generated.
CHANNELS = [
    'FC3', 'FCz', 'FC4',
    'C3', 'Cz', 'C4',
    'CP3', 'CPz', 'CP4',
]

def compute_band_power(data, sample_rate=250, bands=None):
    """Calculate the mean spectral power of a signal in each frequency band.

    The signal is transformed with an FFT and, for every band, the squared
    FFT magnitudes of all bins whose frequency lies in ``[low, high]`` are
    averaged. Both band edges are inclusive, so a bin that sits exactly on a
    shared edge contributes to both neighbouring bands.

    Args:
        data: Sequence of samples for one channel.
        sample_rate: Sampling rate of ``data`` in Hz (default 250).
        bands: Optional mapping ``name -> (low_hz, high_hz)``. When omitted,
            the module-level ``FREQ_BANDS`` table is used.

    Returns:
        dict mapping each band name to its mean squared FFT magnitude. A band
        that contains no FFT bin (signal too short for the requested
        resolution) yields NaN, as ``np.mean`` of an empty selection does.
    """
    if bands is None:
        bands = FREQ_BANDS

    n = len(data)
    # Squaring once up front is loop-invariant; selecting bins afterwards
    # gives the same values as squaring each band's selection separately.
    power_spectrum = np.abs(fft(data, n=n)) ** 2
    freqs = np.fft.fftfreq(n, d=1/sample_rate)

    band_powers = {}
    for band, (low, high) in bands.items():
        in_band = (freqs >= low) & (freqs <= high)
        band_powers[band] = np.mean(power_spectrum[in_band])

    return band_powers

def extract_features(window_data):
    """Build the flat feature mapping for one window of EEG samples.

    For every channel listed in ``CHANNELS`` the band powers of that column
    are computed with ``compute_band_power`` and stored under the key
    ``"<channel>_<band>"`` as plain Python floats.

    Args:
        window_data: Table-like object indexable by channel name, where each
            column exposes its samples through ``.values``.

    Returns:
        dict of ``"<channel>_<band>" -> float``, ordered channel by channel
        and, within a channel, in the order the band powers are returned.
    """
    return {
        f"{channel}_{band}": float(power)
        for channel in CHANNELS
        for band, power in compute_band_power(window_data[channel].values).items()
    }

def csv_to_json(csv_path, json_path, jsonl_path):
    """Convert a labelled EEG CSV into chat-style JSON and JSONL training files.

    The CSV is cut into consecutive, non-overlapping windows of
    ``WINDOW_SIZE`` rows. Each window becomes one training example made of a
    system prompt, a user message listing the window's features, and an
    assistant message holding the label. Trailing rows that do not fill a
    complete window are dropped.

    Args:
        csv_path: Path of the input CSV. It must provide a ``label`` column
            whose values convert to ``int``, plus the columns read by
            ``extract_features``.
        json_path: Destination for the full dataset as one indented JSON array.
        jsonl_path: Destination for the same entries, one JSON object per line.

    Raises:
        ValueError: If the CSV has fewer than ``WINDOW_SIZE`` rows.

    Side effects:
        Creates missing parent directories of both output paths, writes both
        files (overwriting existing ones) and prints a short summary.
    """
    # Load CSV data
    df = pd.read_csv(csv_path)

    # Validate data
    if len(df) < WINDOW_SIZE:
        raise ValueError(f"Data too short ({len(df)} samples). Needs at least {WINDOW_SIZE} samples.")

    # Process in non-overlapping windows
    json_data = []
    for start in range(0, len(df) - WINDOW_SIZE + 1, WINDOW_SIZE):
        window = df.iloc[start:start + WINDOW_SIZE]
        # NOTE(review): windows are cut at fixed offsets, so a window can span
        # a label change; only the first sample's label is used.
        label = str(int(window['label'].iloc[0]))

        # Extract features
        features = extract_features(window)
        feature_lines = "\n".join(f"{k}: {v:.6f}" for k, v in features.items())

        # Create JSON entry
        entry = {
            "messages": [
                {
                    "role": "system",
                    "content": "Analyze EEG features and predict the movement intention class (integer 1-5)."
                },
                {
                    "role": "user",
                    "content": "EEG Features:\n" + feature_lines
                },
                {
                    "role": "assistant",
                    "content": label
                }
            ]
        }
        json_data.append(entry)

    # Make sure the output directories exist; otherwise open() fails with
    # FileNotFoundError after all windows have already been processed.
    for out_path in (json_path, jsonl_path):
        parent = os.path.dirname(out_path)
        if parent:  # empty when the file goes into the current directory
            os.makedirs(parent, exist_ok=True)

    # Save JSON
    with open(json_path, 'w') as f:
        json.dump(json_data, f, indent=2)

    # Convert to JSONL
    with open(jsonl_path, 'w') as f:
        for entry in json_data:
            f.write(json.dumps(entry) + '\n')

    print(f"Processed {len(json_data)} windows")
    print(f"JSON saved to: {json_path}")
    print(f"JSONL saved to: {jsonl_path}")

if __name__ == "__main__":
    # Input and output locations, all resolved relative to BASE_DIR
    # (an empty BASE_DIR means the current working directory).
    BASE_DIR = ""
    CSV_PATH = os.path.join(BASE_DIR, 'concat_files.csv')
    JSON_PATH = os.path.join(BASE_DIR, 'json/train.json')
    JSONL_PATH = os.path.join(BASE_DIR, 'jsonl/train.jsonl')

    # Convert the concatenated recording into both training formats.
    csv_to_json(
        csv_path=CSV_PATH,
        json_path=JSON_PATH,
        jsonl_path=JSONL_PATH,
    )

Processed 525 windows
JSON saved to: json/train.json
JSONL saved to: jsonl/train.jsonl
