# Feature extraction from nanopore event traces

This notebook loads `combined_peaks_data.json` files produced by the screening pipeline and computes the five baseline features (ΔI, standard deviation, skewness, kurtosis, and dwell time `t_off`) described in Wang *et al.* (2023). The resulting feature table will be reused for quick model prototyping (e.g. an SVM classifier).



## Workflow overview

1. Configure the paths to the processed data folders that contain `combined_peaks_data.json`.
2. Load every event trace and estimate the baseline and blockade regions.
3. Compute ΔI, SD, skewness, kurtosis, and `t_off` for each event.
4. Export the aggregated feature table for downstream modeling.



In [None]:
from pathlib import Path
import json
from typing import Dict, Iterable, List, Optional

import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew

# Keep wide feature tables readable when they are rendered in the notebook.
pd.options.display.max_columns = 20
pd.options.display.width = 120



## Configure data sources

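Set `DATA_ROOT` to the directory that contains one or more subfolders produced by `combine_peaks_data.py`. Each subfolder should contain a `combined_peaks_data.json`; the directory tree is searched recursively, and `DATA_ROOT` may also point directly at a single `combined_peaks_data.json` file. If you prefer to point to specific files, populate `EXPLICIT_FILES` with their paths instead; a non-empty `EXPLICIT_FILES` takes precedence over `DATA_ROOT`.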



In [None]:
# Path to the directory that contains processed runs (update this to your environment)
DATA_ROOT = Path('/path/to/processed/data/root')  # <-- change me

# Optional: list explicit combined JSON files if they live in different roots
EXPLICIT_FILES: List[str] = []  # e.g. ['/data/run1/combined_peaks_data.json']

# Feature tables are written here; the folder is created eagerly so the export
# cell cannot fail on a missing directory.
OUTPUT_DIR = Path('feature_tables')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Expand "~" *before* touching the filesystem; otherwise a home-relative
# DATA_ROOT fails the existence check and is silently ignored.
data_root = DATA_ROOT.expanduser()

if EXPLICIT_FILES:
    # Explicitly listed files take precedence over DATA_ROOT.
    combined_files = [Path(p).expanduser().resolve() for p in EXPLICIT_FILES]
elif data_root.is_file() and data_root.name == 'combined_peaks_data.json':
    # DATA_ROOT points directly at a single combined file.
    combined_files = [data_root.resolve()]
elif data_root.is_dir():
    # Search the whole tree; sorting keeps the processing order reproducible.
    combined_files = sorted(path.resolve() for path in data_root.rglob('combined_peaks_data.json'))
else:
    combined_files = []

if not combined_files:
    # Also covers an existing DATA_ROOT that simply contains no combined files.
    print('Update DATA_ROOT or EXPLICIT_FILES with the location of combined_peaks_data.json files.')

combined_files



## Helper utilities

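The functions below load events, identify the blockade portion of each trace, and compute the five requested features. Blockade samples are located on the normalized trace: the baseline level is the median of the highest 20% of samples, the blockade level is the median of the lowest 20%, and every sample lying at least 25% of the way from the baseline toward the blockade level is assigned to the blockade. The baseline current is then estimated from the non-normalized samples outside the blockade, whereas the blockade current is taken from the non-normalized samples inside it.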



In [None]:
def load_events(json_path: Path) -> List[Dict]:
    """Load the list of events from a combined peaks JSON file.

    Args:
        json_path: Location of the combined peaks JSON file.

    Returns:
        The decoded events. Every event that lacks a ``'source_file'`` key
        gets one set to the stem of ``json_path`` (the events are modified
        in place).
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which differs between operating systems and locales.
    with open(json_path, 'r', encoding='utf-8') as fh:
        events = json.load(fh)
    for event in events:
        # Only fills the gap; an existing 'source_file' value is kept.
        event.setdefault('source_file', json_path.stem)
    return events


def _safe_median(values: np.ndarray) -> float:
    """Return the median of the finite entries of *values*, or NaN if there are none."""
    finite = values[np.isfinite(values)]
    return float(np.median(finite)) if finite.size else float('nan')


def _event_masks(
    norm_signal: np.ndarray,
    fraction: float = 0.25,
    tail_fraction: float = 0.2,
) -> Dict[str, object]:
    """Return boolean masks that delineate blockade vs baseline samples.

    Args:
        norm_signal: One-dimensional trace; array-likes are converted to a
            float array. Lower values are treated as the blockade.
        fraction: Share of the baseline-to-blockade distance a sample must
            drop below the baseline level to count as a blockade sample.
        tail_fraction: Share of the sorted finite samples used for each of
            the two level estimates (highest values -> baseline, lowest
            values -> blockade). At least one sample is always used.

    Returns:
        A dict with:
          * ``'event_mask'``: bool array, True for blockade samples.
          * ``'baseline_mask'``: bool array, True for baseline samples.
          * ``'baseline_level'``: median of the highest samples (float).
          * ``'blockade_level'``: median of the lowest samples, or the
            minimum when both medians coincide (float).
          * ``'delta_norm'``: ``baseline_level - blockade_level`` (float).
        Non-finite samples belong to neither mask. If there is no finite
        sample, both masks are all-False and the three levels are NaN.
    """
    norm_signal = np.asarray(norm_signal, dtype=float)
    finite = np.isfinite(norm_signal)
    sorted_norm = np.sort(norm_signal[finite])
    if sorted_norm.size == 0:
        empty = np.zeros_like(norm_signal, dtype=bool)
        return {
            'event_mask': empty,
            'baseline_mask': empty.copy(),
            'baseline_level': float('nan'),
            'blockade_level': float('nan'),
            'delta_norm': float('nan'),
        }

    top_k = max(int(np.ceil(sorted_norm.size * tail_fraction)), 1)
    baseline_level = float(np.median(sorted_norm[-top_k:]))
    blockade_level = float(np.median(sorted_norm[:top_k]))
    delta_norm = baseline_level - blockade_level

    if delta_norm <= 0:
        # Both tail medians coincide (e.g. a very short blockade): measure the
        # drop against the deepest sample instead, and keep the reported
        # blockade level consistent with the reported delta.
        blockade_level = float(sorted_norm[0])
        delta_norm = baseline_level - blockade_level

    threshold = baseline_level - fraction * delta_norm if delta_norm > 0 else baseline_level - 1e-3
    event_mask = finite & (norm_signal <= threshold)

    if not np.any(event_mask):
        margin = 0.1 * max(abs(delta_norm), abs(baseline_level) * 0.05)
        # A zero margin would turn "<=" into "everything at the baseline is a
        # blockade", which would classify a flat all-zero trace differently
        # from any other flat trace. Skip the relaxed threshold in that case.
        if margin > 0:
            event_mask = finite & (norm_signal <= baseline_level - margin)

    if not np.any(event_mask):
        # Last resort: anything strictly below the baseline level.
        event_mask = finite & (norm_signal < baseline_level)

    # NaN/inf samples carry no level information, so keep them out of both masks.
    baseline_mask = finite & ~event_mask

    return {
        'event_mask': event_mask,
        'baseline_mask': baseline_mask,
        'baseline_level': baseline_level,
        'blockade_level': blockade_level,
        'delta_norm': delta_norm,
    }


def _safe_std(values: np.ndarray) -> float:
    """Return the sample standard deviation (ddof=1), or NaN for fewer than two values."""
    has_enough = values.size >= 2
    return float(np.std(values, ddof=1)) if has_enough else float('nan')


def _safe_skew(values: np.ndarray) -> float:
    """Return the bias-corrected sample skewness, or NaN for fewer than three values."""
    has_enough = values.size >= 3
    return float(skew(values, bias=False)) if has_enough else float('nan')


def _safe_kurtosis(values: np.ndarray) -> float:
    """Return the bias-corrected excess (Fisher) kurtosis, or NaN for fewer than four values."""
    has_enough = values.size >= 4
    return float(kurtosis(values, fisher=True, bias=False)) if has_enough else float('nan')


def _coerce_float(value: object) -> float:
    """Convert *value* to float, mapping missing or unparseable values to NaN."""
    if value is None:
        return float('nan')
    try:
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


def compute_event_features(event: Dict, *, sample_id: Optional[str] = None, source_path: Optional[Path] = None) -> Dict:
    """Compute the feature record (delta_I, sd, skew, kurtosis, t_off) of one event.

    The blockade is located on the normalized trace (``event['raw_signal']``,
    or ``event['norm_signal']`` when the former is absent or empty); the
    features themselves are computed on ``event['raw_signal_not_norm']``.

    Args:
        event: Event dictionary. Must provide ``'raw_signal_not_norm'`` and one
            of ``'raw_signal'`` / ``'norm_signal'``. Optional keys read here:
            ``'dt'``, ``'t_start'``, ``'t_end'``, ``'source_file'``,
            ``'peak_index'``, ``'snr_db'``.
        sample_id: Identifier stored in the record; defaults to the name of the
            parent folder of ``source_path`` when that is given.
        source_path: Combined JSON file the event came from (optional).

    Returns:
        A flat dict with the five features plus bookkeeping columns
        (``baseline_current``, ``blocked_current``, ``n_event_samples``,
        ``total_samples``, and ``combined_file`` / ``snr_db`` when available).
        ``delta_I`` is the baseline median minus the blockade median; ``sd``,
        ``skew`` and ``kurtosis`` describe the per-sample drop below the
        baseline inside the blockade; ``t_off`` is the span from the first to
        the last blockade sample multiplied by ``dt``.

    Raises:
        KeyError: If no normalized signal or no ``'raw_signal_not_norm'`` entry
            is present.
    """
    # Mirrors ``raw_signal or norm_signal`` for lists, but also works when the
    # signals are numpy arrays (whose truth value is ambiguous).
    norm_source = event.get('raw_signal')
    if norm_source is None or np.size(norm_source) == 0:
        norm_source = event.get('norm_signal')
    if norm_source is None:
        raise KeyError('Event is missing a normalized signal ("raw_signal" or "norm_signal").')

    norm_signal = np.asarray(norm_source, dtype=float)
    abs_signal = np.asarray(event['raw_signal_not_norm'], dtype=float)

    # Align both traces on their common length, then drop samples that are
    # non-finite in either one so the masks index both arrays identically.
    n_samples = min(norm_signal.size, abs_signal.size)
    norm_signal = norm_signal[:n_samples]
    abs_signal = abs_signal[:n_samples]

    valid = np.isfinite(norm_signal) & np.isfinite(abs_signal)
    norm_signal = norm_signal[valid]
    abs_signal = abs_signal[valid]

    masks = _event_masks(norm_signal)
    event_mask = masks['event_mask']
    baseline_mask = masks['baseline_mask']

    if np.any(baseline_mask):
        baseline_current = _safe_median(abs_signal[baseline_mask])
    else:
        baseline_current = _safe_median(abs_signal)

    # Without any detected blockade sample the whole trace is used instead.
    blockade_samples = abs_signal[event_mask] if np.any(event_mask) else abs_signal
    blockade_current = _safe_median(blockade_samples)
    drop_series = baseline_current - blockade_samples

    delta_I = float(baseline_current - blockade_current)
    sd_drop = _safe_std(drop_series)
    skew_drop = _safe_skew(drop_series)
    kurt_drop = _safe_kurtosis(drop_series)

    # JSON ``null`` (or any unparseable value) must not discard the event:
    # treat it as "unknown" and fall back to the recorded event boundaries.
    dt = _coerce_float(event.get('dt'))
    if not np.isfinite(dt) or dt <= 0:
        t_off = _coerce_float(event.get('t_end')) - _coerce_float(event.get('t_start'))
    elif np.any(event_mask):
        event_indices = np.where(event_mask)[0]
        t_off = float((event_indices[-1] - event_indices[0] + 1) * dt)
    else:
        t_off = float(norm_signal.size * dt)

    feature_record = {
        'sample_id': sample_id if sample_id is not None else (source_path.parent.name if source_path else None),
        'source_file': event.get('source_file', source_path.stem if source_path else None),
        'peak_index': event.get('peak_index'),
        'delta_I': delta_I,
        'sd': sd_drop,
        'skew': skew_drop,
        'kurtosis': kurt_drop,
        't_off': t_off,
        'baseline_current': baseline_current,
        'blocked_current': blockade_current,
        'n_event_samples': int(np.sum(event_mask)),
        'total_samples': int(norm_signal.size),
    }

    if source_path is not None:
        feature_record['combined_file'] = str(source_path)

    if 'snr_db' in event:
        feature_record['snr_db'] = event['snr_db']

    return feature_record



## Extract features from every combined file

Run the cell below after configuring the paths. A feature dictionary is created for every event and aggregated into a single DataFrame.



In [None]:
feature_records: List[Dict] = []

for combined_path in combined_files:
    try:
        events = load_events(combined_path)
    except (OSError, ValueError, TypeError) as exc:
        # One unreadable or malformed file (json.JSONDecodeError is a
        # ValueError) should not abort the extraction of all other files.
        print(f'Failed to load {combined_path}: {exc}')
        continue

    sample_id = combined_path.parent.name
    for event_number, event in enumerate(events):
        try:
            record = compute_event_features(event, sample_id=sample_id, source_path=combined_path)
        except Exception as exc:
            # Deliberately broad: this is a best-effort batch run, so a bad
            # event is reported (with its position in the file) and skipped.
            print(f'Failed to process event {event_number} from {combined_path}: {exc}')
        else:
            feature_records.append(record)

features_df = pd.DataFrame(feature_records)
print(f'Loaded {len(feature_records)} events from {len(combined_files)} combined files.')
features_df.head()



## Inspect summary statistics

The table below provides a quick sanity check for the magnitude of each feature. Adjust the baseline detection heuristics above if the distributions look suspicious.



In [None]:
# The five features used for modeling, in the order they are reported.
feature_columns = ['delta_I', 'sd', 'skew', 'kurtosis', 't_off']

if features_df.empty:
    print('Feature table is empty. Verify DATA_ROOT/EXPLICIT_FILES before continuing.')
else:
    # Count/mean/std/quantiles per feature as a quick plausibility check.
    summary_table = features_df[feature_columns].describe()
    display(summary_table)



## Persist the feature table

Export the features to CSV so that other notebooks (e.g. model fitting) can consume them.



In [None]:
# Destination of the exported feature table.
output_path = OUTPUT_DIR / 'event_features.csv'

if features_df.empty:
    print('Skipped saving because the feature table is empty.')
else:
    # The row index carries no information, so it is left out of the CSV.
    features_df.to_csv(output_path, index=False)
    print(f'Saved {len(features_df)} feature rows to {output_path}')



## Optional: map samples to class labels

Create a dictionary that maps folder names (or `sample_id`) to the DNA classes you want to predict. Uncomment and adapt the cell below when preparing the modeling notebook.



In [None]:
# SAMPLE_LABEL_MAP = {
#     '500mV_100bp_1ngmkl_10MHz_boost_240830164444': '100bp',
#     '500mV_200bp_1ngmkl_10MHz_boost_240830165006': '200bp',
# }
#
# if not features_df.empty:
#     features_df['label'] = features_df['sample_id'].map(SAMPLE_LABEL_MAP)
#     display(features_df[['sample_id', 'label']].drop_duplicates())

