Based on the DANDI example notebooks at
https://github.com/dandi/example-notebooks/tree/master/tutorials/neurodatarehack_2024

In [1]:
import json
import numpy as np
from dandi.dandiapi import DandiAPIClient
from tqdm.notebook import tqdm
from isodate import parse_duration, Duration
from datetime import datetime
from warnings import simplefilter
simplefilter("ignore")  # Suppress namespace warnings from reading older NWB files

from nwbinspector.tools import get_s3_urls_and_dandi_paths
from pynwb import NWBHDF5IO
import remfile
import h5py

import lindi, pynwb

from dandi.dandiapi import DandiAPIClient



In [2]:
# Create the DANDI API client used for the archive-wide queries below.
client = DandiAPIClient()

# Collect every dandiset the client returns into a list so it can be
# indexed and iterated more than once.
dandisets = [*client.get_dandisets()]

# NWB dandisets

In [3]:
# Keep only the dandisets whose asset summary lists the NWB data standard.
nwb_dandisets = []

for dandiset in tqdm(dandisets):
    raw_metadata = dandiset.get_raw_metadata()
    declared_standards = raw_metadata['assetsSummary'].get('dataStandard', [])

    # Stop at the first matching entry; one hit is enough to keep the dandiset.
    for data_standard in declared_standards:
        if data_standard['identifier'] == "RRID:SCR_015242":  # this is the RRID for NWB
            nwb_dandisets.append(dandiset)
            break

print(f"There are currently {len(nwb_dandisets)} NWB datasets on DANDI!")

  0%|          | 0/697 [00:00<?, ?it/s]

There are currently 420 NWB datasets on DANDI!


# Dandiset behavior types


In [4]:
# Fetch the raw metadata of the third entry in the full dandiset listing,
# used below as an example record to inspect.
raw_metadata = dandisets[2].get_raw_metadata()

In [5]:
# Display the example metadata record (a nested dict of strings, lists and dicts).
raw_metadata

{'id': 'DANDI:000005/0.220126.1853',
 'doi': '10.48324/dandi.000005/0.220126.1853',
 'url': 'https://dandiarchive.org/dandiset/000005/0.220126.1853',
 'name': 'Electrophysiology data from thalamic and cortical neurons during somatosensation',
 'about': [{'name': 'dorsal plus ventral thalamus',
   'schemaKey': 'Anatomy',
   'identifier': 'UBERON:0001897'}],
 'access': [{'status': 'dandi:OpenAccess',
   'schemaKey': 'AccessRequirements',
   'contactPoint': {'schemaKey': 'ContactPoint'}}],
 'license': ['spdx:CC-BY-4.0'],
 'version': '0.220126.1853',
 '@context': 'https://raw.githubusercontent.com/dandi/schema/master/releases/0.6.0/context.json',
 'citation': 'Yu, Jianing; Gutnisky, Diego A; Hires, S Andrew; Svoboda, Karel (2022) Electrophysiology data from thalamic and cortical neurons during somatosensation (Version 0.220126.1853) [Data set]. DANDI archive. https://doi.org/10.48324/dandi.000005/0.220126.1853',
 'keywords': [],
 'protocol': [],
 'schemaKey': 'Dandiset',
 'identifier': 'DA

In [6]:

def contains_behavior(data):
    """Report whether any string nested inside ``data`` mentions "behavior".

    Dict values and list items are searched recursively, and each string is
    checked with a case-insensitive substring test. Dict keys themselves are
    not inspected, and every other type (numbers, ``None``, tuples, ...)
    never matches.

    Returns:
        bool: ``True`` as soon as one matching string is found, else ``False``.
    """
    if isinstance(data, str):
        return 'behavior' in data.lower()

    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Anything that is not a dict, list or str cannot contain a match.
        return False

    for child in children:
        if contains_behavior(child):
            return True
    return False

def find_behavior_keys(data, parent_key=""):
    """Return the dotted key path of every string in ``data`` mentioning "behavior".

    The structure is walked recursively. Each dict key is appended to the
    path with a ``.`` separator; list items do not add a path component, so
    all elements of a list are reported under the list's own key. A string
    matches when it contains "behavior" (case-insensitive).

    Args:
        data: A nested structure of dicts, lists and strings; other value
            types are ignored.
        parent_key: Path of ``data`` within the enclosing structure. Empty
            for the top-level call.

    Returns:
        list: One path per matching string, in traversal order. The same
        path appears more than once when several strings under it match.
    """
    keys_with_behavior = []

    if isinstance(data, dict):
        for key, value in data.items():
            full_key = f"{parent_key}.{key}" if parent_key else key
            keys_with_behavior.extend(find_behavior_keys(value, full_key))
    elif isinstance(data, list):
        # List items intentionally reuse the list's own path (no "[index]"
        # suffix) so hits from different positions collapse onto one key.
        for item in data:
            keys_with_behavior.extend(find_behavior_keys(item, parent_key))
    elif isinstance(data, str):
        if 'behavior' in data.lower():
            keys_with_behavior.append(parent_key)

    return keys_with_behavior


In [7]:
# Broad filter: NWB dandisets with "behavior" anywhere in their metadata strings.
brbe_nwb_dandisets = []

for dset in tqdm(nwb_dandisets):
    raw_metadata = dset.get_raw_metadata()

    # Skip dandisets whose metadata never mentions behavior.
    if not contains_behavior(raw_metadata):
        continue

    behavior_keys = find_behavior_keys(raw_metadata)
    brbe_nwb_dandisets.append(dset)

print(len(brbe_nwb_dandisets))

  0%|          | 0/420 [00:00<?, ?it/s]

178


In [8]:
# Key paths of the "behavior" mentions in whatever metadata record the
# preceding loop left in raw_metadata (the last dandiset it visited).
find_behavior_keys(raw_metadata)

['assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name']

In [9]:

# Pool the behavior key paths from every NWB dandiset into one flat list
# (duplicates are kept).
behavior_keys_list = []
for dset in tqdm(nwb_dandisets):
    raw_metadata = dset.get_raw_metadata()
    behavior_keys = find_behavior_keys(raw_metadata)
    behavior_keys_list += behavior_keys


  0%|          | 0/420 [00:00<?, ?it/s]

In [10]:
# Display the pooled key paths; the same path repeats once per matching string.
behavior_keys_list

['name',
 'citation',
 'description',
 'assetsSummary.approach.name',
 'assetsSummary.measurementTechnique.name',
 'relatedResource.name',
 'contributor.affiliation.name',
 'contributor.affiliation.name',
 'contributor.name',
 'description',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approach.name',
 'assetsSummary.variableMeasured',
 'assetsSummary.measurementTechnique.name',
 'assetsSummary.approa

In [11]:

# Reduce the pooled key paths to a sorted array of distinct values.
unique_behavior_keys = np.unique(behavior_keys_list)

print("Unique values in behavior_keys_list:")
print(unique_behavior_keys)

Unique values in behavior_keys_list:
['about.name' 'acknowledgement' 'assetsSummary.approach.name'
 'assetsSummary.measurementTechnique.name'
 'assetsSummary.variableMeasured' 'citation'
 'contributor.affiliation.name' 'contributor.name' 'description'
 'keywords' 'name' 'relatedResource.identifier' 'relatedResource.name'
 'relatedResource.repository' 'relatedResource.url' 'studyTarget'
 'wasGeneratedBy.description']


In [12]:
# Narrow filter: NWB dandisets whose asset summary mentions "behavior" in an
# approach name, a measurement-technique name, or a measured variable.
beh_nwb_dandisets = []

for dset in tqdm(nwb_dandisets):
    raw_metadata = dset.get_raw_metadata()
    assets_summary = raw_metadata['assetsSummary']

    approaches = assets_summary.get('approach', [])
    measurement_techniques = assets_summary.get('measurementTechnique', [])
    variables_measured = assets_summary.get('variableMeasured', [])

    # The three checks are chained with `or`, so later ones only run when
    # the earlier ones found nothing.
    mentions_behavior = (
        any('behavior' in a.get('name', '').lower() for a in approaches)
        or any('behavior' in m.get('name', '').lower() for m in measurement_techniques)
        or any('behavior' in str(v).lower() for v in variables_measured)
    )
    if mentions_behavior:
        beh_nwb_dandisets.append(dset)

print(len(beh_nwb_dandisets))

  0%|          | 0/420 [00:00<?, ?it/s]

122


In [13]:
# Among the behavior dandisets, keep those that also list an
# electrophysiological approach in their asset summary.
ephy_beh_nwb_dandisets = []

for dset in tqdm(beh_nwb_dandisets):
    raw_metadata = dset.get_raw_metadata()
    approaches = raw_metadata['assetsSummary'].get('approach', [])
    # Lower-case the approach name before matching so this filter is
    # case-insensitive, like the 'behavior' filter that built beh_nwb_dandisets.
    if any('electrophysiological' in a.get('name', '').lower() for a in approaches):
        ephy_beh_nwb_dandisets.append(dset)

print(len(ephy_beh_nwb_dandisets))

  0%|          | 0/122 [00:00<?, ?it/s]

64


In [14]:
# Behavior dandisets that did not pass the electrophysiology filter.
# beh_nwb_dandisets is scanned in its own order (instead of taking
# list(set(a) - set(b)), whose order is arbitrary) so the listing comes out
# the same on every run; dict.fromkeys keeps the de-duplication a set gave.
ephys_lookup = set(ephy_beh_nwb_dandisets)
beh_no_ephys_dandisets = [
    candidate
    for candidate in dict.fromkeys(beh_nwb_dandisets)
    if candidate not in ephys_lookup
]

print("Items in beh_nwb_dandisets but not in ephy_beh_nwb_dandisets:")
for item in beh_no_ephys_dandisets:
    print(item)

Items in beh_nwb_dandisets but not in ephy_beh_nwb_dandisets:
DANDI:000037/0.240209.1623
DANDI:000703/draft
DANDI:001211/draft
DANDI:000727/0.240106.0043
DANDI:000704/draft
DANDI:000728/0.240827.1809
DANDI:000705/draft
DANDI:000016/draft
DANDI:001097/0.240814.1849
DANDI:000706/draft
DANDI:001453/0.250518.1950
DANDI:000707/draft
DANDI:000206/0.220103.2119
DANDI:000888/0.241014.2127
DANDI:001361/0.250609.2249
DANDI:000708/draft
DANDI:001131/0.240826.1647
DANDI:000889/0.241014.2127
DANDI:000535/0.230524.0416
DANDI:000579/0.230728.1727
DANDI:000971/0.240802.2004
DANDI:000250/draft
DANDI:001170/draft
DANDI:000559/0.240502.0456
DANDI:000212/draft
DANDI:000402/0.230307.2132
DANDI:000951/0.240418.2218
DANDI:000483/0.230421.2321
DANDI:000054/0.210819.1547
DANDI:000776/0.241009.1509
DANDI:000540/0.230515.0530
DANDI:000485/0.241014.2127
DANDI:000036/0.230515.1917
DANDI:001176/draft
DANDI:000697/draft
DANDI:000486/0.241014.2127
DANDI:001256/0.241120.2150
DANDI:000235/0.230316.1600
DANDI:000698/dra

# Read NWB via LINDI

In [15]:
# Take the second behavior dandiset. Its metadata 'id' has the form
# "DANDI:<number>/<version>"; keep only the bare number.
metadata_id = beh_nwb_dandisets[1].get_raw_metadata()['id']
dandiset_id = metadata_id.partition('/')[0].rpartition(':')[2]

s3_urls = get_s3_urls_and_dandi_paths(dandiset_id=dandiset_id)

# Use the first entry; the values are presumably the in-dandiset asset paths,
# since one is passed to get_asset_by_path below. TODO confirm.
filepath = list(s3_urls.values())[0]

# NOTE(review): this rebinds the notebook-level name `client` to a new client
# that is closed when the `with` block exits.
with DandiAPIClient() as client:
    remote_dandiset = client.get_dandiset(dandiset_id)
    asset = remote_dandiset.get_asset_by_path(filepath)
    s3_url = asset.get_content_url(follow_redirects=1, strip_query=True)

# NOTE(review): s3_url is resolved above but not used here; the file is opened
# from asset.download_url instead. Confirm which URL is intended.
f = lindi.LindiH5pyFile.from_hdf5_file(asset.download_url)
nwb = pynwb.NWBHDF5IO(file=f, mode='r').read()

In [16]:
# Show which dandiset the file above was read from.
dandiset_id

'000006'

In [17]:
# List the objects held by the loaded NWB file, keyed by object id.
nwb.objects

{'2cd1d438-bbdf-460f-8bfd-103466c3d0fc': root pynwb.file.NWBFile at 0x4838641344
 Fields:
   acquisition: {
     lick_times <class 'pynwb.behavior.BehavioralEvents'>
   }
   devices: {
     H-129 <class 'pynwb.device.Device'>
   }
   electrode_groups: {
     H-129: 64 <class 'pynwb.ecephys.ElectrodeGroup'>
   }
   electrodes: electrodes <class 'pynwb.ecephys.ElectrodesTable'>
   experiment_description: Extracellular electrophysiology recordings performed on mouse anterior lateral motor cortex (ALM) in delay response task. Neural activity from two neuron populations, pyramidal track upper and lower, were characterized, in relation to movement execution.
   experimenter: ['Mike Economo']
   file_create_date: [datetime.datetime(2019, 10, 7, 17, 40, 36, 567483, tzinfo=tzoffset(None, -18000))]
   identifier: anm369962_2017-03-09_0
   institution: Janelia Research Campus
   intervals: {
     trials <class 'pynwb.epoch.TimeIntervals'>
   }
   keywords: <LindiH5pyDataset: /general/keywords>
  