based on notebooks in  
https://github.com/dandi/example-notebooks/tree/master/tutorials/neurodatarehack_2024 

In [1]:
import json
import numpy as np
from dandi.dandiapi import DandiAPIClient
from tqdm.notebook import tqdm
from isodate import parse_duration, Duration
from datetime import datetime
from warnings import simplefilter
simplefilter("ignore")  # Suppress namespace warnings from reading older NWB files

from nwbinspector.tools import get_s3_urls_and_dandi_paths
from pynwb import NWBHDF5IO
import remfile
import h5py

from collections import Counter

import lindi, pynwb

from dandi.dandiapi import DandiAPIClient
import pandas as pd
import pickle

In [2]:
client = DandiAPIClient()
dandisets = list(client.get_dandisets())

# nwb dandisets
Find nwb dandisets

In [3]:
nwb_dandisets = []

for dandiset in tqdm(dandisets):
    raw_metadata = dandiset.get_raw_metadata()

    if any(
        data_standard['identifier'] == "RRID:SCR_015242"  # this is the RRID for NWB
        for data_standard in raw_metadata['assetsSummary'].get('dataStandard', [])
    ):
        nwb_dandisets.append(dandiset)
print(f"There are currently {len(nwb_dandisets)} NWB datasets on DANDI!")

  0%|          | 0/700 [00:00<?, ?it/s]

There are currently 426 NWB datasets on DANDI!


# dandiset behavior types 
Find nwb dandisets with behavior assets

In [4]:

def contains_behavior(data):
    if isinstance(data, dict):
        return any(contains_behavior(v) for v in data.values())
    elif isinstance(data, list):
        return any(contains_behavior(v) for v in data)
    elif isinstance(data, str):
        return 'behavior' in data.lower()
    return False

def find_behavior_keys(data, parent_key=""):
    keys_with_behavior = []

    if isinstance(data, dict):
        for key, value in data.items():
            full_key = f"{parent_key}.{key}" if parent_key else key
            keys_with_behavior.extend(find_behavior_keys(value, full_key))
    elif isinstance(data, list):
        for index, item in enumerate(data):
            full_key = f"{parent_key}[{index}]"
            # keys_with_behavior.extend(find_behavior_keys(item, full_key))
            keys_with_behavior.extend(find_behavior_keys(item, parent_key))
    elif isinstance(data, str):
        if 'behavior' in data.lower():
            keys_with_behavior.append(parent_key)

    return keys_with_behavior


In [5]:
# contains a behavior key

behavior_keys_list = []
behavior_keys_dandisets = []
for dset in tqdm(nwb_dandisets):
    raw_metadata = dset.get_raw_metadata()

    behavior_keys = find_behavior_keys(raw_metadata)
    behavior_keys_list.extend(behavior_keys)
    
    if behavior_keys: 
        approaches = raw_metadata['assetsSummary'].get('approach', [])
        if (
            any('electrophysiological' in a.get('name', '') for a in approaches)
        ):
            exist_ephys = True
        else: 
            exist_ephys = False
        if any('assetssummary' in x.lower() for x in behavior_keys):
            exist_asset = True 
        else:
            exist_asset = False  
        behavior_keys_dandisets.append({
            "dandiset_id": dset.identifier,
            "dandiset": dset,
            "behavior_keys": behavior_keys,
            "asset": exist_asset,
            "ephys": exist_ephys
        })

behavior_keys_dandisets = pd.DataFrame(behavior_keys_dandisets)
# count occurrences of each key
behavior_keys_counter = Counter(behavior_keys_list)
behavior_keys_counter


  0%|          | 0/426 [00:00<?, ?it/s]

Counter({'assetsSummary.approach.name': 123,
         'assetsSummary.measurementTechnique.name': 123,
         'description': 97,
         'assetsSummary.variableMeasured': 92,
         'name': 30,
         'citation': 30,
         'relatedResource.name': 19,
         'keywords': 15,
         'relatedResource.url': 11,
         'studyTarget': 8,
         'contributor.name': 4,
         'about.name': 4,
         'relatedResource.repository': 3,
         'contributor.affiliation.name': 2,
         'acknowledgement': 2,
         'relatedResource.identifier': 1,
         'wasGeneratedBy.description': 1})

In [6]:
behavior_keys_dandisets.head(10)

Unnamed: 0,dandiset_id,dandiset,behavior_keys,asset,ephys
0,3,DANDI:000003/0.250624.0409,"[name, citation, description, assetsSummary.ap...",True,True
1,4,DANDI:000004/0.220126.1852,"[contributor.affiliation.name, contributor.aff...",False,True
2,6,DANDI:000006/0.220126.1855,"[assetsSummary.approach.name, assetsSummary.va...",True,True
3,9,DANDI:000009/0.220126.1903,"[assetsSummary.approach.name, assetsSummary.va...",True,True
4,10,DANDI:000010/0.220126.1905,"[assetsSummary.approach.name, assetsSummary.va...",True,True
5,11,DANDI:000011/0.220126.1907,"[assetsSummary.approach.name, assetsSummary.va...",True,True
6,13,DANDI:000013/0.220126.2143,"[assetsSummary.approach.name, assetsSummary.va...",True,True
7,15,DANDI:000015/0.220126.1914,"[assetsSummary.approach.name, assetsSummary.va...",True,False
8,16,DANDI:000016/draft,"[assetsSummary.approach.name, assetsSummary.va...",True,False
9,17,DANDI:000017/0.240329.1926,"[assetsSummary.approach.name, assetsSummary.va...",True,True


In [7]:
behavior_assets_dandisets = behavior_keys_dandisets[(behavior_keys_dandisets['asset']==True) & 
                                                    (behavior_keys_dandisets['ephys']==True)
                                                    ]
behavior_assets_dandisets.head(10)

Unnamed: 0,dandiset_id,dandiset,behavior_keys,asset,ephys
0,3,DANDI:000003/0.250624.0409,"[name, citation, description, assetsSummary.ap...",True,True
2,6,DANDI:000006/0.220126.1855,"[assetsSummary.approach.name, assetsSummary.va...",True,True
3,9,DANDI:000009/0.220126.1903,"[assetsSummary.approach.name, assetsSummary.va...",True,True
4,10,DANDI:000010/0.220126.1905,"[assetsSummary.approach.name, assetsSummary.va...",True,True
5,11,DANDI:000011/0.220126.1907,"[assetsSummary.approach.name, assetsSummary.va...",True,True
6,13,DANDI:000013/0.220126.2143,"[assetsSummary.approach.name, assetsSummary.va...",True,True
9,17,DANDI:000017/0.240329.1926,"[assetsSummary.approach.name, assetsSummary.va...",True,True
10,29,DANDI:000029/0.231017.2004,"[assetsSummary.approach.name, assetsSummary.va...",True,True
13,39,DANDI:000039/0.230223.1216,"[assetsSummary.approach.name, assetsSummary.va...",True,True
15,45,DANDI:000045/0.211209.1413,"[name, citation, assetsSummary.approach.name, ...",True,True


# behavior asset temporal resolution

Get behavior and temporal info from nwb dandisets with behavior assets.  

steps:
- dandiset name  
- get filepaths (for now [0])  
- get file, file.download_url (api dandi path)  
- stream nwb file with lindi
- check if pynwb.behavior class is empty, which  
- calculate time resolution
- save

In [8]:
import pynwb.behavior
import inspect

def get_non_empty_behavior_classes(nwb):
    """
    Function to find non-empty pynwb.behavior classes in an NWB file.

    Parameters:
        nwb: The NWB file object.

    Returns:
        A dictionary where keys are class names and values are counts of non-empty instances.
    """
    # Get all classes in pynwb.behavior
    behavior_classes = [
        cls for name, cls in inspect.getmembers(pynwb.behavior, inspect.isclass)
        if cls.__module__ == 'pynwb.behavior'
    ]

    # Check which classes are non-empty
    non_empty_classes = {}

    for behavior_class in behavior_classes:
        non_empty_objects = [
            x for x in nwb.objects.values() if isinstance(x, behavior_class)
        ]
        if non_empty_objects:  
            non_empty_classes[behavior_class.__name__] = {
                'count': len(non_empty_objects),
                'objects': non_empty_objects,
                }

    return non_empty_classes

In [9]:
behavior_assets_dandisets['filepaths'] = ''

for i, dset in tqdm(behavior_assets_dandisets.iterrows(), total=len(behavior_assets_dandisets)):
    # if i >2:
    #     break
    dandiset_id = dset['dandiset'].get_raw_metadata()['id'].split('/')[0].split(':')[-1]
    with DandiAPIClient() as client:
        c=client.get_dandiset(dandiset_id)
    files = [asset for asset in c.get_assets()]
    behavior_assets_dandisets.at[i, 'filepaths'] = files
    # print(files[0].path)

behavior_assets_dandisets = behavior_assets_dandisets.reset_index()
behavior_assets_dandisets.head(10)

  0%|          | 0/64 [00:00<?, ?it/s]

Unnamed: 0,index,dandiset_id,dandiset,behavior_keys,asset,ephys,filepaths
0,0,3,DANDI:000003/0.250624.0409,"[name, citation, description, assetsSummary.ap...",True,True,[DANDI:assets/5e9e92e1-f044-4aa0-ab47-1cfcb889...
1,2,6,DANDI:000006/0.220126.1855,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/a5ad932b-b893-4522-b989-8f406d78...
2,3,9,DANDI:000009/0.220126.1903,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/250ea757-e6a9-4520-99b5-f2efd5e3...
3,4,10,DANDI:000010/0.220126.1905,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/6b3b38b9-0736-46a4-a348-b00af509...
4,5,11,DANDI:000011/0.220126.1907,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/88dd3ee7-a37a-44b1-bb64-89855040...
5,6,13,DANDI:000013/0.220126.2143,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/cbcf1d6d-7f64-4d1f-8692-75e09e17...
6,9,17,DANDI:000017/0.240329.1926,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/2c6984f5-5fd7-4ccb-8f05-1961df65...
7,10,29,DANDI:000029/0.231017.2004,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/7a3d889c-b513-40d6-b3e9-74ccb24e...
8,13,39,DANDI:000039/0.230223.1216,"[assetsSummary.approach.name, assetsSummary.va...",True,True,[DANDI:assets/f2ca9a62-7034-4b58-892b-8a3cbf32...
9,15,45,DANDI:000045/0.211209.1413,"[name, citation, assetsSummary.approach.name, ...",True,True,[DANDI:assets/b7855fa3-b6ee-4545-86b4-b4413095...


In [10]:
files[0].get_content_url(follow_redirects=1, strip_query=True)

'https://dandiarchive.s3.amazonaws.com/blobs/390/a27/390a27ba-13ed-42fb-8709-8fa6bbcca456'

In [11]:
def get_time_info_from_behavior_class(behavior_class, nwb=None):
    # ignore pupil tracking for now

    time_info = {}
    behavior_keys = behavior_class.keys()

    spatial_key_list = ['SpatialSeries', 'Eyetracking', 'CompassDirection']
    if behavior_keys:
        for key in behavior_keys:
            try:
                if key == 'BehavioralEvents':
                    try:
                        time_series = behavior_class[key]['objects'][0].time_series
                        # print(len(time_series))
                        if len(time_series) == 0:
                            print("BehavioralEvents is empty")
                            time_info[key] = 'empty'
                        else:
                            time_stamps = list(time_series.values())[0].timestamps[:100]
                            diffs = np.diff(time_stamps)
                            mean_diff = np.mean(diffs)
                            std_diff = np.std(diffs)

                            time_info[key] = {
                                "mean_event_diff": np.round(mean_diff, 6),
                                "std_event_diff": np.round(std_diff, 6)
                            }
                    except Exception as e:
                        time_info[key] = e
                
                elif key == 'BehavioralTimeSeries':
                    obj_time_infos = {}
                    for i in range(len(behavior_class[key]['objects'])):
                        obj = behavior_class[key]['objects'][i]
                        obj_name = list(obj.time_series.keys())[0]
                        try:
                            # all = [x for x in nwb.objects.values() if isinstance(x, pynwb.behavior.TimeSeries)]
                            # time_stamps = all[0].timestamps[:1000]
                            list_dicts = list(obj.time_series.values())
                            # print(len(list_dicts))
                            for j in range(len(list_dicts)):
                                ts = list_dicts[j]
                                # print(ts)
                                starting_time = ts.starting_time
                                # print(starting_time)
                                starting_time_unit = ts.starting_time_unit
                                rate = ts.rate

                                time_stamps = ts.timestamps
                                if time_stamps is not None:
                                    diffs = np.diff(time_stamps[:500])
                                    mean_diff = np.mean(diffs)
                                    std_diff = np.std(diffs)
                                else:
                                    diffs, mean_diff, std_diff = None, None, None

                                obj_time_infos[obj_name] = {
                                    'name': obj_name,
                                    'rate': rate,
                                    'starting_time': starting_time,
                                    'starting_time_unit': starting_time_unit,
                                    "mean_diff": mean_diff,
                                    "std_diff": std_diff,
                                }
                        except Exception as e:
                            obj_time_infos[obj.name] = e
                    time_info[key] = obj_time_infos
                    
                elif key in spatial_key_list:
                    obj_time_infos = {}
                    for i in range(len(behavior_class[key]['objects'])):
                        obj = behavior_class[key]['objects'][i]
                        try:
                            # all = [x for x in nwb.objects.values() if isinstance(x, pynwb.behavior.TimeSeries)]
                            # return all 
                            starting_time = obj.starting_time
                            starting_time_unit = obj.starting_time_unit
                            rate = obj.rate
                            
                            time_stamps = obj.timestamps
                            if time_stamps is not None:
                                diffs = np.diff(time_stamps[:500])
                                mean_diff = np.mean(diffs)
                                std_diff = np.std(diffs)
                            else:
                                diffs, mean_diff, std_diff = None, None, None

                            obj_time_infos[obj.name] = {
                                'name': obj.name,
                                'rate': rate,
                                'starting_time': starting_time,
                                'starting_time_unit': starting_time_unit,
                                "mean_diff": mean_diff,
                                "std_diff": std_diff
                            }
                            
                            if rate is  None:
                                rate = 'no rate, check timestamps'
                              
                            
                            
                        except Exception as e:
                            obj_time_infos[obj.name] = e
                    time_info[key] = obj_time_infos
                elif key == 'Position':
                    obj_time_infos = {}
                    for i in range(len(behavior_class[key]['objects'])):
                        objs = list(behavior_class[key]['objects'][i].spatial_series.values())
                        # print(f"position length {len(objs)}")
                        for j in range(len(objs)):
                            obj = objs[j]
                            # print(obj)
                            try:
                                # all = [x for x in nwb.objects.values() if isinstance(x, pynwb.behavior.TimeSeries)]
                                # return all 
                                starting_time = obj.starting_time
                                starting_time_unit = obj.starting_time_unit
                                rate = obj.rate
                                
                                time_stamps = obj.timestamps
                                if time_stamps is not None:
                                    diffs = np.diff(time_stamps[:500])
                                    mean_diff = np.mean(diffs)
                                    std_diff = np.std(diffs)
                                else:
                                    diffs, mean_diff, std_diff = None, None, None

                                obj_time_infos[obj.name] = {
                                    'name': obj.name,
                                    'rate': rate,
                                    'starting_time': starting_time,
                                    'starting_time_unit': starting_time_unit,
                                    "mean_diff": mean_diff,
                                    "std_diff": std_diff
                                }
                                
                                if rate is  None:
                                    rate = 'no rate, check timestamps'
                            except Exception as e:
                                obj_time_infos[obj.name] = e
                        time_info[key] = obj_time_infos
            except Exception as e:
                time_info[key] = Exception
                    

    return time_info, behavior_class

In [12]:
behavior_assets_dandisets_time = behavior_assets_dandisets.copy()
behavior_assets_dandisets_time['experiment_description'] = ''
behavior_assets_dandisets_time['behavior_class'] = ''
behavior_assets_dandisets_time['time_info'] = ''


The following cell takes a while... (~9 minute locally on a macbookpro connected to eduroam)

In [16]:
problems = []

for i, dset in tqdm(behavior_assets_dandisets_time.iterrows(), total=len(behavior_assets_dandisets_time)):
    try:

        dandiset_id = dset['dandiset_id']
        first_filepath = dset['filepaths'][0].path

        with DandiAPIClient() as client:
            file=client.get_dandiset(dandiset_id).get_asset_by_path(first_filepath)
            lindi_url = file.download_url
        f = lindi.LindiH5pyFile.from_hdf5_file(lindi_url)
        nwb = pynwb.NWBHDF5IO(file=f, mode='r').read()

        behavior_classes = get_non_empty_behavior_classes(nwb)
        behavior_assets_dandisets_time.at[i, 'experiment_description'] = nwb.experiment_description
        behavior_assets_dandisets_time.at[i, 'behavior_class'] = behavior_classes
        
        time_info = get_time_info_from_behavior_class(behavior_classes)
        behavior_assets_dandisets_time.at[i, 'time_info'] = time_info
    except Exception as e:
        problems.append((i, dandiset_id, first_filepath, dset['dandiset'], e))

# remove probelem index
for index, _, _, _ , _ in problems:
    behavior_assets_dandisets_time = behavior_assets_dandisets_time.drop(index)

behavior_assets_dandisets_time = behavior_assets_dandisets_time.reset_index(drop=True)

with open('./problems.txt', 'w') as file:
    for problem in problems:
        file.write(f"{problem}\n")  
# behavior_assets_dandisets_time.to_csv('./behavior_assets_dandisets_time.csv')
# behavior_assets_dandisets_time_info = behavior_assets_dandisets_time[['dandiset_id', 'dandiset', 'behavior_class', 'time_info']]
# behavior_assets_dandisets_time_info.to_csv('./behavior_assets_dandisets_time_info.csv')

# Problem files

In [17]:
problems

[(34,
  '000409',
  'sub-NR-0027/sub-NR-0027_ses-ae8787b1-4229-4d56-b0c2-566b61a25b77_behavior+ecephys+image/sub-NR-0027_ses-ae8787b1-4229-4d56-b0c2-566b61a25b77_OriginalVideoBodyCamera.mp4',
  <dandi.dandiapi.RemoteDandiset at 0x113991860>,
  OSError('Unable to synchronously open file (file signature not found)')),
 (41,
  '000568',
  'sub-fCamk1/sub-fCamk1_ses-fCamk1-200902-sess13_behavior+ecephys+image/ee253149-84cc-47d3-b06f-287c51eb3771_external_file_0.avi',
  <dandi.dandiapi.RemoteDandiset at 0x113993850>,
  OSError('Unable to synchronously open file (file signature not found)'))]

In [19]:
for (_, i, f, _, _) in problems:
# for i in ['000409', '000568']:
    with DandiAPIClient() as client:
        c=client.get_dandiset(i)
    print(i)
    print([asset for asset in c.get_assets()][0].path)
    print("")


000409
sub-NR-0027/sub-NR-0027_ses-ae8787b1-4229-4d56-b0c2-566b61a25b77_behavior+ecephys+image/sub-NR-0027_ses-ae8787b1-4229-4d56-b0c2-566b61a25b77_OriginalVideoBodyCamera.mp4

000568
sub-fCamk1/sub-fCamk1_ses-fCamk1-200902-sess13_behavior+ecephys+image/ee253149-84cc-47d3-b06f-287c51eb3771_external_file_0.avi



these two files in the above problems are video files can't be opened with lindi/hdf5

In [None]:
s3_urls_000409 = get_s3_urls_and_dandi_paths(dandiset_id='000409')
s3_urls_000409

#  relevant:
# 'https://dandiarchive.s3.amazonaws.com/blobs/cef/ed8/cefed852-b697-4083-b29b-57eebd169968': 
# 'sub-NR-0027/sub-NR-0027_ses-ae8787b1-4229-4d56-b0c2-566b61a25b77_behavior+ecephys+image.nwb',

In [None]:
s3_urls_000568 = get_s3_urls_and_dandi_paths(dandiset_id='000568')
s3_urls_000568

#  relevant:
#  'https://dandiarchive.s3.amazonaws.com/blobs/d01/527/d01527aa-10c1-42a5-ab6b-c4fecb3af558': 
#  'sub-fCamk1/sub-fCamk1_ses-fCamk1-200902-sess13_behavior+ecephys+image.nwb',