Loading dataset from Allen repository.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import csv

# import from scripts
import os
# `cfg_global` lives in the shared scripts folder and is imported from inside it,
# so the working directory is switched temporarily. The try/finally guarantees the
# notebook returns to its original folder even when the import itself fails.
current_wd = os.getcwd()
# Built with os.path.join so the relative path resolves on any OS
# (on Windows this yields the same "..\..\..\isttc\scripts" as before).
scripts_dir = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir, "isttc", "scripts"))
os.chdir(scripts_dir)
try:
    from cfg_global import project_folder_path
finally:
    os.chdir(current_wd)

In [2]:
from allensdk.brain_observatory.ecephys.ecephys_project_cache import EcephysProjectCache
from allensdk.brain_observatory.ecephys.visualization import raster_plot

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# cache directory path, it determines where downloaded data will be stored
output_dir = project_folder_path + 'ecephys_cache_dir//'
# manifest file handed to the cache; it sits directly inside the cache directory
manifest_path = os.path.join(output_dir, "manifest.json")
# build the project cache on top of that manifest
cache = EcephysProjectCache.from_warehouse(manifest=manifest_path)
# show which session types the cache reports
print(cache.get_all_session_types())

['brain_observatory_1.1', 'functional_connectivity']


In [4]:
# Switch for the download cell below: its session loop only runs when this is True.
download_from_warehouse = False

In [5]:
# Folder (inside the project folder) that receives the CSV and log written by this notebook.
dataset_folder = project_folder_path + 'results\\allen_mice\\dataset\\'

### Download data to local drive 

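Already done: the sessions were downloaded previously, so the cell below is skipped (`download_from_warehouse = False`).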

In [None]:
# load all sessions to local drive
if download_from_warehouse:
    # Needed by the re-download path below; it was never imported in this notebook.
    import shutil

    # Build the list of sessions here instead of relying on a variable that is only
    # defined in a later cell, so this cell also works on a fresh kernel.
    all_sessions = cache.get_session_table()
    sessions_to_download = all_sessions[all_sessions["session_type"] == "functional_connectivity"]

    for session_id in sessions_to_download.index.values:

        # Folder holding the files of this session inside the cache directory.
        directory = os.path.join(output_dir + '/session_' + str(session_id))

        truncated_file = True
        while truncated_file:
            session = cache.get_session_data(session_id)
            try:
                print(session_id)
                # An OSError raised here is treated as a truncated download
                # (presumably this attribute access is what reads the session file).
                print(session.specimen_name)
                truncated_file = False
            except OSError:
                # Remove the broken session folder so the next iteration downloads it again.
                shutil.rmtree(directory)
                print(" Truncated spikes file, re-downloading")

### Get sessions data

Sessions are already loaded on local drive.

In [6]:
# The functional connectivity dataset contains a 30 min spontaneous activity block,
# so only sessions of that type are kept.
sessions = cache.get_session_table()
print(f'len sessions: {len(sessions)}')

is_functional_connectivity = sessions["session_type"] == "functional_connectivity"
brain_observatory_type_sessions = sessions[is_functional_connectivity]
n_selected = len(brain_observatory_type_sessions)
print(f'len brain_observatory_type_sessions = functional_connectivity: {n_selected}')
print(brain_observatory_type_sessions.keys())

# Peek at the last few selected sessions.
brain_observatory_type_sessions.tail(3)

len sessions: 58
len brain_observatory_type_sessions = functional_connectivity: 26
Index(['published_at', 'specimen_id', 'session_type', 'age_in_days', 'sex',
       'full_genotype', 'unit_count', 'channel_count', 'probe_count',
       'ecephys_structure_acronyms'],
      dtype='object')


Unnamed: 0_level_0,published_at,specimen_id,session_type,age_in_days,sex,full_genotype,unit_count,channel_count,probe_count,ecephys_structure_acronyms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
839557629,2019-10-03T00:00:00Z,821469666,functional_connectivity,115.0,M,Pvalb-IRES-Cre/wt;Ai32(RCL-ChR2(H134R)_EYFP)/wt,450,1853,5,"[APN, NOT, MB, DG, CA1, VISam, nan, VISpm, LGd..."
840012044,2019-10-03T00:00:00Z,820866121,functional_connectivity,116.0,M,Pvalb-IRES-Cre/wt;Ai32(RCL-ChR2(H134R)_EYFP)/wt,758,2298,6,"[APN, DG, CA1, VISam, nan, LP, VISpm, VISp, LG..."
847657808,2019-10-03T00:00:00Z,827809884,functional_connectivity,126.0,F,wt/wt,874,2298,6,"[APN, NOT, DG, HPF, ProS, CA1, VISam, nan, MB,..."


In [None]:
# brain_observatory_type_sessions.to_pickle(dataset_folder + 'functional_connectivity_sessions_info_df.pkl')

### Get single units 

Using the roughly 30 min block of spontaneous activity (the animals were shown a grey screen).

In [7]:
# CSV that receives one row per unit: identifiers, quality metrics, then the spike times.
output_filename = dataset_folder + 'allen_func_conn_around30min_spont_with_quality_metrics.csv'
# Text file that captures everything printed while the sessions are processed.
output_log = dataset_folder + 'dataload_log_with_quality_metrics.txt'
# When True, every unit written to the CSV is also reported in the log.
verbose = True

In [9]:
# Export the spikes of the long spontaneous block for every selected session.
#
# Everything printed by this cell goes to `output_log`. Each unit becomes one CSV row in
# `output_filename`:
#   specimen_id, session_id, unit_id, ecephys_structure_acronym, firing_rate,
#   amplitude_cutoff, isi_violations, presence_ratio, spike_time_1, spike_time_2, ...
# Spike times are measured from the onset of the spontaneous block.
#
# NOTE: the CSV is opened in append mode, so re-running this cell adds the same rows
# again -- remove the file first when regenerating the dataset.
unit_info_columns = ['ecephys_structure_acronym', 'firing_rate', 'amplitude_cutoff',
                     'isi_violations', 'presence_ratio']
csv_leading_columns = ['specimen_id', 'session_id', 'unit_id'] + unit_info_columns

old_stdout = sys.stdout
# The log file is managed by `with` so it is flushed and closed when the cell ends.
with open(output_log, 'w') as log_file:
    sys.stdout = log_file
    try:
        for session_id in brain_observatory_type_sessions.index.values:

            print('############################')
            print('processing session {}'.format(session_id))

            # load session
            session = cache.get_session_data(session_id)

            # load units
            units_df = session.units
            print('len units df {}'.format(len(units_df)))

            # load stimulus presentation: take the first spontaneous presentation whose
            # duration exceeds 1200 (presumably seconds -- confirm against the stimulus table)
            presentations = session.get_stimulus_table("spontaneous")
            spont_period_id = presentations.query('duration > 1200').index.values[0]
            print('Spontaneous period {}'.format(presentations.loc[spont_period_id, :]))

            # load spikes from stimulus period
            spikes_df = session.presentationwise_spike_times(
                stimulus_presentation_ids=spont_period_id,
                unit_ids=units_df.index.values
            )

            # make df: one row per unit, spike times collapsed into a comma-separated string
            spikes_df['time_since_stimulus_presentation_onset_str'] = (
                spikes_df.time_since_stimulus_presentation_onset.astype(str)
            )
            spikes_wide_df = spikes_df.groupby(by='unit_id', as_index=False).agg(
                spike_times=('time_since_stimulus_presentation_onset_str', ','.join)
            )

            # unit metadata and quality metrics, with the unit id turned into a regular column
            units_df_subset = units_df[unit_info_columns].reset_index()
            units_df_subset['specimen_id'] = brain_observatory_type_sessions.loc[session_id, 'specimen_id']
            units_df_subset['session_id'] = session_id

            # inner merge: only units that have spikes in the selected block are kept
            units_merged_df = pd.merge(units_df_subset, spikes_wide_df, on='unit_id', how='inner')

            # write to file
            print('Writing to csv...')
            with open(output_filename, 'a', newline='') as f:
                writer = csv.writer(f)
                for unit_row in units_merged_df.to_dict(orient='records'):
                    if verbose:
                        print('Writing unit {}'.format(unit_row['unit_id']))
                    spike_times = list(map(float, unit_row['spike_times'].split(',')))
                    writer.writerow([unit_row[column] for column in csv_leading_columns] + spike_times)
    finally:
        # Always hand stdout back to the notebook, even when a session fails;
        # otherwise all later cell output would silently end up in the log file.
        sys.stdout = old_stdout

  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."


### Some useful lines

In [None]:
# By default, the AllenSDK applies filters so only units above a set of thresholds are returned.
# The default filter values are as follows:

# isi_violations < 0.5
# amplitude_cutoff < 0.1
# presence_ratio > 0.9
# Default (filtered) call, kept for reference:
# units = cache.get_units()

# Open all three thresholds completely so that no unit is excluded by these filters.
units = cache.get_units(amplitude_cutoff_maximum = np.inf,
                        presence_ratio_minimum = -np.inf,
                        isi_violations_maximum = np.inf)
# Show the available columns and the number of units returned.
print(units.keys())
len(units)

In [None]:
# Keep only the units whose session_type is "functional_connectivity".
units.query('session_type == "functional_connectivity"')