# Exploring the experiment meta data

In [1]:
import xarray as xr

### How many data files do we have?

In [2]:
import glob

In [3]:
all_data_files = glob.glob("../steinmentz_data_curation/data/**/*.nc", recursive=True)
all_data_files

['../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2016-12-14_Cori.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2016-12-17_Cori.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2016-12-18_Cori.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-07_Muller.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-08_Muller.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-08_Radnitz.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-09_Muller.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-09_Radnitz.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-10_Radnitz.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-11_Radnitz.nc',
 '../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2017-01-12_Radnitz.nc',
 '../steinm

In [4]:
len(all_data_files)

39

### Loading one of the data files as an XArray Dataset

In [5]:
data_path = all_data_files[0]
data_path

'../steinmentz_data_curation/data/processed/neuropixels/steinmetz_2016-12-14_Cori.nc'

In [6]:
dset = xr.open_dataset(data_path)
dset

In [7]:
list(dset.variables)

['mouse',
 'session_date',
 'trial',
 'contrast_left',
 'contrast_right',
 'gocue',
 'stim_onset',
 'feedback_type',
 'feedback_time',
 'response_type',
 'response_time',
 'reaction_type',
 'reaction_time',
 'prev_reward',
 'active_trials',
 'time',
 'wheel',
 'licks',
 'pupil_x',
 'pupil_y',
 'pupil_area',
 'face',
 'cell',
 'spks',
 'trough_to_peak',
 'ccf_ap',
 'ccf_dv',
 'ccf_lr',
 'brain_area',
 'brain_groups']

In [8]:
dset.face

In [9]:
dset["contrast_left"]

In [10]:
dset[["contrast_left", "contrast_right"]]

In [11]:
subset_dset = dset[["contrast_left", "contrast_right"]]

In [12]:
subset_dset.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,contrast_left,contrast_right
mouse,session_date,trial,Unnamed: 3_level_1,Unnamed: 4_level_1
Cori,2016-12-14,1,100,0
Cori,2016-12-14,2,0,50
Cori,2016-12-14,3,100,50
Cori,2016-12-14,4,0,0
Cori,2016-12-14,5,50,100
Cori,2016-12-14,...,...,...
Cori,2016-12-14,360,50,25
Cori,2016-12-14,361,50,25
Cori,2016-12-14,362,0,50
Cori,2016-12-14,363,25,0


In [13]:
subset_dset.to_dataframe().reset_index()

Unnamed: 0,mouse,session_date,trial,contrast_left,contrast_right
0,Cori,2016-12-14,1,100,0
1,Cori,2016-12-14,2,0,50
2,Cori,2016-12-14,3,100,50
3,Cori,2016-12-14,4,0,0
4,Cori,2016-12-14,5,50,100
...,...,...,...,...,...
359,Cori,2016-12-14,360,50,25
360,Cori,2016-12-14,361,50,25
361,Cori,2016-12-14,362,0,50
362,Cori,2016-12-14,363,25,0


In [14]:
dset[["contrast_left", "contrast_right", "spks"]]

In [15]:
dset[["contrast_left", "contrast_right", "spks"]].to_dataframe().reset_index()

Unnamed: 0,mouse,session_date,trial,cell,time,contrast_left,contrast_right,spks
0,Cori,2016-12-14,1,1,0.01,100,0,0
1,Cori,2016-12-14,1,1,0.02,100,0,0
2,Cori,2016-12-14,1,1,0.03,100,0,0
3,Cori,2016-12-14,1,1,0.04,100,0,0
4,Cori,2016-12-14,1,1,0.05,100,0,0
...,...,...,...,...,...,...,...,...
66793995,Cori,2016-12-14,364,734,2.46,100,100,0
66793996,Cori,2016-12-14,364,734,2.47,100,100,0
66793997,Cori,2016-12-14,364,734,2.48,100,100,0
66793998,Cori,2016-12-14,364,734,2.49,100,100,0


## Get all the variables with dimensions `(mouse, session_date, trial)`

In [16]:
list(dset.variables)

['mouse',
 'session_date',
 'trial',
 'contrast_left',
 'contrast_right',
 'gocue',
 'stim_onset',
 'feedback_type',
 'feedback_time',
 'response_type',
 'response_time',
 'reaction_type',
 'reaction_time',
 'prev_reward',
 'active_trials',
 'time',
 'wheel',
 'licks',
 'pupil_x',
 'pupil_y',
 'pupil_area',
 'face',
 'cell',
 'spks',
 'trough_to_peak',
 'ccf_ap',
 'ccf_dv',
 'ccf_lr',
 'brain_area',
 'brain_groups']

In [17]:
dset["trial"]

In [18]:
metadata_variables = [
    'mouse',
    'session_date',
    'trial',
    'contrast_left',
    'contrast_right',
    'gocue',
    'stim_onset',
    # 'feedback_type',
    # 'feedback_time',
    # 'response_type',
    # 'response_time',
    # 'reaction_type',
    # 'reaction_time',
    # 'prev_reward',
    'active_trials'
]

In [19]:
dset[metadata_variables]

In [20]:
df = dset[metadata_variables].to_dataframe().reset_index()
df

Unnamed: 0,mouse,session_date,trial,contrast_left,contrast_right,gocue,stim_onset,active_trials
0,Cori,2016-12-14,1,100,0,1.027216,0.5,True
1,Cori,2016-12-14,2,0,50,0.874414,0.5,True
2,Cori,2016-12-14,3,100,50,0.825213,0.5,True
3,Cori,2016-12-14,4,0,0,0.761612,0.5,True
4,Cori,2016-12-14,5,50,100,0.662010,0.5,True
...,...,...,...,...,...,...,...,...
359,Cori,2016-12-14,360,50,25,,0.5,False
360,Cori,2016-12-14,361,50,25,,0.5,False
361,Cori,2016-12-14,362,0,50,,0.5,False
362,Cori,2016-12-14,363,25,0,,0.5,False


## Collect data for all sessions

In [21]:
import pandas as pd

In [22]:
all_sessions = []
for data_path in all_data_files:
    dset = xr.open_dataset(data_path)
    all_sessions.append(dset[metadata_variables].to_dataframe().reset_index())
    
df = pd.concat(all_sessions).reset_index()

In [23]:
df

Unnamed: 0,index,mouse,session_date,trial,contrast_left,contrast_right,gocue,stim_onset,active_trials
0,0,Cori,2016-12-14,1,100,0,1.027216,0.5,True
1,1,Cori,2016-12-14,2,0,50,0.874414,0.5,True
2,2,Cori,2016-12-14,3,100,50,0.825213,0.5,True
3,3,Cori,2016-12-14,4,0,0,0.761612,0.5,True
4,4,Cori,2016-12-14,5,50,100,0.662010,0.5,True
...,...,...,...,...,...,...,...,...,...
14415,421,Lederberg,2017-12-11,422,100,100,,0.5,False
14416,422,Lederberg,2017-12-11,423,0,100,,0.5,False
14417,423,Lederberg,2017-12-11,424,0,50,,0.5,False
14418,424,Lederberg,2017-12-11,425,0,100,,0.5,False


### How many different mice are included in this data?

In [24]:
df.mouse.unique()

array(['Cori', 'Muller', 'Radnitz', 'Moniz', 'Hench', 'Theiler',
       'Richards', 'Forssmann', 'Lederberg', 'Tatum'], dtype=object)

## How many trials were recorded for each mouse?

In [25]:
df.mouse.value_counts().sort_index()

Cori         1143
Forssmann    1485
Hench        1851
Lederberg    2902
Moniz         896
Muller       1112
Radnitz      1512
Richards     1677
Tatum        1389
Theiler       453
Name: mouse, dtype: int64

## How many sessions were recorded for each mouse?

In [26]:
df.groupby("mouse").apply(lambda x: len(x.session_date.unique()))

mouse
Cori         3
Forssmann    4
Hench        4
Lederberg    7
Moniz        3
Muller       3
Radnitz      5
Richards     5
Tatum        4
Theiler      1
dtype: int64

## Is the `stimulus_onset` varying?

In [27]:
df.stim_onset.unique()

array([0.5])

## How long did it take to conduct all the sessions?

In [28]:
df_sorted_in_time = df.sort_values(by="session_date")
first_date = df_sorted_in_time.iloc[0]["session_date"]
last_date = df_sorted_in_time.iloc[-1]["session_date"]

pd.to_datetime(last_date) - pd.to_datetime(first_date)

Timedelta('362 days 00:00:00')

## How many active trials per session?

In [29]:
df.groupby("session_date").apply(lambda x: sum(x.active_trials))

session_date
2016-12-14    214
2016-12-17    251
2016-12-18    228
2017-01-07    444
2017-01-08    412
2017-01-09    365
2017-01-10    253
2017-01-11    142
2017-01-12    128
2017-05-15    247
2017-05-16    235
2017-05-18    124
2017-06-15    250
2017-06-16    372
2017-06-17    447
2017-06-18    342
2017-10-11    343
2017-10-29    143
2017-10-30    237
2017-10-31    260
2017-11-01    440
2017-11-02    550
2017-11-04    290
2017-11-05    252
2017-12-05    340
2017-12-06    611
2017-12-07    526
2017-12-08    585
2017-12-09    479
2017-12-10    224
2017-12-11    316
dtype: int64

## Percentage of active trials per session?

In [30]:
df.groupby("session_date").apply(lambda x: sum(x.active_trials)/len(x.active_trials) * 100)

session_date
2016-12-14    58.791209
2016-12-17    62.593516
2016-12-18    60.317460
2017-01-07    80.144404
2017-01-08    65.189873
2017-01-09    62.393162
2017-01-10    69.696970
2017-01-11    56.349206
2017-01-12    53.781513
2017-05-15    69.187675
2017-05-16    68.115942
2017-05-18    63.917526
2017-06-15    69.444444
2017-06-16    77.178423
2017-06-17    80.251346
2017-06-18    75.663717
2017-10-11    75.717439
2017-10-29    56.521739
2017-10-30    68.299712
2017-10-31    70.270270
2017-11-01    66.666667
2017-11-02    71.428571
2017-11-04    72.500000
2017-11-05    69.613260
2017-12-05    75.555556
2017-12-06    73.525872
2017-12-07    70.509383
2017-12-08    72.670807
2017-12-09    68.526466
2017-12-10    67.065868
2017-12-11    74.178404
dtype: float64

## Which session and mouse had the highest percentage of active trials?

In [31]:
active_trials = df.groupby("session_date").apply(lambda x: sum(x.active_trials)/len(x.active_trials) * 100)

In [32]:
active_trials[active_trials == active_trials.max()]

session_date
2017-06-17    80.251346
dtype: float64

In [33]:
df[df.session_date == "2017-06-17"]["mouse"].unique()

array(['Hench'], dtype=object)