# Ingest acquisition session paths into bl_new_acquisition.Acquisitions

In [1]:
# Check for a local configuration file; the call reports whether one was found.
from scripts.conf_file_finding import try_find_conf_file
try_find_conf_file()

Local configuration file found !!, no need to run the configuration (unless configuration has changed)


## Connection to DB

In [2]:
import datajoint as dj
import pandas as pd
import utility.path_utility as pu
import os
import pathlib

import bl_pipeline.acquisition as acq

# One virtual module per existing database schema: (module name, schema name).
new_lab, new_subject, new_acquisition = (
    dj.create_virtual_module(module_name, schema_name)
    for module_name, schema_name in (
        ('new_lab', 'bl_new_lab'),
        ('new_subject', 'bl_new_subject'),
        ('new_acquisition', 'bl_new_acquisition'),
    )
)

#bdata          = dj.create_virtual_module('bdata', 'bl_bdata')
#shadow_acquisition = dj.create_virtual_module('shadow_acquisition', 'bl_shadow_acquisition')
#new_acquisition = dj.create_virtual_module('new_acquisition', 'bl_new_acquisition')
#new_lab = dj.create_virtual_module('new_lav', 'bl_new_lab')
#ratinfo        = dj.create_virtual_module('ratinfo', 'bl_ratinfo')

Connecting alvaros@datajoint01.pni.princeton.edu:3306


### 1. Get all directories with raw acquisition from root_ephys_directory

In [24]:
# Root directory of the raw ephys data, as configured in the DataJoint config.
root_dir = pathlib.Path(dj.config['custom']['ephys_root_data_dir'])

# Start from the columns of the Acquisitions table, minus the ones that are
# not filled in by this cell.
fields_t_acq  = pd.DataFrame.from_dict(acq.Acquisitions.heading.attributes, orient='index')
acquisition_df = pd.DataFrame(columns=fields_t_acq.index.to_list())
acquisition_df = acquisition_df.drop(columns=['acquisition_id', 'user_id', 'acquisition_sessid'])

# Collect matches in a list and build the column once, instead of growing the
# DataFrame one row at a time inside the walk.
raw_rel_paths = []
for root, dirs, files in os.walk(root_dir):
    for dirname in dirs:
        aux_dir = pathlib.Path(root, dirname)
        status_dir = pu.check_file_pattern_dir(str(aux_dir), pu.file_pattern_ephys_session['raw_np_files'], search_childs=False)

        if status_dir:
            # Path relative to root_dir, POSIX style with a leading '/'.
            # relative_to() strips only the root prefix and does not depend on
            # the platform separator; the cells below split this value on '/'.
            raw_rel_paths.append('/' + aux_dir.relative_to(root_dir).as_posix())

acquisitions_found = len(raw_rel_paths)
acquisition_df['acquisition_raw_rel_path'] = raw_rel_paths

acquisition_df

KeyboardInterrupt: 

### 1a Get all directories with sorted results from clusterings_root_data_dir

In [36]:
# Root directory of the sorted (clustering) results, as configured in the DataJoint config.
sorted_dir = pathlib.Path(dj.config['custom']['clusterings_root_data_dir'])

# Start from the columns of the Sortings table, minus the ids that are not
# filled in by this cell.
fields_t_sort = pd.DataFrame.from_dict(acq.Sortings.heading.attributes, orient='index')
sorted_df = pd.DataFrame(columns=fields_t_sort.index.to_list())
sorted_df = sorted_df.drop(columns=['acquisition_id', 'sorting_id'])

# Walk the clustering root (the previous version walked the raw ephys root and
# never used sorted_dir). Matches are collected in a list and assigned once.
sorted_rel_paths = []
for root, dirs, files in os.walk(sorted_dir):
    for dirname in dirs:
        aux_dir = pathlib.Path(root, dirname)
        status_dir_sorted = pu.check_file_pattern_dir(str(aux_dir), pu.file_pattern_ephys_session['sorted_np_files'], search_childs=False)

        if status_dir_sorted:
            # Path relative to sorted_dir, POSIX style with a leading '/'.
            sorted_rel_paths.append('/' + aux_dir.relative_to(sorted_dir).as_posix())

sorted_found = len(sorted_rel_paths)
sorted_df['acquisition_post_rel_path'] = sorted_rel_paths

sorted_df


KeyboardInterrupt: 

### 2. Infer experimenter and rat from path

In [25]:
# The relative path starts with '/', so element 0 of the split is empty,
# element 1 is taken as the experimenter and element 2 as the rat name.
path_parts = acquisition_df['acquisition_raw_rel_path'].str.split('/')

acquisition_df['acquisition_type'] = 'ephys'
acquisition_df['experimenter'] = path_parts.str[1]
acquisition_df['acquisition_rat'] = path_parts.str[2]

### 3. Get experimenter user_id and merge with corresponding acquisition

In [26]:
contact_df = pd.DataFrame(new_lab.Contacts.fetch('user_id', 'experimenter', as_dict=True))

# Left join on the still unfiltered frame: rows with a null user_id are the
# acquisitions whose experimenter has no entry in Contacts.
contact_lookup = acquisition_df.merge(contact_df, on='experimenter', how='left')
acquisition_df_nouser = contact_lookup.loc[contact_lookup['user_id'].isnull(), :]

# Inner join: keep only the acquisitions whose experimenter was found.
acquisition_df = acquisition_df.merge(contact_df, on='experimenter', how='inner')

acquisition_df_nouser

Unnamed: 0,acquisition_rat,acquisition_type,acquisition_raw_rel_path,user_id,experimenter


### 4. Check corresponding ratname and filter non matching

In [27]:
subject_df = pd.DataFrame(new_subject.Rats.fetch('ratname', as_dict=True))

# Acquisitions whose rat name has no match in new_subject.Rats. This is computed
# before acquisition_df is filtered, and named like the sibling *_nouser and
# *_nosession frames of the neighbouring cells.
acquisition_df_norat = acquisition_df.merge(subject_df, left_on='acquisition_rat', right_on='ratname', how='left')
acquisition_df_norat = acquisition_df_norat.loc[acquisition_df_norat['ratname'].isnull(), :]
# Old (misspelled) name kept as an alias for anything that still refers to it.
acquisition_dfnorat = acquisition_df_norat

# Inner join: keep only the acquisitions whose rat exists in new_subject.Rats.
acquisition_df = acquisition_df.merge(subject_df, left_on='acquisition_rat', right_on='ratname', how='inner')

acquisition_df_norat

Unnamed: 0,acquisition_rat,acquisition_type,acquisition_raw_rel_path,user_id,experimenter,ratname


### 5 Check corresponding session and add it 

In [28]:
# Session ids already registered for a raw path, renamed to the Acquisitions column name.
ephys_session_df = pd.DataFrame(
    new_acquisition.AcquisitionSessions.fetch('sessid', 'acquisition_raw_rel_path', as_dict=True)
).rename(columns={'sessid': 'acquisition_sessid'})

# Left join: acquisitions without a registered session are kept, with a null acquisition_sessid.
acquisition_df = acquisition_df.merge(ephys_session_df, on='acquisition_raw_rel_path', how='left')

# Those null rows are listed separately for inspection.
acquisition_df_nosession = acquisition_df.loc[acquisition_df['acquisition_sessid'].isnull(), :]
acquisition_df_nosession

Unnamed: 0,acquisition_rat,acquisition_type,user_id,experimenter,ratname,acquisition_sessid,acquisition_raw_rel_path


## 6 Insert all acquisitions found

In [29]:
# 'experimenter' and 'ratname' were only needed for the merges in the cells above;
# drop them before inserting. Duplicate records are skipped by the insert.
acquisition_df = acquisition_df.drop(['experimenter', 'ratname'], axis=1)
acq.Acquisitions.insert(acquisition_df, skip_duplicates=True)

In [30]:
# Read the Acquisitions table back from the database to check what it now holds.
acquisition_rows = acq.Acquisitions.fetch(as_dict=True)
acquisition_db_df = pd.DataFrame(acquisition_rows)
acquisition_db_df

Unnamed: 0,acquisition_id,acquisition_sessid,acquisition_rat,user_id,acquisition_type,acquisition_raw_rel_path
0,1,,A242,abondy,ephys,/Adrian/A242/2019-06-10_g0/2019-06-10_g0_imec0
1,2,,A242,abondy,ephys,/Adrian/A242/2019-06-05_g0/2019-06-05_g0_imec0
2,3,,A242,abondy,ephys,/Adrian/A242/2019-06-20_g0/2019-06-20_g0_imec0
3,4,,A242,abondy,ephys,/Adrian/A242/2019-06-04_g0/2019-06-04_g0_imec0
4,5,,A242,abondy,ephys,/Adrian/A242/2019-05-30_g0/2019-05-30_g0_imec0
...,...,...,...,...,...,...
240,241,,A241,abondy,ephys,/Adrian/A241/no point sorting/very low trial c...
241,242,,A241,abondy,ephys,/Adrian/A241/no point sorting/very low trial c...
242,243,,A241,abondy,ephys,/Adrian/A241/no point sorting/very low trial c...
243,244,,A241,abondy,ephys,/Adrian/A241/no point sorting/very low trial c...


### 7 Merge acquisitions and sortings

In [35]:
# Keep only the columns needed to map a raw path to its acquisition_id.
acquisition_db_df = acquisition_db_df[['acquisition_raw_rel_path', 'acquisition_id']].copy()

# The raw path of a sorting is taken to be the parent directory of its
# post-processing path. pathlib is used because no get_parent_dir helper is
# defined or imported in this notebook, and Series.map also copes with an
# empty sorted_df.
# NOTE(review): in the saved output below no acquisition_id was matched by this
# join - confirm that the parent directory is the right key.
sorted_df['acquisition_raw_rel_path'] = sorted_df['acquisition_post_rel_path'].map(
    lambda post_path: pathlib.PurePosixPath(post_path).parent.as_posix()
)

# Left join: every sorting is kept, with a null acquisition_id when no acquisition matches.
sorted_df2 = sorted_df.merge(acquisition_db_df, on='acquisition_raw_rel_path', how='left')

sorted_df2

Unnamed: 0,acquisition_id_x,acquisition_post_rel_path,acquisition_raw_rel_path,acquisition_id_y
0,,/Adrian/A242/2019-06-10_g0/2019-06-10_g0_imec0,/Adrian/A242/2019-06-10_g0,
1,,/Adrian/A242/2019-05-30_g0/2019-05-30_g0_imec0,/Adrian/A242/2019-05-30_g0,
2,,/Adrian/A242/2019-06-06_g0/2019-06-06_g0_imec0,/Adrian/A242/2019-06-06_g0,
3,,/Adrian/A242/2019-06-07_g0/2019-06-07_g0_imec0,/Adrian/A242/2019-06-07_g0,
4,,/Adrian/A242/2019-05-31_g0/2019-05-31_g0_imec0,/Adrian/A242/2019-05-31_g0,
5,,/Adrian/A242/2019-06-03_g0/2019-06-03_g0_imec0,/Adrian/A242/2019-06-03_g0,
6,,/Adrian/A230/2019-07-15_g0/2019-07-15_g0_t0.imec0,/Adrian/A230/2019-07-15_g0,


In [33]:
# Display the sorted-results table as it currently stands in the kernel.
sorted_df

Unnamed: 0,acquisition_id,acquisition_post_rel_path


# 2. Sorted session processing

## 2.1 Construct and find nominal paths

In [None]:
# NOTE(review): raw_sessions_df_found is not created by any cell visible in this
# notebook, and the key read here ('clustering_root_data_dir') differs from the
# 'clusterings_root_data_dir' key used in section 1a - confirm both.
cluster_root = dj.config['custom']['clustering_root_data_dir']

# Per-session path built from the cluster root, the experimenter and the rat.
raw_sessions_df_found['subject_cluster_path'] = raw_sessions_df_found.apply(
    lambda row: pu.combine_str_path(cluster_root, [row['experimenter'], row['session_rat']]),
    axis=1,
)

# Result of pu.check_date_directory for that path and the session date.
raw_sessions_df_found['nominal_cluster_session_path'] = raw_sessions_df_found.apply(
    lambda row: pu.check_date_directory(row['subject_cluster_path'], row['session_date']),
    axis=1,
)


## 2.2 Filter only sessions with nominal path found

In [None]:
# Keep the sessions whose nominal path is not one of the "path not found" marker values.
has_nominal_path = ~raw_sessions_df_found['nominal_cluster_session_path'].isin(pu.path_not_found_dict.values())
cluster_sessions_df_nom_path_found = raw_sessions_df_found.loc[has_nominal_path, :].reset_index(drop=True)


## 2.3 Enumerate all possible directories for each session

In [None]:
# When several nominal paths were found for a session, explode() turns each
# "possibility" into its own record; the result is then ordered by sessid.
cluster_sessions_df_nom_path_found = (
    cluster_sessions_df_nom_path_found
    .explode(['nominal_cluster_session_path'])
    .sort_values(by=['sessid'])
)


## 2.4 Find session files in nominal directories and their children

In [None]:
# Call pu.find_file_pattern_dir on every nominal directory with the 'sorted_np_files' pattern.
cluster_sessions_df_nom_path_found['real_cluster_session_path'] = cluster_sessions_df_nom_path_found.apply(
    lambda row: pu.find_file_pattern_dir(row['nominal_cluster_session_path'],
                                         pu.file_pattern_ephys_session['sorted_np_files']),
    axis=1,
)

# If several recording files are found inside a "parent" path, create one record per path
cluster_sessions_df_nom_path_found = cluster_sessions_df_nom_path_found.explode(['real_cluster_session_path'])

# Drop the records whose search result is one of the "path not found" marker values.
# The explicit copy() makes the result an independent frame, so adding a column
# below does not raise SettingWithCopyWarning.
path_was_found = ~cluster_sessions_df_nom_path_found['real_cluster_session_path'].isin(pu.path_not_found_dict.values())
cluster_sessions_df_found = cluster_sessions_df_nom_path_found.loc[path_was_found, :].copy()

# Path relative to the clustering root (literal, non-regex replacement of the root string).
cluster_sessions_df_found['cluster_session_rel_path'] = cluster_sessions_df_found['real_cluster_session_path'].str.replace(
    dj.config['custom']['clustering_root_data_dir'], '', regex=False
)
cluster_sessions_df_found = cluster_sessions_df_found.reset_index(drop=True)
cluster_sessions_df_found

# 3.  Ingest into DB (preAcquisitionSession)

## 3.1 Add/Select columns from the DF --> DB

In [None]:
# directory_num numbers the candidate directories of each session (0, 1, 2, ...
# within every sessid). Columns are then renamed to the database names, and every
# record starts as 'ephys' with correct_dirs = 0.
cluster_sessions_df_found = (
    cluster_sessions_df_found
    .assign(directory_num=lambda frame: frame.groupby('sessid').cumcount())
    .reset_index(drop=True)
    .rename(columns={"cluster_session_rel_path": "acquisition_post_rel_path",
                     "raw_session_rel_path": "acquisition_raw_rel_path"})
    .assign(acquisition_type='ephys', correct_dirs=0)
)

# Keep exactly the attributes of the PreAcquisitionSessions table, in table order.
pre_acquisition_columns = new_acquisition.PreAcquisitionSessions.heading.names
pre_acquisition_sessions_df = cluster_sessions_df_found[pre_acquisition_columns]
pre_acquisition_sessions_df





In [None]:
# Column-wise maximum of the records for each directory_num.
so = pre_acquisition_sessions_df.groupby('directory_num').max()

# Candidate post-processing paths of one hard-coded session, for manual inspection.
is_inspected_session = pre_acquisition_sessions_df['sessid'] == 710898
pre_acquisition_sessions_df.loc[is_inspected_session, 'acquisition_post_rel_path'].values

## 3.2 Ingest to preAcquisitionSession

In [None]:
# Insert the candidate records one at a time, one insert1 call per DataFrame row.
for session_record in pre_acquisition_sessions_df.to_dict(orient='records'):
    new_acquisition.PreAcquisitionSessions.insert1(session_record)

# 4. Update correct_dirs of known PreAcquisitionSessions (triggers AcquisitionSessions insert)

## 4.1  Fetch from PreAcquisitionSessions

In [None]:
preacq_sessions_df = pd.DataFrame(new_acquisition.PreAcquisitionSessions.fetch(order_by='sessid desc', as_dict=True))

# Keep a session only when its sessid appears exactly once (keep=False flags every
# member of a duplicated group) and the record is the directory_num == 0 candidate.
sessid_is_unique = ~preacq_sessions_df['sessid'].duplicated(keep=False)
is_first_directory = preacq_sessions_df['directory_num'] == 0
preacq_sessions_df = preacq_sessions_df.loc[sessid_is_unique & is_first_directory, :].reset_index(drop=True)
preacq_sessions_df

## 4.2 Find unequivocal relations between directories and sessions

In [None]:
# First discard every session whose raw path is shared with another session ...
preacq_sessions_df_unique_raw = (
    preacq_sessions_df
    .drop_duplicates(subset='acquisition_raw_rel_path', keep=False)
    .reset_index(drop=True)
)

# ... then, among the remaining ones, every session whose post path is shared.
acq_sessions_df_unique = (
    preacq_sessions_df_unique_raw
    .drop_duplicates(subset='acquisition_post_rel_path', keep=False)
    .reset_index(drop=True)
)
acq_sessions_df_unique

## 4.3 Update correct_dirs of the sessions found (triggers ingest into AcquisitionSessions)

In [None]:
# Set correct_dirs to 1 on the directory_num == 0 record of every session that
# passed the uniqueness filters.
for unique_sessid in acq_sessions_df_unique['sessid'].values:
    new_acquisition.PreAcquisitionSessions.update1(
        {'sessid': unique_sessid, 'directory_num': 0, 'correct_dirs': 1}
    )

## 4.4 Check AcquisitionSessions records

In [None]:
# Read AcquisitionSessions back, highest sessid first, to check the result of the updates.
acq_session_rows = new_acquisition.AcquisitionSessions.fetch(order_by='sessid desc', as_dict=True)
acq_sessions_df = pd.DataFrame(acq_session_rows)
acq_sessions_df