# Notes

In this notebook, we convert the .csv files to .h5 files, and ensure that the idtracker.ai number of frames match the sLEAP number of frames

# Imports

In [2]:
import numpy as np
import h5py
import pandas as pd
import time
import glob
from multiprocessing import Pool
import matplotlib.pyplot as plt
%load_ext autoreload
import os
%autoreload 2
import sys
%matplotlib widget

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepare filepaths

In [3]:
main_data_folder = '/media/liam/hd1/fighting_data'

In [4]:
# gather all the top level experiment filepaths
exp_folder_paths = glob.glob(main_data_folder+'/*')
exp_folder_paths.sort()
exp_folder_paths

['/media/liam/hd1/fighting_data/FishTank20200127_143538',
 '/media/liam/hd1/fighting_data/FishTank20200129_140656',
 '/media/liam/hd1/fighting_data/FishTank20200130_153857',
 '/media/liam/hd1/fighting_data/FishTank20200130_181614',
 '/media/liam/hd1/fighting_data/FishTank20200207_161445',
 '/media/liam/hd1/fighting_data/FishTank20200213_154940',
 '/media/liam/hd1/fighting_data/FishTank20200214_153519',
 '/media/liam/hd1/fighting_data/FishTank20200217_160052',
 '/media/liam/hd1/fighting_data/FishTank20200218_153008',
 '/media/liam/hd1/fighting_data/FishTank20200316_163320',
 '/media/liam/hd1/fighting_data/FishTank20200327_154737',
 '/media/liam/hd1/fighting_data/FishTank20200330_161100',
 '/media/liam/hd1/fighting_data/FishTank20200331_162136',
 '/media/liam/hd1/fighting_data/FishTank20200520_152810',
 '/media/liam/hd1/fighting_data/FishTank20200521_154541',
 '/media/liam/hd1/fighting_data/FishTank20200525_161602',
 '/media/liam/hd1/fighting_data/FishTank20200526_160100',
 '/media/liam/

In [5]:
len(exp_folder_paths)

22

In [6]:
# get the exps names
expNames = [os.path.basename(folderpath) for folderpath in exp_folder_paths]
expNames

['FishTank20200127_143538',
 'FishTank20200129_140656',
 'FishTank20200130_153857',
 'FishTank20200130_181614',
 'FishTank20200207_161445',
 'FishTank20200213_154940',
 'FishTank20200214_153519',
 'FishTank20200217_160052',
 'FishTank20200218_153008',
 'FishTank20200316_163320',
 'FishTank20200327_154737',
 'FishTank20200330_161100',
 'FishTank20200331_162136',
 'FishTank20200520_152810',
 'FishTank20200521_154541',
 'FishTank20200525_161602',
 'FishTank20200526_160100',
 'FishTank20200527_152401',
 'FishTank20200824_151740',
 'FishTank20200828_155504',
 'FishTank20200902_160124',
 'FishTank20200903_160946']

In [7]:
cam_names = ['D_xz', 'E_xy', 'F_yz']
numCams = len(cam_names)

# Check that sLEAP and idtracker give us the same total number of frames

##  from the sleap csv files

In [8]:
expNumFrames_sLEAP = []

for expIdx in range(len(expNames)):
    print(expIdx)

    # --- parse the folder paths for this experiment --- #
    expFolderPath = exp_folder_paths[expIdx]
    exp_cam_folders = [os.path.join(expFolderPath, 'sleap_results_csv', camName) for camName in cam_names]


    # --- grab all the csv file paths ---- #
    exp_cam_csv_paths = []
    for camIdx in range(len(cam_names)):
        cam_csv_paths = glob.glob(exp_cam_folders[camIdx]+'/*.csv')
        cam_csv_paths.sort()
        exp_cam_csv_paths.append(cam_csv_paths)


    #  --- test that we have the same number of csv files in each camera view ---- #
    num_files_for_each_cam = [len(csv_list) for csv_list in exp_cam_csv_paths]
    if len(set(num_files_for_each_cam)) != 1:
        raise TypeError("cam views have different number of csv files")
    # now set the total number of csv files, since all cams have the same number
    num_csv_files = list(set(num_files_for_each_cam))[0]


    #--- get the total number of frames in the experiment ---#
    # Note on the method:
    # all csv files, apart from the final one, will have 6000 frames.
    # We count the number of frames in the last csv file.
    # So numFrames = (numCsvFiles-1)*6000 + (numFrames_in_last_csv_file)

    # count the number of frames from each camera view separately
    last_mp4_final_frame_indices = []
    for camIdx in range(len(cam_names)):
        last_results_df = pd.read_csv(exp_cam_csv_paths[camIdx][-1], names=["frame_index", "instance_index", "point_index", "x", "y"])
        final_frame_index = last_results_df['frame_index'].values[-1]
        last_mp4_final_frame_indices.append(final_frame_index)

    # test that we have the same number
    if len(set(last_mp4_final_frame_indices)) != 1:
        print(f"expIdx={expIdx} has  different number of frames in last csv file. Using the max value for the calculation")
        print(last_mp4_final_frame_indices)
        print()
        num_frames_in_last_mp4 = np.max(np.array(last_mp4_final_frame_indices)) + 1 # +1 to move from 0-indexing to cardinality
        #raise TypeError("cam views have different number of frames in last csv file")
    else:
        num_frames_in_last_mp4 = list(set(last_mp4_final_frame_indices))[0] + 1 # +1 to move from 0-indexing to cardinality

    # now get the total number of frames
    numFrames = (num_csv_files-1)*6000 + num_frames_in_last_mp4
    
    
    # record
    expNumFrames_sLEAP.append(numFrames)
    
expNumFrames_sLEAP = np.array(expNumFrames_sLEAP)
expNumFrames_sLEAP

0
expIdx=0 has  different number of frames in last csv file. Using the max value for the calculation
[5755, 5755, 5753]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


array([ 971756,  769848,  501943,  517000,  463997,  730119,  495610,
        601226,  537880, 1032353,  561010,  556428,  746434, 1248577,
        596607,  588962,  689100,  497635, 1695643,  798382,  717814,
        726000])

## from idtracker results

In [9]:
t0 = time.time()

expNumFrames_idtracker = []

for expIdx in range(len(expNames)):
    #print(expIdx)
    
    raw_idtracker_filepath = os.path.join(exp_folder_paths[expIdx], 'idtracker_results/trajectories.npy')
    trajectories_dict = np.load(raw_idtracker_filepath, allow_pickle=True).item()
    idtraj = trajectories_dict['trajectories']
    numFrames = idtraj.shape[0]
    expNumFrames_idtracker.append(numFrames)
    
expNumFrames_idtracker = np.array(expNumFrames_idtracker)

In [10]:
expNumFrames_idtracker

array([ 971756,  769848,  501943,  517000,  463997,  730119,  495610,
        601226,  537880, 1032353,  561010,  556428,  746434, 1248577,
        596607,  588962,  689100,  497635, 1695643,  798382,  717814,
        726000])

## compare

In [11]:
expNumFrames_sLEAP - expNumFrames_idtracker

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
# we can now use either to set the total number of frames
expNumFrames = expNumFrames_sLEAP
expNumFrames

array([ 971756,  769848,  501943,  517000,  463997,  730119,  495610,
        601226,  537880, 1032353,  561010,  556428,  746434, 1248577,
        596607,  588962,  689100,  497635, 1695643,  798382,  717814,
        726000])

# convert csv format to multidimensional arrays

In [13]:
expNumFrames

array([ 971756,  769848,  501943,  517000,  463997,  730119,  495610,
        601226,  537880, 1032353,  561010,  556428,  746434, 1248577,
        596607,  588962,  689100,  497635, 1695643,  798382,  717814,
        726000])

In [14]:
numFish = 2
numBodyPoints = 3

idtracker_acceleration_thresh = 3.3
idtracker_nan_win = 6

numProcessors = 40

In [15]:
def load_and_process_idtracker_data(idTracker_filepath, acceleration_thresh, NaN_window):
    ''' A function to load idTracker results and parse them, any potential swaps
        after collision events by results with very high accelerations
        
    --- args ---
    idTracker_filepath:  the filepath to the trajectories.npy file for an experiment
    acceleration_thresh: the threshold on the absolute value of the acceleration
    NaN_window:          the number of frames either side of problematic frames to remove
    
    --- returns ---
    idtraj: the processed idtracker timeseries.

    -- Thanks to --
    Simon Goorney
    '''
    # load the centroid timeseries
    trajectories_dict = np.load(idTracker_filepath, allow_pickle=True).item()
    idtraj = trajectories_dict['trajectories']

    # compute the relevant derivatives
    speed = np.gradient(idtraj, axis=0)
    normspeed = np.linalg.norm(speed, axis=2)
    normaccel = np.gradient(normspeed, axis=0)
    absaccel = np.abs(normaccel)
    
    # compute the number of frames in the experiment
    nfs = speed.shape[0]

    def compare_nan_array(func, a, thresh):
        # Thanks: https://stackoverflow.com/a/47340067
        out = ~np.isnan(a)
        out[out] = func(a[out] , thresh)
        return out

    # find problematic frames
    super_threshold_indices = np.where(compare_nan_array(np.greater, absaccel, acceleration_thresh))
    indices = super_threshold_indices[0]

    # loop through problem frames, Nan'ing either side in a window
    for i in indices:
        if i < NaN_window:
            idtraj[:i+int(NaN_window/2)] = np.NaN
        elif nfs - i < NaN_window:
            idtraj[i-int(NaN_window/2):] = np.NaN
        else:
            idtraj[i-int(NaN_window/2):i+int(NaN_window/2)] = np.NaN

    # finish up
    return idtraj



def convert_csv_triple_to_array_PARALLEL(i):
    ''' Given an index, which we use to parse a triplet of filepaths for csv file sleap results
        for the three camera views of the same points in time,
        return an array of the sleap results in the shape (numCams, numFrames, numFish, numBodypoints, 2).
    '''
    cam_names = ['D_xz', 'E_xy', 'F_yz']
    numCams = len(cam_names)
    
    # parse the filepaths for this jobIdx 
    exp_csv_cam_paths = var_dict['experiment_csv_cam_paths']
    csv_cam_paths = exp_csv_cam_paths[i]
    #print(csv_cam_paths)
    
    # --- get the number of movie frames for this csv file ---- #
    # Note: we need to be careful in case the csv files for the differnet camera views have differing values
    #       So compute using all three views, and use the max
    csv_final_frame_indices = []
    for camIdx in range(numCams):
        df = pd.read_csv(csv_cam_paths[camIdx], names=["frame_index", "instance_index", "point_index", "x", "y"])
        final_frame_index = df['frame_index'].values[-1]
        csv_final_frame_indices.append(final_frame_index)
    csv_numframes = np.max(np.array(csv_final_frame_indices)) + 1 # +1 to move from 0-indexing to cardinality
    
    # --- preallocate the output for this csv file --- #
    csv_sleap_data_array = np.zeros((numCams, csv_numframes, numFish, numBodyPoints, 2))*np.NaN

    # --- enter the data, one camera at a time ---- #
    for camIdx in range(len(cam_names)):

        cam_results_df = pd.read_csv(csv_cam_paths[camIdx], names=["frame_index", "instance_index", "point_index", "x", "y"])

        for fIdx in range(csv_numframes):
            # grab the frame data
            frame_data = cam_results_df[cam_results_df['frame_index']==fIdx]
            # get number of matched instances
            tracked_instance_idxs = np.unique(frame_data['instance_index'].values)
            num_frame_instances = len(tracked_instance_idxs)

            # make a dictionary for mapping tracked_instance_idxs to fishIdxs.
            # Note: tracked_instance_idxs runs through the whole csv file,
            #       and was generated by some sLEAP temporal tracking.
            #       But we don't want this sLEAP tracking. 
            #       Each frame, we only want to be able to tell apart the different detected animals.
            #       So we map the tracked_instance_idxs to a zero indexed list each frame.
            #       We remove the temporal tracking component, and reset each frame
            #       e.g.:  tracked_instance_idxs = [34,35] 
            #              fishIdxs             -> [0, 1],
            fish_index_dict = dict(zip(tracked_instance_idxs, [i for i in range(num_frame_instances)]))

            # place each datapoint in the correct part of the array
            for index, row in frame_data.iterrows():
                csv_sleap_data_array[camIdx, 
                                     fIdx, 
                                     fish_index_dict[row['instance_index']], 
                                     int(row['point_index']), 
                                     :] = row[['x', 'y']].values
    # ---- finish up -------- #
    return csv_sleap_data_array

In [23]:
for expIdx in range(1, len(expNames)):
    print(expIdx)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [24]:
# ----- Prepare the data for tracking ----- #
t0 = time.time()


#for expIdx in range(len(expNames)):
#for expIdx in [0]:
for expIdx in range(1, len(expNames)):

    # make the savepath for the gathered data
    exp_savepath = os.path.join(exp_folder_paths[expIdx], expNames[expIdx]+'_sLEAP_and_idtracks.h5')


    # --- parse the folder paths for this experiment --- #
    expFolderPath = exp_folder_paths[expIdx]
    exp_cam_folders = [os.path.join(expFolderPath, 'sleap_results_csv', camName) for camName in cam_names]

    # --- grab all the csv file paths ---- #
    exp_cam_csv_paths = []
    for camIdx in range(len(cam_names)):
        cam_csv_paths = glob.glob(exp_cam_folders[camIdx]+'/*.csv')
        cam_csv_paths.sort()
        exp_cam_csv_paths.append(cam_csv_paths)

    #  --- test that we have the same number of csv files in each camera view ---- #
    num_files_for_each_cam = [len(csv_list) for csv_list in exp_cam_csv_paths]
    if len(set(num_files_for_each_cam)) != 1:
        raise TypeError("cam views have different number of csv files")
    # now set the total number of csv files, since all cams have the same number
    num_csv_files = list(set(num_files_for_each_cam))[0]


    # ---- create a list of filepaths for parallelization ---- #
    exp_csv_cam_paths = [ [exp_cam_csv_paths[camIdx][csv_idx] for camIdx in range(len(cam_names))] for csv_idx in range(num_csv_files) ]
    job_idxs = [i for i in range(len(exp_csv_cam_paths))]
    #job_idxs = job_idxs[:20]


    # ---- create a dictionary for access to this function ----- #
    var_dict = {}
    def init_worker(experiment_csv_cam_paths):
        var_dict['experiment_csv_cam_paths'] = experiment_csv_cam_paths


    # ---- map our function over all jobs ------ #
    with Pool(processes=numProcessors, initializer=init_worker, initargs=(exp_csv_cam_paths,)) as pool:
        outputs = pool.map(convert_csv_triple_to_array_PARALLEL, job_idxs)


    # ---- concatenate the results into one array for this experiment ---- #
    sleap_data = np.concatenate(outputs, axis=1)


    # ---- now load and process the idtracker data ----- #
    raw_idtracker_filepath = os.path.join(exp_folder_paths[expIdx], 'idtracker_results/trajectories.npy')
    trajectories_dict = np.load(raw_idtracker_filepath, allow_pickle=True).item()
    idtraj = trajectories_dict['trajectories']
    idtracker_data = load_and_process_idtracker_data(raw_idtracker_filepath, idtracker_acceleration_thresh, idtracker_nan_win)


    # -------- save the ouputs -------#
    with h5py.File(exp_savepath, 'w') as hf:
        hf.create_dataset('idTracker_data', data=idtracker_data)
        hf.create_dataset('sLEAP_data', data=sleap_data)


    print(expIdx, ' finished: {0}s'.format(time.time()-t0))
    print()



# ----------------------------------------------------------------------------------------------------------------------------------#

print()
print()
print('--- all finished ---')
tE = time.time()
print(tE-t0)

1  finished: 289.67706990242004s

2  finished: 503.78289675712585s

3  finished: 721.5171310901642s

4  finished: 886.4639909267426s

5  finished: 1148.6182172298431s

6  finished: 1356.9830145835876s

7  finished: 1586.21426820755s

8  finished: 1799.9946608543396s

9  finished: 2233.0473012924194s

10  finished: 2458.712166786194s

11  finished: 2678.4258670806885s

12  finished: 2968.509666442871s

13  finished: 3423.8771572113037s

14  finished: 3655.2619819641113s

15  finished: 3877.9798069000244s

16  finished: 4122.609823703766s

17  finished: 4330.636656522751s

18  finished: 4950.6605887413025s

19  finished: 5254.85621213913s

20  finished: 5515.95645403862s

21  finished: 5782.770781993866s



--- all finished ---
5782.771513700485


In [25]:
5782/60

96.36666666666666