### Notebook to go through all mat files and get fully hand labeled trials

In [1]:
from pathlib import Path
from tqdm import tqdm
import shutil
import os
from scipy.io import loadmat
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
import pandas as pd

In [2]:
columns = ["mat_path", "full_hand_labels"]
df = pd.DataFrame(columns=columns)

In [3]:
df

Unnamed: 0,mat_path,full_hand_labels


In [4]:
mat_dir = Path("/data/caitlin/mat_files/")

In [5]:
def check_all_behaviors(ethogram: np.ndarray):
    for row in ethogram:
        if not row.any():
            return False
    return True

In [6]:
mat_paths = sorted(mat_dir.glob('*'))
len(mat_paths)

696

In [7]:
mat_errors = list()
def get_hand_labels(mat_path): 
    hand_labels = dict()

    m = loadmat(mat_path)
    try:
        num_trials = m["data"]["trial"].shape[0]
        
    except KeyError:
        mat_errors.append(mat_path)
        return

    for i in tqdm(range(num_trials)):
        mat_trial_index = np.argwhere(m["data"]["trial"].ravel() == (i + 1))
        # Trial not found in JAABA data
        if mat_trial_index.size == 0:
            continue
        
        try:
            mat_trial_index = mat_trial_index.item()
        except ValueError:
            mat_errors.append(mat_path)
            return

        behaviors = sorted([b.split('_')[0] for b in m['data'].dtype.names if 'scores' in b])
        
        if len(behaviors) != 6:
            continue

        all_behaviors = [
            "Lift",
            "Handopen",
            "Grab",
            "Sup",
            "Atmouth",
            "Chew"
        ]

        sorted_behaviors = [b for b in all_behaviors if b in behaviors]

        ethograms = []

        for b in sorted_behaviors:
            behavior_index = m['data'].dtype.names.index(f'{b}_labl_label')
            row = m['data'][mat_trial_index][0][behavior_index]
            row[row == -1] = 0
            ethograms.append(row)

        sorted_behaviors = [b.lower() for b in sorted_behaviors]

        ethogram = np.hstack(ethograms).T

        if check_all_behaviors(ethogram):
            hand_labels[m["data"]["exp"][i][0][0]] = ethogram
        
    return hand_labels

In [8]:
for mat in tqdm(mat_paths):
    hand_labels = get_hand_labels(str(mat))
    if hand_labels:
        df.loc[len(df.index)] = [mat.stem, hand_labels] 

  0%|                                               | 0/696 [00:00<?, ?it/s]
100%|█████████████████████████████████████| 80/80 [00:00<00:00, 5950.53it/s][A
  0%|                                       | 1/696 [00:00<01:12,  9.63it/s]
100%|█████████████████████████████████████| 85/85 [00:00<00:00, 5749.61it/s][A

100%|█████████████████████████████████████| 85/85 [00:00<00:00, 5729.56it/s][A
  0%|▏                                      | 3/696 [00:00<01:02, 11.03it/s]
100%|█████████████████████████████████████| 80/80 [00:00<00:00, 5993.58it/s][A

100%|█████████████████████████████████████| 80/80 [00:00<00:00, 5963.32it/s][A
  1%|▎                                      | 5/696 [00:00<00:58, 11.82it/s]
100%|█████████████████████████████████████| 80/80 [00:00<00:00, 5895.02it/s][A

100%|█████████████████████████████████████| 80/80 [00:00<00:00, 5944.31it/s][A
  1%|▍                                      | 7/696 [00:00<00:57, 12.07it/s]
100%|█████████████████████████████████████| 80/80 [0

In [9]:
df

Unnamed: 0,mat_path,full_hand_labels
0,/data/caitlin/mat_files/M234_20170328.mat,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
1,/data/caitlin/mat_files/M234_20170331.mat,"{'M234_20170331_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
2,/data/caitlin/mat_files/M234_20170403.mat,"{'M234_20170403_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
3,/data/caitlin/mat_files/M234_20170412.mat,"{'M234_20170412_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
4,/data/caitlin/mat_files/M234_20170414.mat,"{'M234_20170414_v002': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
84,/data/caitlin/mat_files/M324_20201015.mat,"{'M324_20201015_v126': [[0, 0, 0, 0, 0, 0, 0, ..."
85,/data/caitlin/mat_files/M326_20201110.mat,"{'M326_20201110_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
86,/data/caitlin/mat_files/M328_20201130_2500.mat,"{'M328_20201130_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
87,/data/caitlin/mat_files/M336_20210612.mat,"{'M336_20210612_v001': [[0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
mat_errors

['/data/caitlin/mat_files/M236_20170817.mat',
 '/data/caitlin/mat_files/M236_20170818.mat',
 '/data/caitlin/mat_files/M238_20170717.mat',
 '/data/caitlin/mat_files/M238_20170719.mat',
 '/data/caitlin/mat_files/M238_20170724.mat',
 '/data/caitlin/mat_files/M238_20170725.mat',
 '/data/caitlin/mat_files/M238_20170726.mat',
 '/data/caitlin/mat_files/M238_20170727.mat',
 '/data/caitlin/mat_files/M239_20170905.mat',
 '/data/caitlin/mat_files/M240_20170724FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170725FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170727FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170728FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170801FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170802FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170807FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170808FinalChecked.mat',
 '/data/caitlin/mat_files/M256_20180124_3rdCx.mat',
 '/data/caitlin/mat_files/M256_20180124_3rdCx_ORIG.mat',
 '/data/caitl

In [11]:
count = 0
for row in df.iterrows():
    count += len(row[1]["full_hand_labels"].keys())
count

135

In [12]:
df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mat_path', 'full_hand_labels'], dtype='object')]

  df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")


In [13]:
df = pd.read_hdf("/data/caitlin/hand_labels.hdf")

In [14]:
mat_errors2 = list()
def get_hand_labels(mat_path): 
    hand_labels = dict()

    m = loadmat(mat_path)
    try:
        num_trials = m["data1"]["trial"].shape[0]
        
    except KeyError:
        mat_errors2.append(mat_path)
        return

    for i in tqdm(range(num_trials)):
        mat_trial_index = np.argwhere(m["data1"]["trial"].ravel() == (i + 1))
        # Trial not found in JAABA data
        if mat_trial_index.size == 0:
            continue
        
        try:
            mat_trial_index = mat_trial_index.item()
        except ValueError:
            mat_errors2.append(mat_path)
            return

        behaviors = sorted([b.split('_')[0] for b in m['data1'].dtype.names if 'scores' in b])
        
        if len(behaviors) != 6:
            continue

        all_behaviors = [
            "Lift",
            "Handopen",
            "Grab",
            "Sup",
            "Atmouth",
            "Chew"
        ]

        sorted_behaviors = [b for b in all_behaviors if b in behaviors]

        ethograms = []

        for b in sorted_behaviors:
            behavior_index = m['data1'].dtype.names.index(f'{b}_labl_label')
            row = m['data1'][mat_trial_index][0][behavior_index]
            row[row == -1] = 0
            ethograms.append(row)

        sorted_behaviors = [b.lower() for b in sorted_behaviors]

        ethogram = np.hstack(ethograms).T

        if check_all_behaviors(ethogram):
            hand_labels[m["data1"]["exp"][i][0][0]] = ethogram

    return hand_labels

In [15]:
for mat in tqdm(mat_errors):
    hand_labels = get_hand_labels(Path(mat))
    if hand_labels:
        df.loc[len(df.index)] = [Path(mat).stem, hand_labels] 

  0%|                                                                                                                 | 0/42 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5979.80it/s][A
  2%|██▌                                                                                                      | 1/42 [00:00<00:05,  7.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5893.15it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 6051.84it/s][A
  7%|███████▌                                                                                                 | 3/42 [00:00<00:03, 10.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 5987.96it

In [16]:
mat_errors2

[PosixPath('/data/caitlin/mat_files/M239_20170905.mat'),
 PosixPath('/data/caitlin/mat_files/M256_20180124_3rdCx.mat'),
 PosixPath('/data/caitlin/mat_files/M256_20180124_3rdCx_ORIG.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180430to20180502_CNO.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180502to20180504_PSEM.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180507to20180509_PSEM.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180509to20180511_CNO.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180514to20180516_PSEM.mat'),
 PosixPath('/data/caitlin/mat_files/M258_20180516to20180518_CNO.mat'),
 PosixPath('/data/caitlin/mat_files/M260_20180404_06_1stPSEM1X.mat'),
 PosixPath('/data/caitlin/mat_files/M260_20180411_13_1stPSEM3X.mat'),
 PosixPath('/data/caitlin/mat_files/M260_20180424_27_2ndPSEM3X.mat'),
 PosixPath('/data/caitlin/mat_files/M260_20180501_03_1stCNO.mat'),
 PosixPath('/data/caitlin/mat_files/M260_20180503_05_3rdPSEM3X.mat'),
 PosixPath('/data/caitlin/mat_files/

In [17]:
df

Unnamed: 0,mat_path,full_hand_labels
0,M234_20170328,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
1,M234_20170331,"{'M234_20170331_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
2,M234_20170403,"{'M234_20170403_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
3,M234_20170412,"{'M234_20170412_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
4,M234_20170414,"{'M234_20170414_v002': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
85,M326_20201110,"{'M326_20201110_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
86,M328_20201130_2500,"{'M328_20201130_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
87,M336_20210612,"{'M336_20210612_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
88,M336_20210613,"{'M336_20210613_v001': [[0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mat_path', 'full_hand_labels'], dtype='object')]

  df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")


In [19]:
df = pd.read_hdf("/data/caitlin/hand_labels.hdf")

In [20]:
count = 0
for row in df.iterrows():
    count += len(row[1]["full_hand_labels"].keys())
count

136

In [31]:
mat_errors3 = list()

def get_hand_labels(mat_path): 
    hand_labels = dict()
    m = loadmat(mat_errors2[0])

    num_trials = m["data"]["trial"].shape[0]

    for i in tqdm(range(num_trials)):
        mat_trial_index = np.argwhere(m["data"]["trial"].ravel() == (i + 1))
        # Trial not found in JAABA data
        if mat_trial_index.size == 0:
            continue

        try:
            mat_trial_index = mat_trial_index.item()
        except ValueError:
            continue

        behaviors = sorted([b.split('_')[0] for b in m['data'].dtype.names if 'scores' in b])

        if len(behaviors) != 6:
            continue

        all_behaviors = [
            "Lift",
            "Handopen",
            "Grab",
            "Sup",
            "Atmouth",
            "Chew"
        ]

        sorted_behaviors = [b for b in all_behaviors if b in behaviors]

        ethograms = []

        for b in sorted_behaviors:
            behavior_index = m['data'].dtype.names.index(f'{b}_labl_label')
            row = m['data'][mat_trial_index][0][behavior_index]
            row[row == -1] = 0
            ethograms.append(row)

        sorted_behaviors = [b.lower() for b in sorted_behaviors]

        ethogram = np.hstack(ethograms).T

        if check_all_behaviors(ethogram):
            hand_labels[m["data"]["exp"][i][0][0]] = ethogram

    return hand_labels

In [32]:
for mat in tqdm(mat_errors2):
    hand_labels = get_hand_labels(Path(mat))
    if hand_labels:
        df.loc[len(df.index)] = [Path(mat).stem, hand_labels] 

  0%|                                                                                                                 | 0/26 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 4681.14it/s][A
  4%|████                                                                                                     | 1/26 [00:00<00:04,  5.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 4670.04it/s][A
  8%|████████                                                                                                 | 2/26 [00:00<00:03,  6.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 4668.25it/s][A
 12%|████████████                                                                                             | 3/26 [00:00<00:03,  6.69it/

In [33]:
df

Unnamed: 0,mat_path,full_hand_labels
0,M234_20170328,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
1,M234_20170331,"{'M234_20170331_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
2,M234_20170403,"{'M234_20170403_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
3,M234_20170412,"{'M234_20170412_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
4,M234_20170414,"{'M234_20170414_v002': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
85,M326_20201110,"{'M326_20201110_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
86,M328_20201130_2500,"{'M328_20201130_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
87,M336_20210612,"{'M336_20210612_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
88,M336_20210613,"{'M336_20210613_v001': [[0, 0, 0, 0, 0, 0, 0, ..."


In [34]:
mat_errors3

[]

In [35]:
df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mat_path', 'full_hand_labels'], dtype='object')]

  df.to_hdf(path_or_buf="/data/caitlin/hand_labels.hdf", key="df")


In [36]:
df = pd.read_hdf("/data/caitlin/hand_labels.hdf")

In [37]:
count = 0
for row in df.iterrows():
    count += len(row[1]["full_hand_labels"].keys())
count

136