### Notebook to go through all .mat files and collect the hand-labeled trials that have 4 or more behaviors labeled

In [1]:
from pathlib import Path
from tqdm import tqdm
import shutil
import os
from scipy.io import loadmat
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
import pandas as pd

In [2]:
# Result table: one row per .mat file.
#   mat_path    - stem of the .mat file name
#   hand_labels - dict of the qualifying trials found in that file
columns = ["mat_path", "hand_labels"]
df = pd.DataFrame(columns=columns)

In [3]:
# Sanity check: the frame has its two columns and no rows yet.
df

Unnamed: 0,mat_path,hand_labels


In [4]:
# Directory that is scanned for the input .mat files.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
mat_dir = Path("/data/caitlin/mat_files/")

In [5]:
def check_all_behaviors(ethogram: np.ndarray, min_behaviors: int = 4) -> bool:
    """Tell whether enough behaviors carry at least one label.

    Parameters
    ----------
    ethogram
        Iterable of rows, one row per behavior. A row counts as "labeled"
        when any of its entries is non-zero.
    min_behaviors
        Minimum number of labeled rows required. Defaults to 4, the
        threshold this notebook has always used.

    Returns
    -------
    bool
        True when at least ``min_behaviors`` rows contain a non-zero entry.
    """
    labeled_rows = sum(1 for row in ethogram if np.any(row))
    return labeled_rows >= min_behaviors

In [6]:
# Only pick up MATLAB files: a bare '*' would also match sub-directories and
# any stray non-.mat file in the folder, and those would then be handed to
# loadmat further down.
mat_paths = sorted(mat_dir.glob('*.mat'))
len(mat_paths)

696

In [7]:
mat_errors = list()
def get_hand_labels(mat_path, struct_key="data"):
    """Collect the hand-labeled ethograms of one JAABA .mat file.

    Parameters
    ----------
    mat_path
        Path of the .mat file, passed straight to ``scipy.io.loadmat``.
    struct_key
        Name of the top-level struct array inside the file. Defaults to
        ``"data"``.

    Returns
    -------
    dict or None
        Maps the experiment name of each qualifying trial (its ``exp``
        field) to that trial's ethogram. A trial qualifies when
        ``check_all_behaviors`` accepts its ethogram. The dict is empty when
        the file does not carry exactly six ``*scores*`` fields or when no
        trial qualifies. ``None`` is returned, and ``mat_path`` is appended
        to the module-level ``mat_errors`` list, when the struct lookup
        raises ``KeyError``.

    Notes
    -----
    The label arrays inside the loaded struct are modified in place
    (``-1`` entries become ``0``); the struct is local to this call.
    """
    hand_labels = dict()

    m = loadmat(mat_path)
    try:
        data = m[struct_key]
        num_trials = data["trial"].shape[0]

    except KeyError:
        mat_errors.append(mat_path)
        return

    field_names = data.dtype.names

    # The set of classifiers is a property of the file, not of a trial, so it
    # is resolved once up front instead of on every loop iteration.
    behaviors = sorted([b.split('_')[0] for b in field_names if 'scores' in b])

    # Files that do not carry exactly six classifiers contribute nothing.
    if len(behaviors) != 6:
        return hand_labels

    # Fixed row order of the ethogram.
    all_behaviors = [
        "Lift",
        "Handopen",
        "Grab",
        "Sup",
        "Atmouth",
        "Chew"
    ]

    sorted_behaviors = [b for b in all_behaviors if b in behaviors]

    trial_numbers = data["trial"].ravel()

    for i in tqdm(range(num_trials)):
        mat_trial_index = np.argwhere(trial_numbers == (i + 1))
        # Trial not found in JAABA data
        if mat_trial_index.size == 0:
            continue

        # .item() raises ValueError when the trial number matches more than
        # one row; such ambiguous trials are skipped.
        try:
            mat_trial_index = mat_trial_index.item()
        except ValueError:
            continue

        ethograms = []

        for b in sorted_behaviors:
            behavior_index = field_names.index(f'{b}_labl_label')
            row = data[mat_trial_index][0][behavior_index]
            # -1 entries are folded into 0 so that only non-zero entries
            # count as labeled.
            row[row == -1] = 0
            ethograms.append(row)

        # presumably each label field is a (frames, 1) column, which makes
        # this (behaviors, frames) -- TODO confirm
        ethogram = np.hstack(ethograms).T

        if check_all_behaviors(ethogram):
            # Look the experiment name up in the same struct row the labels
            # came from. Indexing with the loop counter `i` pairs the
            # ethogram with the wrong name whenever rows are not stored in
            # trial order.
            hand_labels[data["exp"][mat_trial_index][0][0]] = ethogram

    return hand_labels

In [8]:
# Gather every row first and build the frame in one go. Appending with
# `df.loc[len(df.index)] = ...` mutates `df` in place, so re-running the cell
# would add every file a second time.
records = []
for mat in tqdm(mat_paths):
    hand_labels = get_hand_labels(str(mat))
    # Both None (struct not found) and {} (no qualifying trial) are skipped.
    if hand_labels:
        records.append({"mat_path": mat.stem, "hand_labels": hand_labels})

df = pd.DataFrame(records, columns=columns)

  0%|                                                                                                                | 0/696 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5515.37it/s][A
  0%|▏                                                                                                       | 1/696 [00:00<01:27,  7.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 85/85 [00:00<00:00, 5320.89it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 85/85 [00:00<00:00, 5356.47it/s][A
  0%|▍                                                                                                       | 3/696 [00:00<01:08, 10.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5545.09it

In [9]:
# One row per .mat file that yielded at least one qualifying trial.
df

Unnamed: 0,mat_path,hand_labels
0,M232_20170307,"{'M232_20170307_v062': [[0, 0, 0, 0, 0, 0, 0, ..."
1,M232_20170308,"{'M232_20170308_v006': [[0, 0, 0, 0, 0, 0, 0, ..."
2,M232_20170310,"{'M232_20170310_v013': [[0, 0, 0, 0, 0, 0, 0, ..."
3,M234_20170328,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
4,M234_20170329,"{'M234_20170329_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
197,M326_20201110,"{'M326_20201110_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
198,M328_20201130_2500,"{'M328_20201130_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
199,M336_20210612,"{'M336_20210612_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
200,M336_20210613,"{'M336_20210613_v001': [[0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# Files for which get_hand_labels hit a KeyError while looking up the "data" struct.
mat_errors

['/data/caitlin/mat_files/M236_20170817.mat',
 '/data/caitlin/mat_files/M236_20170818.mat',
 '/data/caitlin/mat_files/M238_20170717.mat',
 '/data/caitlin/mat_files/M238_20170719.mat',
 '/data/caitlin/mat_files/M238_20170724.mat',
 '/data/caitlin/mat_files/M238_20170725.mat',
 '/data/caitlin/mat_files/M238_20170726.mat',
 '/data/caitlin/mat_files/M238_20170727.mat',
 '/data/caitlin/mat_files/M240_20170724FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170725FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170727FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170728FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170801FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170802FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170807FinalChecked.mat',
 '/data/caitlin/mat_files/M240_20170808FinalChecked.mat']

In [13]:
# Total number of qualifying trials: every `hand_labels` dict holds one entry
# per trial, so add up the dict sizes over all rows.
count = sum(len(trial_labels) for trial_labels in df["hand_labels"])
count

529

In [14]:
# Persist the table. Both columns are object-typed, so PyTables pickles them
# (that is the PerformanceWarning printed below).
df.to_hdf(path_or_buf="/data/caitlin/4ormore_hand_labels.hdf", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mat_path', 'hand_labels'], dtype='object')]

  df.to_hdf(path_or_buf="/data/caitlin/4ormore_hand_labels.hdf", key="df")


In [15]:
# Read the table back from the file written above; `df` is rebound to the loaded copy.
df = pd.read_hdf("/data/caitlin/4ormore_hand_labels.hdf")

In [16]:
mat_errors2 = list()
def get_hand_labels(mat_path, struct_key="data1"):
    """Collect the hand-labeled ethograms of one JAABA .mat file.

    This definition replaces the earlier ``get_hand_labels`` in the notebook
    namespace and reads the struct named ``"data1"`` by default.

    Parameters
    ----------
    mat_path
        Path of the .mat file, passed straight to ``scipy.io.loadmat``.
    struct_key
        Name of the top-level struct array inside the file. Defaults to
        ``"data1"``.

    Returns
    -------
    dict or None
        Maps the experiment name of each qualifying trial (its ``exp``
        field) to that trial's ethogram. A trial qualifies when
        ``check_all_behaviors`` accepts its ethogram. The dict is empty when
        the file does not carry exactly six ``*scores*`` fields or when no
        trial qualifies. ``None`` is returned, and ``mat_path`` is appended
        to the module-level ``mat_errors2`` list, when the struct lookup
        raises ``KeyError``.

    Notes
    -----
    The label arrays inside the loaded struct are modified in place
    (``-1`` entries become ``0``); the struct is local to this call.
    """
    hand_labels = dict()

    m = loadmat(mat_path)
    try:
        data = m[struct_key]
        num_trials = data["trial"].shape[0]

    except KeyError:
        mat_errors2.append(mat_path)
        return

    field_names = data.dtype.names

    # The set of classifiers is a property of the file, not of a trial, so it
    # is resolved once up front instead of on every loop iteration.
    behaviors = sorted([b.split('_')[0] for b in field_names if 'scores' in b])

    # Files that do not carry exactly six classifiers contribute nothing.
    if len(behaviors) != 6:
        return hand_labels

    # Fixed row order of the ethogram.
    all_behaviors = [
        "Lift",
        "Handopen",
        "Grab",
        "Sup",
        "Atmouth",
        "Chew"
    ]

    sorted_behaviors = [b for b in all_behaviors if b in behaviors]

    trial_numbers = data["trial"].ravel()

    for i in tqdm(range(num_trials)):
        mat_trial_index = np.argwhere(trial_numbers == (i + 1))
        # Trial not found in JAABA data
        if mat_trial_index.size == 0:
            continue

        # .item() raises ValueError when the trial number matches more than
        # one row; such ambiguous trials are skipped.
        try:
            mat_trial_index = mat_trial_index.item()
        except ValueError:
            continue

        ethograms = []

        for b in sorted_behaviors:
            behavior_index = field_names.index(f'{b}_labl_label')
            row = data[mat_trial_index][0][behavior_index]
            # -1 entries are folded into 0 so that only non-zero entries
            # count as labeled.
            row[row == -1] = 0
            ethograms.append(row)

        # presumably each label field is a (frames, 1) column, which makes
        # this (behaviors, frames) -- TODO confirm
        ethogram = np.hstack(ethograms).T

        if check_all_behaviors(ethogram):
            # Look the experiment name up in the same struct row the labels
            # came from. Indexing with the loop counter `i` pairs the
            # ethogram with the wrong name whenever rows are not stored in
            # trial order.
            hand_labels[data["exp"][mat_trial_index][0][0]] = ethogram

    return hand_labels

In [17]:
# Second pass over the files recorded in `mat_errors`, using the
# get_hand_labels defined in the cell directly above.
# NOTE(review): rows are appended to `df` in place, so running this cell more
# than once adds the same files again.
for mat in tqdm(mat_errors):
    hand_labels = get_hand_labels(Path(mat))
    # Both None (struct not found) and {} (no qualifying trial) are skipped.
    if hand_labels:
        df.loc[len(df.index)] = [Path(mat).stem, hand_labels] 

  0%|                                                                                                                 | 0/16 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5504.16it/s][A
  6%|██████▌                                                                                                  | 1/16 [00:00<00:01,  7.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5515.10it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 5529.10it/s][A
 19%|███████████████████▋                                                                                     | 3/16 [00:00<00:01, 10.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 5498.90it

In [18]:
# Files that still raised KeyError in the second pass (none, per the output below).
mat_errors2

[]

In [19]:
# Table after the second pass; the rows appended by the retry loop are at the end.
df

Unnamed: 0,mat_path,hand_labels
0,M232_20170307,"{'M232_20170307_v062': [[0, 0, 0, 0, 0, 0, 0, ..."
1,M232_20170308,"{'M232_20170308_v006': [[0, 0, 0, 0, 0, 0, 0, ..."
2,M232_20170310,"{'M232_20170310_v013': [[0, 0, 0, 0, 0, 0, 0, ..."
3,M234_20170328,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
4,M234_20170329,"{'M234_20170329_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
200,M336_20210613,"{'M336_20210613_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
201,M336_20210619,"{'M336_20210619_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
202,M236_20170817,"{'M236_20170817_v002': [[0, 0, 0, 0, 0, 0, 0, ..."
203,M236_20170818,"{'M236_20170818_v042': [[0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
# Write the extended table to the same file and key as the earlier save.
# Both columns are object-typed, so PyTables pickles them (hence the
# PerformanceWarning printed below).
df.to_hdf(path_or_buf="/data/caitlin/4ormore_hand_labels.hdf", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mat_path', 'hand_labels'], dtype='object')]

  df.to_hdf(path_or_buf="/data/caitlin/4ormore_hand_labels.hdf", key="df")


In [21]:
# Read the saved table back; `df` is rebound to the loaded copy.
df = pd.read_hdf("/data/caitlin/4ormore_hand_labels.hdf")

In [22]:
# Display the table as loaded from the HDF file.
df

Unnamed: 0,mat_path,hand_labels
0,M232_20170307,"{'M232_20170307_v062': [[0, 0, 0, 0, 0, 0, 0, ..."
1,M232_20170308,"{'M232_20170308_v006': [[0, 0, 0, 0, 0, 0, 0, ..."
2,M232_20170310,"{'M232_20170310_v013': [[0, 0, 0, 0, 0, 0, 0, ..."
3,M234_20170328,"{'M234_20170328_v004': [[0, 0, 0, 0, 0, 0, 0, ..."
4,M234_20170329,"{'M234_20170329_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
200,M336_20210613,"{'M336_20210613_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
201,M336_20210619,"{'M336_20210619_v001': [[0, 0, 0, 0, 0, 0, 0, ..."
202,M236_20170817,"{'M236_20170817_v002': [[0, 0, 0, 0, 0, 0, 0, ..."
203,M236_20170818,"{'M236_20170818_v042': [[0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
# Total number of qualifying trials: every `hand_labels` dict holds one entry
# per trial, so add up the dict sizes over all rows.
count = sum(len(trial_labels) for trial_labels in df["hand_labels"])
count

533