next steps: 


- check over lookit reading stuff (using gal's example)

- read in new CSV w/ subject level info 
- edit write to CSV func 
- check over main func 

- see what other parsers he has (trial info)
- build in parser flexibility

- reorganize/clean up/document 

In [1]:
import os
import os.path as op
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from helperfuncs.video_framerates import get_frame_information
from helperfuncs.video_framerates import write_to_json
from helperfuncs.lookit_json_parser import get_lookit_trial_times

In [2]:
# global directory path variables. make these your folder names under MCS
project_dir = '/om3/group/saxelab/LAB_STANDARD_LOOKING_TIME_CODE/looking_time/template_project_dir'

# where are icatcher outputs
icatcher_outputs_dir = op.join(project_dir, 'data/icatcher_outputs')

# where is trial info located? (should be a CSV for now)
lookit_trial_info_path = 'data/lookit_trial_timing_info.csv'
lookit_trial_info_csv = op.join(project_dir, lookit_trial_info_path)

# where are the per-child experiment onset files ('[CHILD_ID]_expt_onset.txt')
expt_onsets_dir = op.join(project_dir, 'data/video_relative_expt_onsets')


def get_onset_file(child_id):
    """
    Returns the path of the experiment-onset file for one child.

    The path depends on the child, so it cannot be a module-level constant:
    no child_id exists when this configuration cell runs (building it here
    raised a NameError and left videos_dir undefined).

    child_id (string): unique child ID associated with subject
    rtype: string
    """
    return op.join(expt_onsets_dir, child_id + "_expt_onset.txt")


# videos path
videos_dir = op.join(project_dir, 'data/videos')

In [3]:
# list all files except those beginning with '.' i.e., hidden files 

def listdir_nohidden(path):
    """
    Lazily yields the names of the entries in `path`, skipping every entry
    whose name starts with '.' (hidden files).

    path (string): directory to list
    rtype: generator of strings
    """
    yield from (entry for entry in os.listdir(path) if entry[:1] != '.')

In [None]:
# create method for getting trial/expt onset times w/r/t video 
# Switch read by get_trial_sets(): a truthy value makes it take trial timing
# from the Lookit trial-info CSV (or the Lookit parser when that CSV does not
# exist yet); a falsy value (the current 0) selects the experiment-onset-file path.

is_using_lookit_times = 0

Runs everything. Saves the data to csv file.

Takes in iCatcher output and adds timestamps to each frame.

In [None]:
def read_convert_output(filename, stamps):
    """
    Given an npz file containing icatcher annotated frames and looks,
    converts to pandas DataFrame with another column mapping each frame
    to its time stamp in the video

    INPUTS:
    filename (string): path of tabulated iCatcher output file in format
    '[CHILD_ID].npz'. Only the first array stored in the archive is read.
    stamps (List[int]): time stamp for each frame, where stamps[i] is the
    time stamp at frame i (determined in function get_frame_information(), IMPORTED function)

    OUTPUTS:
    rtype: DataFrame with one row per frame and columns
        'frame'   - 1-based frame number
        'on_off'  - 'on' if the frame's value is > 0, else 'off'
        'time_ms' - stamps[i] cast to int

    RAISES:
    ValueError if the number of stamps differs from the number of frames
    """
    # the context manager closes the underlying file once the array is read
    with np.load(filename) as npz:
        looks = npz[npz.files[0]]

        # TO DO: DELETE IF UNUSED
        # confidence = npz[npz.files[1]]

    if len(stamps) != len(looks):
        raise ValueError(
            'number of time stamps ({}) does not match number of frames ({}) in {}'.format(
                len(stamps), len(looks), filename))

    df = pd.DataFrame({
        'frame': range(1, len(looks) + 1),
        'on_off': ['on' if look > 0 else 'off' for look in looks],
    })

    # attach each frame's time stamp as an integer
    df['time_ms'] = stamps
    df['time_ms'] = df['time_ms'].astype(int)

    return df

This function gets trial data from a Datavyu file. Will eventually add parsers here to get trial info from other places.
Need 1) the start of the experiment relative to the video
     2) the start/end of the trials relative to the experiment

## plan for getting trial/expt onset times: 
 
### experiment onset relative to video: 

option 1) file that says in ms what time expt starts with respect to video 
e.g. if the experiment starts at video timestamp 423 ms, then there should be a file called [CHILD_ID]_expt_onset.txt (in data/video_relative_expt_onsets) that contains the value 423. Default for recoding videos (where trial times are already defined w/r/t the video using manual coding of trial onsets): set to 0.

option 2) lookit parser 

option 3) alternative parser (not yet built -- are there any??) 


In [None]:
def get_trial_sets(child_id):
    """
    Builds the list of trial [onset, offset] times, in milliseconds relative
    to the start of the video, for one child.

    If the module-level flag is_using_lookit_times is truthy, trial timing is
    read from lookit_trial_info_csv (created with get_lookit_trial_times() and
    saved when the file does not exist yet) and filtered to child_id. That
    table is expected to already carry 'child_id', 'relative_onset' and
    'relative_offset' columns -- TODO confirm against the Lookit parser.

    Otherwise the trial table has to come from another parser (not built
    yet); its 'onset'/'offset' columns are shifted by the experiment onset
    read from the child's '[CHILD_ID]_expt_onset.txt' file.

    child_id (string): unique child ID associated with subject
    rtype: (List[List[int]], DataFrame) -- the unique [onset, offset] pairs in
        first-seen order, and the trial table they were taken from
    raises: NotImplementedError when is_using_lookit_times is falsy, because
        no non-Lookit trial parser exists yet
    """
    if is_using_lookit_times:
        if Path(lookit_trial_info_csv).is_file():
            df = pd.read_csv(lookit_trial_info_csv)
        else:
            df = get_lookit_trial_times(icatcher_outputs_dir)
            df.to_csv(lookit_trial_info_csv)

        # SOME OF THIS STUFF MIGHT COME AFTER THE MAIN ELSE -- FOR OTHER PARSERS TOO

        # get part of df from current child
        df = df[df['child_id'] == child_id]
    else:
        # get trial onsets (relative to the start of the experiment)
        df = _load_parsed_trial_table(child_id)

        # get experiment onset
        expt_onset = _read_expt_onset(child_id)

        # add difference to create relative onsets/offsets
        # note: relative means, trial onsets/offsets relative to the start of video
        # also note: here we add under assumption that video starts BEFORE experiment starts
        # if your video starts AFTER experiment starts, set as negative value in file
        df['relative_onset'] = df['onset'] + expt_onset
        df['relative_offset'] = df['offset'] + expt_onset

    # there's two different file formats -- updated as needed
    # WHAT IS THIS ?? ASK GAL, MAYBE REMOVE THIS STEP
    df_sets = (
        df[['relative_onset', 'relative_offset']]
        .rename(columns={"relative_onset": "onset", "relative_offset": "offset"})
        .dropna()
    )

    pairs = [(int(onset), int(offset))
             for onset, offset in zip(df_sets['onset'], df_sets['offset'])]

    # dict.fromkeys drops repeated pairs while keeping first-seen order;
    # pairs are handed back as lists, the format assign_trial is documented to take
    trial_sets = [list(pair) for pair in dict.fromkeys(pairs)]

    return trial_sets, df


def _read_expt_onset(child_id):
    """Returns the integer stored in the child's '[CHILD_ID]_expt_onset.txt' file."""
    # built per child here: the path cannot be a module-level constant
    # because it depends on child_id
    onset_path = op.join(project_dir, 'data/video_relative_expt_onsets',
                         child_id + "_expt_onset.txt")
    with open(onset_path) as f:
        return int(f.read())


def _load_parsed_trial_table(child_id):
    """Placeholder for the non-Lookit trial parsers ('USE OTHER PARSERS HERE')."""
    # must eventually return a DataFrame with 'onset' and 'offset' columns
    raise NotImplementedError(
        'no non-Lookit trial parser is implemented yet; cannot build the '
        'trial table for {}'.format(child_id))

Makes a trial column and logs the trial that each frame is in.

In [None]:
def assign_trial(df, trial_sets):
    """
    Given trial onsets and offsets, makes a 'trial' column in df indicating
    which trial each frame belongs in, or 0 if no trial

    Trials are numbered from 1 in the order they appear in trial_sets; both
    onset and offset are inclusive. If trials overlap, a frame gets the
    number of the first trial in trial_sets that contains it.

    df (DataFrame): pandas Dataframe with time information in a 'time_ms'
        column; modified in place
    trial_sets (List[List[int]]): list of trial [onset, offset] pairs in ms
        (lists or tuples)
    rtype: None
    """
    # ASK GAL - WHAT IS THIS FOR
    # is this supposed to assign trial to each look
    # i think we don't want this -- we assign each look a trial# in other script

    times = df['time_ms'].to_numpy()
    trial = np.zeros(len(df), dtype=np.int64)

    # walk the trials backwards so that, where trials overlap, the earliest
    # trial is written last and therefore wins
    for number in range(len(trial_sets), 0, -1):
        start, end = trial_sets[number - 1]
        trial[(times >= start) & (times <= end)] = number

    df['trial'] = trial

Saves a csv with subject, trial, and look level information.

In [None]:
def write_to_csv(data_filename, child_id, icatcher_data, session, trial_type, stim_type, icatcher):
    """
    Writes one row per trial for child_id to the output csv. If the output
    file does not exist it is created; if it exists and already contains
    rows for child_id nothing is written; otherwise the new rows are appended.

    data_filename (string): path of the output csv
    child_id (string): unique child ID associated with subject
    icatcher_data (List[List[int]]): list of [on times, off times] per trial
                calculated from iCatcher
    session (string): the experiment session the participant was placed in
    trial_type (Series or list): trial type of each trial, in trial order
    stim_type (Series or list): stimulus type of each trial, in trial order
    icatcher (DataFrame): frame-level iCatcher data with 'on_off' and 'trial'
                columns. If it also has a 'confidence' column, the mean
                confidence over the 'on' frames of each trial is written
                (NaN for a trial without 'on' frames); if not, no
                'confidence' column is produced.
    rtype: None
    """
    def plain(values):
        # a Series keeps the row labels of the table it was cut from, which
        # would otherwise leak into the index of the output rows
        return values.tolist() if isinstance(values, pd.Series) else values

    num_trials = len(icatcher_data)
    trial_nums = list(range(1, num_trials + 1))

    data = {
        'child': [child_id] * num_trials,  # * subject level info
        'session': [session] * num_trials,  # * subject level info
        'trial_num': trial_nums,  # * Trials.ordinal
        'trial_type': plain(trial_type),  # * Trials.x
        'stim_type': plain(stim_type),  # * Trial level info
    }

    # * no confidence: the frame table may come without this column
    if 'confidence' in icatcher.columns:
        on_frames = icatcher[(icatcher['on_off'] == 'on') & (icatcher['trial'] != 0)]
        mean_confidence = on_frames.groupby('trial')['confidence'].mean()
        # reindex so there is exactly one value per trial, even for a single
        # trial or for trials that have no 'on' frames
        data['confidence'] = mean_confidence.reindex(trial_nums).tolist()

    data['iCatcher_on(s)'] = [trial[0] for trial in icatcher_data]  # * don't want this
    data['iCatcher_off(s)'] = [trial[1] for trial in icatcher_data]  # * don't want this

    df = pd.DataFrame(data)

    if not Path(data_filename).is_file():
        df.to_csv(data_filename)
        return

    output_df = pd.read_csv(data_filename, index_col=0)

    # compare as strings: read_csv turns purely numeric IDs into numbers
    known_ids = set(output_df['child'].astype(str))
    if str(child_id) not in known_ids:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        output_df = pd.concat([output_df, df], ignore_index=True)
        output_df.to_csv(data_filename)
In [4]:
def run_analyze_output(data_filename="BBB_output.csv", session=None):
    """
    Processes every (non-hidden) file in icatcher_outputs_dir: looks up the
    time stamp of each video frame, loads the iCatcher output, assigns each
    frame to a trial, computes looking times per trial and writes them to
    data_filename. Children already present in data_filename are skipped, as
    are children whose video yields no time stamps or whose trial count does
    not match the trial info.

    data_filename (string): name of file you want the data to be written
            to. Must have .csv ending.
    session (string): ID of the experiment session. If session is not
            specified, looks for videos only within videos_dir, otherwise
            searches within [videos_dir]/session[session]
    rtype: None
    """
    for filename in listdir_nohidden(icatcher_outputs_dir):
        child_id = filename.split('.')[0]

        # skip if child data already added
        if Path(data_filename).is_file():
            output_df = pd.read_csv(data_filename, index_col=0)
            if child_id in output_df['child'].unique():
                print(child_id + ' already processed')
                continue

        vid_dir = videos_dir + '/'
        if session:
            vid_dir += "session" + session + '/'
        vid_path = vid_dir + child_id + ".mp4"
        # the json path is built from the video's folder, not from the finished
        # .mp4 path (which produced '<child>.mp4<child>.json')
        json_video_data = vid_dir + child_id + '.json'

        # get timestamp for each frame in the video
        print('getting frame information for {}...'.format(vid_path))
        timestamps, length = get_frame_information(vid_path, json_video_data)
        if not timestamps:
            print('video not found for {} in {} folder'.format(child_id, videos_dir))
            continue

        # initialize df with time stamps for iCatcher file
        icatcher_path = icatcher_outputs_dir + '/' + filename
        icatcher = read_convert_output(icatcher_path, timestamps)

        # get trial onsets and offsets from input file, match to iCatcher file
        trial_sets, df = get_trial_sets(child_id)
        assign_trial(icatcher, trial_sets)

        # sum on looks and off looks for each trial
        # NOTE(review): get_on_off_times is neither defined nor imported in the
        # visible part of this file -- confirm where it comes from
        icatcher_times = get_on_off_times(icatcher)
        # datavyu_times = get_output_times(output_file)

        # check whether the number of trials found in the iCatcher frames is
        # the same as the number of rows in the trial info for this child
        if icatcher['trial'].max() != len(df):
            print('mismatch in # of trials between icatcher and session info: {} in {} folder'.format(child_id, videos_dir))
            continue

        write_to_csv(data_filename, child_id, icatcher_times, session, df['fam_or_test'], df['scene'], icatcher)

        # comparison with Datavyu looking times is currently disabled:
        # icatcher_arr, datavyu_arr = np.array(icatcher_times).flatten(), np.array(datavyu_times).flatten()
        # stat, p = pearsonr(icatcher_arr, datavyu_arr)
        # print('Datavyu total on-off looks per trial: \n', datavyu_times)
        # print('iCatcher total on-off looks per trial: \n', icatcher_times)
        # print('Pearson R coefficient: {} \np-value: {}'.format(round(stat, 3), round(p, 3)))


In [None]:
# Entry point: when executed as a script (or as the last notebook cell), run
# the whole pipeline with the default output file name and no session.
if __name__ == "__main__":
    run_analyze_output()