In [21]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

****
# Import Datasets
****

In [22]:
path_001 = '../data/Project1 Data export_001.tsv'
path_002 = '../data/Project1 Data export_002.tsv'
path_003 = '../data/Project1 Data export_003.tsv'
path_004 = '../data/Project1 Data export_004.tsv'
path_005 = '../data/Project1 Data export_005.tsv'
path_list = [path_001, path_002, path_003, path_004, path_005]

In [160]:
# Read the tsv file and return the dataframe
def read_tsv(path):
    file = pd.read_csv(path,sep='\t')
    file_date = pd.to_datetime(file['Recording timestamp'], unit='s').dt.strftime('%H:%M:%S.%f').str[:-3]
    return file, file_date

# Get the task range start and end index
def task_range_finder(db, task_str):
    all_task = db.loc[db['Event'].str.contains('Task', na=False)]["Recording timestamp"].to_list()
    # Find timestamps where the specified task starts
    task_range_start = db.loc[db['Event'] == task_str, 'Recording timestamp'].to_list()
    # Compute task end times based on subsequent task start times
    task_range_end = [all_task[j+1] #- (5*1e6)  # 5 seconds before the next task
                        for j in range(len(all_task) - 1)  
                        if all_task[j] in task_range_start]
    # Handle case where the last task has no following task
    if len(task_range_start) > len(task_range_end):
        task_range_end.append(all_task[-1] + (60*1e6))  # 60 seconds after the begining of the last task to be sure to encompass the whole task
    return task_range_start, task_range_end


# Min-Max transform
def min_max_safe_transform(arr):
    if arr.shape[0] == 0:
        # Return empty 1D array if no samples
        return np.array([])
    # Fit a new scaler for each array
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(arr)  # shape => (rows, 1)
    return scaled.ravel() 

# Get the gaze point and mouse position data
def GP_array_list(db, task_start, task_end):
    all_x = []
    all_y = []
    all_mouse_x = []
    all_mouse_y = []

    for i in range(len(task_start)):
        x = db['Gaze point X'].iloc[task_start[i]:task_end[i]].dropna().to_numpy().reshape(-1,1)
        y = db['Gaze point Y'].iloc[task_start[i]:task_end[i]].dropna().to_numpy().reshape(-1,1)
        mouse_x = db['Mouse position X'].iloc[task_start[i]:task_end[i]].dropna().to_numpy().reshape(-1,1)
        mouse_y = db['Mouse position Y'].iloc[task_start[i]:task_end[i]].dropna().to_numpy().reshape(-1,1)
        #print(x)
        # Normalize each array (skip if empty)
        x_scaled = min_max_safe_transform(x)
        y_scaled = min_max_safe_transform(y)
        mouse_x_scaled = min_max_safe_transform(mouse_x)
        mouse_y_scaled = min_max_safe_transform(mouse_y)

        # Append each result to the lists
        all_x.append(x_scaled)
        all_y.append(y_scaled)
        all_mouse_x.append(mouse_x_scaled)
        all_mouse_y.append(mouse_y_scaled)

    # Return lists of arrays
    return all_x, all_y, all_mouse_x, all_mouse_y


def pad_or_truncate_nan(ts, target_length):
    """
    Pads (or truncates) a 1D time-series array 'ts' with NaN 
    to make it exactly 'target_length' long.
    """
    # Create an array of NaNs
    padded = np.full(shape=(target_length,), fill_value=-1)
    length = min(len(ts), target_length)
    padded[:length] = ts[:length]  # copy up to target_length
    return padded

In [131]:
all_files = []
for i in range(len(path_list)):
    file,_ = read_tsv(path_list[i])
    all_files.append(file)


  file = pd.read_csv(path,sep='\t')


In [49]:
all_files[0].columns

Index(['Recording timestamp', 'Computer timestamp', 'Sensor', 'Project name',
       'Export date', 'Participant name', 'Recording name', 'Recording date',
       'Recording date UTC', 'Recording start time',
       'Recording start time UTC', 'Recording duration', 'Timeline name',
       'Recording Fixation filter name', 'Recording software version',
       'Recording resolution height', 'Recording resolution width',
       'Recording monitor latency', 'Average calibration accuracy (mm)',
       'Average calibration precision SD (mm)',
       'Average calibration precision RMS (mm)',
       'Average calibration accuracy (degrees)',
       'Average calibration precision SD (degrees)',
       'Average calibration precision RMS (degrees)',
       'Average calibration accuracy (pixels)',
       'Average calibration precision SD (pixels)',
       'Average calibration precision RMS (pixels)',
       'Average validation accuracy (mm)',
       'Average validation precision SD (mm)',
       'A

In [121]:
# The unit of the timestamp is in microsecond
#Duration of the screen recording in minutes
print(f"Duration of screen recording in min: {(all_files[0].query('Event == "ScreenRecordingEnd"')['Recording timestamp'].values - all_files[0].query('Event == "ScreenRecordingStart"')['Recording timestamp'].values).item()/(1e6 * 60)}")

#Total duration of the recording in minutes (unit of the columns, miliseconds)
print(f"Total duration in min: {all_files[0]["Recording duration"].unique().item()/(1000*60)}")

Duration of screen recording in min: 18.937161516666666
Total duration in min: 20.649416666666667


In [126]:
# The frequency of recording is 120Hz, that's why consecutive timestamps are at least 8.33ms apart
(all_files[0]["Recording timestamp"].diff()/1e6).value_counts()

Recording timestamp
0.008333    63238
0.008334    30950
0.008332     9320
0.008331     2464
0.008335     1573
            ...  
0.002701        1
0.003793        1
0.097263        1
0.000479        1
0.164029        1
Name: count, Length: 8369, dtype: int64

In [137]:
all_files[0]["Event"].value_counts()

Event
MouseEvent                       666
KeyboardEvent                    508
Task 4                             7
Task 1                             6
Task 3                             6
Task 5                             6
Task 6                             6
Task 2                             6
RecordingStart                     1
Eye tracker Calibration start      1
ScreenRecordingStart               1
Eye tracker Calibration end        1
ScreenRecordingEnd                 1
RecordingEnd                       1
Name: count, dtype: int64

In [161]:
start, end = task_range_finder(all_files[0], 'Task 6')
# (np.array(end) - np.array(start))/1e6

In [162]:
start

[384650848, 506481709, 595375103, 633562973, 761980973, 790936973]

In [163]:
end

[384650848, 425124426, 535555769, 633562973, 664384973, 790936973, 827283973]

In [None]:
# Why start and end are not the same length?
# We have to find out why
# Some ends are before the corresponding starts: we have to find out why
