# Pure Fixation Count and Fixation Duration Differences

In [5]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import my_stats
import scipy.stats as stats
from statistics import mean, median

### Functions

In [13]:
def calculate_averages(filename, filepath):
    """Summarise one fixation-count CSV.

    The first column is treated as the participant identifier; every
    remaining column is summed as a fixation count (presumably one column
    per token -- confirm against the files).

    Args:
        filename: Name of the CSV file inside ``filepath``.
        filepath: Directory that contains the file.

    Returns:
        A ``(fc_avg, participant_sums)`` tuple. ``fc_avg`` is the grand total
        of all counts divided by the number of rows (participants);
        ``participant_sums`` maps each participant identifier to that
        participant's row total.
    """
    df = pd.read_csv(f"{filepath}/{filename}")
    row_sums = df.iloc[:, 1:].sum(axis=1)
    participant_sums = dict(zip(df.iloc[:, 0], row_sums))

    # Average of the per-participant totals (not a per-column average).
    fc_avg = sum(row_sums) / df.shape[0]

    return fc_avg, participant_sums
    

def calculate_duration_averages(filename, filepath):
    """Average the non-zero fixation durations in one CSV.

    The first column is treated as the participant identifier; every
    remaining column holds a duration value. Cells that are zero or missing
    (NaN) are treated as "no fixation" and are excluded from the
    denominators.

    Args:
        filename: Name of the CSV file inside ``filepath``.
        filepath: Directory that contains the file.

    Returns:
        A ``(total_average, participant_averages)`` tuple.
        ``total_average`` is the sum of all durations divided by the number
        of non-zero, non-missing cells (NaN when there are none).
        ``participant_averages`` maps each participant identifier to the
        same quantity computed over that participant's row, with 0 used for
        rows that have no fixations.
    """
    df = pd.read_csv(f"{filepath}/{filename}")
    durations = df.iloc[:, 1:]

    # NaN compares as "!= 0", so it has to be excluded explicitly; otherwise
    # missing cells inflate the denominators while .sum() silently skips them.
    fixated = durations.notna() & (durations != 0)

    row_sums = durations.sum(axis=1)
    row_fixated = fixated.sum(axis=1)
    # Rows with no fixations divide by zero (inf/NaN) and are reported as 0.
    row_averages = row_sums.div(row_fixated).replace([np.inf, -np.inf, np.nan], 0)
    participant_averages = dict(zip(df.iloc[:, 0], row_averages))

    total_fixated = int(row_fixated.sum())
    if total_fixated:
        total_average = sum(row_sums) / total_fixated
    else:
        # No fixation anywhere in the file: there is no meaningful average.
        total_average = float("nan")

    return total_average, participant_averages


def cohens_d(list1, list2):
    """Return Cohen's d for two samples using the pooled standard deviation.

    NaN entries are ignored consistently: they are excluded from the means,
    from the (ddof=1) variances and from the sample sizes that weight the
    pooled variance.

    Args:
        list1: Numeric values of the first group.
        list2: Numeric values of the second group.

    Returns:
        ``(mean(list1) - mean(list2)) / sqrt(pooled variance)``.
    """
    a = np.asarray(list1, dtype=float)
    b = np.asarray(list2, dtype=float)

    # Sample sizes must match what nanvar/nanmean actually used, so count
    # only the non-NaN observations rather than len().
    n1 = int(np.count_nonzero(~np.isnan(a)))
    n2 = int(np.count_nonzero(~np.isnan(b)))

    s1, s2 = np.nanvar(a, ddof=1), np.nanvar(b, ddof=1)
    pooled_var = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)
    return (np.nanmean(a) - np.nanmean(b)) / np.sqrt(pooled_var)


def decide_t_test(g1_data, g2_data):
    """Choose which independent-samples t-test to run.

    Welch's test is always selected so that t-values from different
    comparisons can be compared with each other. An earlier rule (Student's
    test unless the larger variance was at least 4x the smaller one) is
    disabled, so the inputs are currently not inspected.

    Args:
        g1_data: Values of the first group (unused, kept for the interface).
        g2_data: Values of the second group (unused, kept for the interface).

    Returns:
        ``(test_name, equal_var)``, always ``("Welch's", False)``;
        ``equal_var`` is meant to be passed to ``scipy.stats.ttest_ind``.
    """
    return "Welch's", False


def my_t_test(name1, name2, g1_data, g2_data):
    """Run an independent-samples t-test and collect the results in a dict.

    Args:
        name1: Label of the first group; used in the ``"<name1> avg"`` key.
        name2: Label of the second group; used in the ``"<name2> avg"`` key.
        g1_data: Values of the first group.
        g2_data: Values of the second group.

    Returns:
        A dict with the keys ``'test used'``, ``'<name1> avg'``,
        ``'<name2> avg'``, ``'t-value'``, ``'p-value'`` and ``'effect size'``.
        Each value is a one-element list, or an empty list when either group
        has fewer than two values (a message is printed in that case).
    """
    output = {'test used': [],
              f"{name1} avg": [],
              f"{name2} avg": [],
              't-value': [],
              'p-value': [],
              'effect size': []
              }

    n1, n2 = len(g1_data), len(g2_data)
    if n1 < 2 or n2 < 2:
        # Checked before any statistic is computed: statistics.mean raises on
        # empty input, which previously made this message unreachable.
        print("not enough data points to calculate variance")
        return output

    avg1, avg2 = mean(g1_data), mean(g2_data)
    print("means", avg1, avg2, n1, n2)

    test, equal_var = decide_t_test(g1_data, g2_data)
    t, p = stats.ttest_ind(a=g1_data, b=g2_data, equal_var=equal_var)
    d = cohens_d(g1_data, g2_data)

    output['test used'].append(test)
    output[f"{name1} avg"].append(avg1)
    output[f"{name2} avg"].append(avg2)
    output['t-value'].append(t)
    output['p-value'].append(p)
    output['effect size'].append(d)
    return output


In [7]:
# Participant IDs (as strings) per group. Anyone not listed in `women` is
# counted under the "men" keys below; anyone in neither `experts` nor
# `novices` is left out of the expertise comparison.
experts = ['189', '310', '311', '312', '313', '315', '316', '318', '319']
novices = ['111', '117', '129', '139', '147', '168', '176', '182', '186', '191']
women = ['111', '117', '119', '129', '136', '166', '182', '317', '319']

# Key scheme: 'fc' = fixation count, 'fd' = fixation duration, optional group
# ('exp'/'nov'/'wom'/'men'), then task suffix 'r' (reading) or 'w' (writing).
total_gaze_measures = {
    # total fixation counts and durations split by task
    'fcr':[], 'fcw':[], 'fdr':[], 'fdw':[],
    
    # fixation counts and durations split by expertise
    'fcexpr':[], 'fcexpw':[], 'fcnovr':[], 'fcnovw':[],
    'fdexpr':[], 'fdexpw':[], 'fdnovr':[], 'fdnovw':[],
    
    # fixation counts and durations split by gender
    'fcwomr':[], 'fcwomw':[], 'fcmenr':[], 'fcmenw':[], 
    'fdwomr':[], 'fdwomw':[], 'fdmenr':[], 'fdmenw':[] 
}

fixpath = "abstract_fixation_counts"
durpath = "abstract_fixation_durations"
fc_dir = os.listdir(fixpath)
fd_dir = os.listdir(durpath)

# Task directory name -> one-letter suffix used in the keys above.
task_suffixes = {'reading': 'r', 'writing': 'w'}

for task in fc_dir:
    task_str = task_suffixes.get(task)
    if task_str is None:
        # Any other directory entry has no suffix; skip it instead of
        # failing with a NameError or reusing the previous task's suffix.
        continue

    subfixpath = f"{fixpath}/{task}"
    subdurpath = f"{durpath}/{task}"
    fixfiles = os.listdir(subfixpath)

    fixkey = 'fc'+task_str
    durkey = 'fd'+task_str

    # The duration file is looked up under the same file name as the count file.
    for file in fixfiles:
        file_fc_avg, participant_fc_avg = calculate_averages(file, subfixpath)
        file_fd_avg, participant_fd_avg = calculate_duration_averages(file, subdurpath)

        # One per-file value for the task-level comparison.
        total_gaze_measures[fixkey].append(file_fc_avg)
        total_gaze_measures[durkey].append(file_fd_avg)

        # One value per participant and file for the group comparisons.
        for person, fc_value in participant_fc_avg.items():
            fd_value = participant_fd_avg[person]
            new_person = str(person)

            gender = 'wom' if new_person in women else 'men'
            total_gaze_measures['fc'+gender+task_str].append(fc_value)
            total_gaze_measures['fd'+gender+task_str].append(fd_value)

            if new_person in experts:
                level = 'exp'
            elif new_person in novices:
                level = 'nov'
            else:
                continue
            total_gaze_measures['fc'+level+task_str].append(fc_value)
            total_gaze_measures['fd'+level+task_str].append(fd_value)
            

In [5]:
# def remove_outliers(data):
#     q1, q3 = np.percentile(data, [25, 75])
#     iqr = q3 - q1
#     lower_bound = q1 - (1.5 * iqr)
#     upper_bound = q3 + (1.5 * iqr)

#     return [x for x in data if lower_bound <= x <= upper_bound]


In [6]:
# total_gaze_measures = {k : remove_outliers(v) for k, v in total_gaze_measures.items()}

In [15]:
# # fixation counts and durations split by expertise
# 'fcexpr': [], 'fcexpw': [], 'fcnovr': [], 'fcnovw': [],
# 'fdexpr': [], 'fdexpw': [], 'fdnovr': [], 'fdnovw': [],
# print("Writing")
# print(f"Experts:{mean(total_gaze_measures['fcexpw'])}, Novices: {mean(total_gaze_measures['fcnovw'])}")
# print(f"Experts:{mean(total_gaze_measures['fdexpw'])}, Novices: {mean(total_gaze_measures['fdnovw'])}\n")

# print("Reading")
# print(f"Experts:{mean(total_gaze_measures['fcexpr'])}, Novices: {mean(total_gaze_measures['fcnovr'])}")
# print(f"Experts:{mean(total_gaze_measures['fdexpr'])}, Novices: {mean(total_gaze_measures['fdnovr'])}")

Writing
Experts:81.69302325581396, Novices: 106.60743801652893
Experts:0.08712404977964776, Novices: 0.11991097154834611

Reading
Experts:23.5625, Novices: 55.58967391304348
Experts:0.06269816058109995, Novices: 0.11331992054067863


In [16]:
# Report the mean of the per-file values collected for each task.
for label, measure_key in (
    ("average fixation count for Reading: ", 'fcr'),
    ("average fixation duration for Reading: ", 'fdr'),
    ("average fixation count for Writing: ", 'fcw'),
    ("average fixation duration for Writing: ", 'fdw'),
):
    print(label, mean(total_gaze_measures[measure_key]))


average fixation count for Reading:  38.98649324485426
average fixation duration for Reading:  0.1030644080906216
average fixation count for Writing:  94.92109895656294
average fixation duration for Writing:  0.11432642743150927


In [15]:
# Reading vs. writing: t-test on the per-file average fixation counts.
# NOTE(review): this calls my_stats.my_t_test (defined outside this notebook),
# not the my_t_test defined above -- confirm the two are meant to be the same.
my_stats.my_t_test("reading fc", "writing fc", total_gaze_measures['fcr'], total_gaze_measures['fcw'])

{'test used': ["Welch's"],
 'reading fc avg': [38.98649324485426],
 'writing fc avg': [94.92109895656294],
 't-value': [-9.585027422730311],
 'p-value': [2.4072376021628875e-15],
 'effect size': [-1.6956700178498216]}

In [16]:
# Reading vs. writing: t-test on the per-file average fixation durations.
my_stats.my_t_test("reading fd", "writing fd",total_gaze_measures['fdr'], total_gaze_measures['fdw'])


{'test used': ["Welch's"],
 'reading fd avg': [0.1030644080906216],
 'writing fd avg': [0.11432642743150927],
 't-value': [-1.8500732219633098],
 'p-value': [0.06632950226879404],
 'effect size': [-0.29723804300057555]}

In [17]:
# Experts vs. novices, reading task: per-participant, per-file fixation count totals.
my_t_test("exp reading fc", "nov reading fc", total_gaze_measures['fcexpr'], total_gaze_measures['fcnovr'])



means 23.5625 55.58967391304348 336 368


{'test used': ["Welch's"],
 'exp reading fc avg': [23.5625],
 'nov reading fc avg': [55.58967391304348],
 't-value': [-10.343369916513725],
 'p-value': [2.8112494921281863e-23],
 'effect size': [-0.7660194765594568]}

In [18]:
# Experts vs. novices, reading task: per-participant, per-file average fixation durations.
my_t_test("exp reading fd", "nov reading fd", total_gaze_measures['fdexpr'], total_gaze_measures['fdnovr'])


means 0.06269816058109995 0.11331992054067863 336 368


{'test used': ["Welch's"],
 'exp reading fd avg': [0.06269816058109995],
 'nov reading fd avg': [0.11331992054067863],
 't-value': [-8.197665047763548],
 'p-value': [1.4572617482044437e-15],
 'effect size': [-0.6053449257544024]}

In [19]:
# Experts vs. novices, writing task: per-participant, per-file fixation count totals.
my_t_test("exp writing fc", "nov writing fc",
          total_gaze_measures['fcexpw'], total_gaze_measures['fcnovw'])


means 81.69302325581396 106.60743801652893 215 242


{'test used': ["Welch's"],
 'exp writing fc avg': [81.69302325581396],
 'nov writing fc avg': [106.60743801652893],
 't-value': [-2.9396402171755205],
 'p-value': [0.003461620227716573],
 'effect size': [-0.2772198488149874]}

In [20]:
# Experts vs. novices, writing task: per-participant, per-file average fixation durations.
my_t_test("exp writing fd", "nov writing fd",
          total_gaze_measures['fdexpw'], total_gaze_measures['fdnovw'])


means 0.08712404977964776 0.11991097154834611 215 242


{'test used': ["Welch's"],
 'exp writing fd avg': [0.08712404977964776],
 'nov writing fd avg': [0.11991097154834611],
 't-value': [-6.013665692540119],
 'p-value': [3.740048898072501e-09],
 'effect size': [-0.5619470571248397]}

In [4]:
# Dataset size: total recording time, number of gaze files and number of
# gaze samples across all participants.
participantfiles = "/home/zachkaras/code_summ_data"
people = os.listdir(participantfiles)

time = 0 # time in seconds
datapoints = 0
tasks = 0
for person in people:
    gazedir = f"{participantfiles}/{person}/annotated_gaze"
    gazefiles = os.listdir(gazedir)
    tasks += len(gazefiles)  # one annotated gaze file per task
    for file in gazefiles:
        df = pd.read_csv(f"{gazedir}/{file}")
        datapoints += len(df)
        if df.empty:
            # A file without rows has no first/last timestamp to subtract.
            continue
        stamps = df['system_timestamp']
        # Positional access: first and last recorded sample of the file.
        # Dividing by 10**6 yields seconds, i.e. the timestamps are
        # presumably in microseconds -- confirm against the recorder.
        time += (stamps.iloc[-1] - stamps.iloc[0]) / 10**6
print("time in hours: ", time/3600)
print("java tasks: ", tasks)
print("eye-tracking data points: ", datapoints)


time in hours:  35.67536585277777
java tasks:  1684
eye-tracking data points:  6848501


In [15]:
time/3600 # total recording time in hours (time is accumulated in seconds)

35.67536585277777