# Data Preprocessing for Illusory Pitch Study

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from glob import glob

def dprime_and_c(hit_rate, fa_rate):
    """Compute the signal-detection measures d' and C from two rates.

    Each rate is converted to a z-score with the inverse CDF of the standard
    normal distribution (``stats.norm.ppf``).

    Parameters
    ----------
    hit_rate : proportion of hits.
    fa_rate : proportion of false alarms.

    Returns
    -------
    (dprime, C) where ``dprime = z(hit) - z(fa)`` and
    ``C = -(z(hit) + z(fa)) / 2``.
    """
    # Transform both proportions to z-scores in one pass
    z_hit, z_fa = (stats.norm.ppf(rate) for rate in (hit_rate, fa_rate))

    # Sensitivity is the distance between the z-scores; the criterion is the
    # negated midpoint of the two
    sensitivity = z_hit - z_fa
    criterion = -(z_hit + z_fa) / 2

    return sensitivity, criterion

### Load data

In [None]:
# Find all data files. glob() returns matches in a filesystem-dependent order, so
# sort them to keep the row order of the combined table (and of everything derived
# from it) reproducible from run to run and machine to machine.
datafiles = sorted(glob('../data/IPAD_*.csv'))

# Load each data file and concatenate them into a single table. ignore_index gives
# every row a unique label instead of repeating each file's own 0..n-1 index.
d = pd.concat((pd.read_csv(f) for f in datafiles), ignore_index=True)

# Select only non-pilot participants (currently disabled)
#d = d[d.code_version == 1]

# Select only main trial events
d = d[d.event == 'trial']

# Recode the pitch shift and key press as booleans: True where the recorded value
# is '+' ("higher"), False otherwise ("lower")
d = d.assign(answer=d['shift'] == '+',
             response=d['response'] == '+')

# Before version 1.1, the experiment always loaded the stimuli for difficulty 1.0,
# so manually overwrite the recorded trial difficulty for those versions
d.loc[d.version < 1.1, 'difficulty'] = 1.0

### Calculate scores within each subject and condition

In [None]:
# Define conditions as (difficulty, interval) pairs. The list is the same for every
# subject, so it is built once here rather than on each pass of the subject loop.
conditions = [(1, 425), (1, 500), (1, 575), (0.5, 425), (0.5, 500), (0.5, 575)]

# Scores will be stored in a long-format table (one row per subject and condition)
scores = pd.DataFrame(columns=['subject', 'experimenter', 'version', 'jnd', 'difficulty', 'interval',
                               'hit_rate', 'fa_rate', 'accuracy', 'perc_resp_low', 'dprime', 'C', 'rt'])

# Calculate scores for each subject
for subj in d.subject.unique():

    # Select all responses from the current subject
    subj_trials = d[d.subject == subj]

    # Subject-level metadata is taken from the subject's last trial row
    last_trial = subj_trials.iloc[-1]
    experimenter = last_trial.experimenter
    exp_version = last_trial.version
    jnd = last_trial.jnd

    # Calculate scores within each condition
    for difficulty, interval in conditions:

        # Select all trials from the current condition
        trials = subj_trials[(subj_trials.difficulty == difficulty) & (subj_trials.interval == interval)]

        # Skip conditions for which this subject has no trials
        if len(trials) == 0:
            continue

        # Create dictionary to store scores from current subject and condition
        condi_scores = dict(subject=subj, experimenter=experimenter, version=exp_version,
                            difficulty=difficulty, interval=interval)

        # Calculate accuracy and the percent of the time the participant responded "lower"
        condi_scores['accuracy'] = np.mean(trials.correct)
        condi_scores['perc_resp_low'] = np.mean(~trials.response)

        # Calculate hit and false alarm rates using Hautus (1995) adjustment to avoid 0s and 1s:
        # add 0.5 to each count and 1 to each denominator. A "hit" is responding
        # "higher" on a "higher" trial; a "false alarm" is responding "higher" on a
        # "lower" trial.
        condi_scores['hit_rate'] = (np.sum(trials.answer & trials.response) + .5) / (np.sum(trials.answer) + 1)
        condi_scores['fa_rate'] = (np.sum(~trials.answer & trials.response) + .5) / (np.sum(~trials.answer) + 1)

        # Calculate d' and C based on the hit rate and false alarm rate
        condi_scores['dprime'], condi_scores['C'] = dprime_and_c(condi_scores['hit_rate'], condi_scores['fa_rate'])

        # Mean response time and the subject's JND
        condi_scores['rt'] = trials.rt.mean()
        condi_scores['jnd'] = jnd

        # Add current scores as a row to the full table of scores
        scores.loc[len(scores.index)] = condi_scores

### Calculate extra scores for exploratory analyses

In [None]:
# Define offset conditions
intervals = [425, 500, 575]

# Scores will be stored in a data frame with one row per subject
exploratory_scores = pd.DataFrame(columns=['subject', 'jnd', 'dprime', 'csize'])

uniq_subjs = d.subject.unique()
for subj in uniq_subjs:

    # Select all responses from the current subject
    subj_trials = d[d.subject == subj]
    jnd = subj_trials.iloc[-1].jnd

    # Calculate person's overall sensitivity (d' pooled over all of the subject's
    # trials), using the same +0.5 / +1 adjustment to keep rates away from 0 and 1
    subj_hit_rate = (np.sum(subj_trials.answer & subj_trials.response) + .5) / (np.sum(subj_trials.answer) + 1)
    subj_fa_rate = (np.sum(~subj_trials.answer & subj_trials.response) + .5) / (np.sum(~subj_trials.answer) + 1)
    dprime, _ = dprime_and_c(subj_hit_rate, subj_fa_rate)

    # Calculate the criterion C separately for each interval
    C = np.full(len(intervals), np.nan)
    for i, interval in enumerate(intervals):
        trials = subj_trials[subj_trials.interval == interval]
        offset_hit_rate = (np.sum(trials.answer & trials.response) + .5) / (np.sum(trials.answer) + 1)
        offset_fa_rate = (np.sum(~trials.answer & trials.response) + .5) / (np.sum(~trials.answer) + 1)
        _, C[i] = dprime_and_c(offset_hit_rate, offset_fa_rate)

    # Overall timing-induced bias size: the standard deviation of C across intervals.
    # Computed once, after every entry of C has been filled in.
    # (Alternative considered: mean absolute pairwise difference between the C values.)
    csize = np.std(C)

    # Add new row of scores to the data frame
    exploratory_scores.loc[len(exploratory_scores.index)] = dict(subject=subj, jnd=jnd, dprime=dprime, csize=csize)

### Save processed scores to a file

In [None]:
# Write each score table to its own CSV file, leaving out the row index
for table, path in ((scores, '../data/scores.csv'),
                    (exploratory_scores, '../data/exploratory_scores.csv')):
    table.to_csv(path, index=False)