# Data Processing - Experiment 4: IT SynthTone

### Imports and Constants

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm
from glob import glob
from statsmodels.stats.outliers_influence import OLSInfluence

def rescale_ioi(ioi, slowest=1100, fastest=275):
    """
    Calculates location of an IOI on a 0-100 scale. Appears in the manuscript
    as Equation 1.

    The scale is logarithmic in IOI. With the default anchors, 0=1100ms and
    100=275ms, given that a 1100 ms IOI is a rate half as fast as the
    metronome and a 275 ms IOI is a rate twice as fast as the metronome.
    Shorter IOIs (faster tempi) therefore receive higher ratings.

    Parameters
    ----------
    ioi : scalar, numpy array, or pandas Series
        Inter-onset interval(s) in ms. Must support elementwise division.
    slowest : scalar, optional
        IOI in ms that maps to 0 on the scale (default 1100).
    fastest : scalar, optional
        IOI in ms that maps to 100 on the scale (default 275).

    Returns
    -------
    Rating(s) on the 0-100 scale, rounded to 5 decimal places, with the same
    shape as `ioi`.
    """
    rating = 100 * np.log(slowest / ioi) / np.log(slowest / fastest)
    return np.round(rating, 5)

# Where the raw per-participant files live and where the processed data goes
DATA_PATH = '../data/'
SAVEFILE = '../data/response_data.csv'

# Levels of each condition. The IOIs (in ms) are grouped into tempo bins of
# three; the flat list of IOI levels is simply the bins laid end to end.
TEMPO_BINS = [(1000, 918, 843), (774, 710, 652), (599, 550, 504), (463, 425, 390), (358, 329, 302)]
IOI_LEVELS = [ioi for tempo_bin in TEMPO_BINS for ioi in tempo_bin]
PITCH_LEVELS = list(range(2, 8))
LOUDNESS_LEVELS = list(range(3))

# Ground-truth ratings: one per tempo bin (rating of the bin's geometric mean
# IOI) and one per individual IOI level
CORRECT_RATINGS = [rescale_ioi(ss.gmean(tempo_bin)) for tempo_bin in TEMPO_BINS]
CORRECT_RATINGS_FULL = list(map(rescale_ioi, IOI_LEVELS))

### Load Raw Data

Pavlovia saves each person's data to a separate CSV file. Here we use glob to find all the data files. We then read each data file with Pandas, check that it is a complete session (i.e., its final event is "ending"), and append it to a list; the complete sessions are then concatenated into a single data frame containing everyone's data.

In [2]:
# glob returns matches in arbitrary, OS-dependent order; sort them so that the
# row order of the combined (and saved) data is reproducible across machines
datafiles = np.array(sorted(glob(DATA_PATH + 'I*.csv')))

# Collect every complete session, i.e. every file whose final event is 'ending'
sessions = []
for f in datafiles:
    d = pd.read_csv(f)
    # The emptiness check avoids an IndexError on a file with a header but no rows
    if 'event' in d and not d.empty and d.event.iloc[-1] == 'ending':
        sessions.append(d)

# Fail with a readable message instead of pd.concat's "No objects to concatenate"
if not sessions:
    raise ValueError('No complete sessions found matching ' + DATA_PATH + 'I*.csv')

# Stack all sessions; ignore_index gives the combined frame a fresh 0..n-1 index
df = pd.concat(sessions, ignore_index=True)

### Process Main Task

Get data frames containing only tone presentations and responses, respectively. Each trial produces one presentation event and one response event.

In [3]:
# Rows holding tone presentations; the row immediately after each one is
# expected to hold the response for the same trial
pres_rows = df.index[df['event'] == 'tones']
pres = df.iloc[pres_rows].reset_index(drop=True)
resp = df.iloc[pres_rows + 1].reset_index(drop=True)

# Sanity check: every row following a presentation must be a response event
if not (resp['event'] == 'response').all():
    raise ValueError('Non-response event included in response dataframe.')

Next, convert conditions and responses from floats to integers.

In [4]:
# Cast condition and response columns to integers. DataFrame.astype returns a
# new frame with the requested dtypes. Assigning through `.loc[:, col] = ...`
# would write the values into the existing float column in place under
# pandas >= 2.0, leaving the dtype (and the saved CSV) as float.
pres = pres.astype({colname: int for colname in ('pitch', 'ioi', 'loudness')})
resp = resp.astype({colname: int for colname in ('pitch', 'ioi', 'loudness', 'response')})

Add two columns: the tempo bin (1-5) that each trial's IOI belongs to, and the ground truth tempo rating for that IOI.

In [5]:
# Map every IOI level to the 1-based number of the tempo bin that contains it
tempo_map = {
    ioi: bin_number
    for bin_number, bin_iois in enumerate(TEMPO_BINS, start=1)
    for ioi in bin_iois
}

# Label each trial with its tempo bin and with the ground-truth rating of its IOI
pres = pres.assign(
    tempo=[tempo_map[ioi] for ioi in pres['ioi']],
    true_score=rescale_ioi(pres['ioi']),
)

Finally, merge presentation and response data back into one data frame with a single row per trial. This will be easier to analyze than having presentation and response data on separate rows.

In [6]:
# Columns to keep from the presentation and response events
pres_columns = ['subject', 'first_type', 'pitch', 'ioi', 'tempo', 'loudness',
                'type', 'true_score']
resp_columns = ['response', 'rt']
pres = pres[pres_columns]
resp = resp[resp_columns]

# Join the two frames row by row on their shared index, giving one row per trial
data = pres.merge(resp, left_index=True, right_index=True)

# Add a column containing the signed difference between the actual and the
# correct response
data = data.assign(error=data['response'] - data['true_score'])

### Additional Scoring

Initialize arrays for all the new columns we will be adding to the data frame. An asterisk in the comment indicates that the value is identical for all trials within a given subject; otherwise the score will vary within subjects.

In [7]:
# One slot per trial in every array. (*) marks values that are identical for
# all trials of a given subject.
n_rows = len(data)

# Metadata
block = np.zeros(n_rows, dtype=int)  # Block number of the trial
trial = np.zeros(n_rows, dtype=int)  # Trial number within the session

# Headphone test scores
test_correct = np.zeros(n_rows, dtype=int)  # Questions answered correctly (*)
test_incorrect = np.zeros(n_rows, dtype=int)  # Questions answered incorrectly (*)
test_skipped = np.zeros(n_rows, dtype=int)  # Questions skipped (*)

# Performance criteria
extremes = np.zeros(n_rows, dtype=float)  # Number of trials answered with 0, 50, or 100 (*)
corr = np.zeros(n_rows, dtype=float)  # Pearson r between the person's ratings and the ground truth (*)

# Subject-specific IOI-to-rating linear models
slope = np.zeros(n_rows, dtype=float)  # Slope of the model (*)
intercept = np.zeros(n_rows, dtype=float)  # Intercept of the model (*)
resid = np.zeros(n_rows, dtype=float)  # Residual tempo rating on each trial
cooks = np.zeros(n_rows, dtype=float)  # Cook's distance for the response on each trial

In [8]:
# Define block numbers and trial numbers (these will be the same for each participant).
# Blocks are numbered from 0 and trials from 1.
n_blocks, trials_per_block = 6, 30
block_numbers = np.repeat(np.arange(n_blocks), trials_per_block)
trial_numbers = np.arange(1, n_blocks * trials_per_block + 1)

# Identify which rows come from the headphone test
test_tones_mask = df.event == 'headphone_test_tones'
test_response_mask = df.event == 'headphone_test_response'

for subj in np.unique(data.subject):

    # Identify events from current subject
    subj_mask = data.subject == subj
    subj_mask_full = df.subject == subj

    # Every session must contain the full set of trials; fail with a readable
    # message rather than an opaque shape-mismatch error from numpy
    n_subj_trials = subj_mask.sum()
    if n_subj_trials != len(trial_numbers):
        raise ValueError('Subject {} has {} trials; expected {}.'.format(
            subj, n_subj_trials, len(trial_numbers)))

    # Label trials with the blocks they are from
    block[subj_mask] = block_numbers
    trial[subj_mask] = trial_numbers

    # Isolate headphone test presentation and response data
    testpres = df.loc[subj_mask_full & test_tones_mask, :].reset_index()
    testresp = df.loc[subj_mask_full & test_response_mask, :].reset_index()

    # Convert key codes for responses to 1, 2, and 3 (48 is the key code of '0',
    # and a response of 0 is scored as skipped below). Then determine whether
    # 1, 2, or 3 was the correct answer based on the position of 'S' in the
    # stimulus file name.
    # NOTE(review): the offset of 28 presumably depends on the exact stimulus
    # path layout -- confirm if the stimulus paths ever change.
    testresp = testresp.assign(response=np.array(testresp.key_press, dtype=int) - 48,
                               answer=[s.find('S') - 28 for s in testpres.stimulus])

    # Score headphone test trials by comparing responses to the correct answers
    testresp = testresp.assign(correct=testresp.response == testresp.answer,
                               incorrect=(testresp.response != testresp.answer) & (testresp.response > 0),
                               skipped=testresp.response == 0)
    test_correct[subj_mask] = testresp.correct.sum()
    test_incorrect[subj_mask] = testresp.incorrect.sum()
    test_skipped[subj_mask] = testresp.skipped.sum()

    # Count number of times the participant responded 0|50|100
    score = np.sum(np.isin(data.loc[subj_mask, 'response'], (0, 50, 100)))
    extremes[subj_mask] = score

    # Calculate correlation between participant's responses and the ground-truth rating
    score = ss.pearsonr(data.loc[subj_mask, 'true_score'], data.loc[subj_mask, 'response'])[0]
    corr[subj_mask] = score

    # Fit model of how the participant mapped tempo onto the scale:
    # response = intercept + slope * log(IOI). The design matrix is built once
    # and reused for the initial fit, the refit, and the predictions.
    responses = data.loc[subj_mask, 'response']
    design = sm.add_constant(np.log(data.loc[subj_mask, 'ioi']))
    fit = sm.OLS(responses, design).fit()

    # Identify outlier trials based on Cook's distance. cooks_distance returns
    # (distances, p-values); reading it directly avoids summary_frame(), which
    # also computes leave-one-out statistics that are not needed here.
    cooks[subj_mask] = np.asarray(OLSInfluence(fit).cooks_distance[0])

    # Refit model without outliers (trials whose Cook's distance exceeds 4/n)
    keep = cooks[subj_mask] <= 4 / n_subj_trials
    fit = sm.OLS(responses.loc[keep], design.loc[keep]).fit()

    # Parameters are ordered (constant, log-IOI coefficient); convert to an
    # array so they are read by position rather than by integer label
    params = np.asarray(fit.params)
    intercept[subj_mask] = params[0]
    slope[subj_mask] = params[1]

    # Save model fit: residuals of ALL of the subject's trials (outliers
    # included) relative to the outlier-free model
    resid[subj_mask] = (responses - fit.predict(design)).to_numpy()

Add all the new columns to the data frame. This will be our final, processed version of the data.

In [9]:
# Output column name -> per-trial values computed above, in the order the
# columns should appear in the data frame
new_columns = {
    'block': block,
    'trial': trial,
    'test_correct': test_correct,
    'test_incorrect': test_incorrect,
    'test_skipped': test_skipped,
    'extreme_responses': extremes,
    'pearsonr': corr,
    'intercept': intercept,
    'slope': slope,
    'residual': resid,
    'cooks': cooks,
}
for column_name, values in new_columns.items():
    data.loc[:, column_name] = values

### Save Processed Data

Save the cleaned and processed version of the data to a CSV. This is the file we will load to perform analyses.

In [10]:
# Write one row per trial to SAVEFILE, omitting the data frame's row index
data.to_csv(path_or_buf=SAVEFILE, index=False)