# Data Processing - Experiment 3: IT LongTone

### Imports and Constants

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm
from glob import glob
from statsmodels.stats.outliers_influence import OLSInfluence

def rescale_ioi(ioi):
    """
    Map an inter-onset interval (IOI, in ms) onto the 0-100 tempo scale.
    Appears in the manuscript as Equation 1.

    The scale is logarithmic and anchored at two points: an IOI of 1100 ms
    (half the metronome rate) maps to 0 and an IOI of 275 ms (twice the
    metronome rate) maps to 100. Works on scalars as well as NumPy arrays and
    pandas Series; the result is rounded to 5 decimal places.
    """
    slow_anchor, fast_anchor = 1100, 275
    scaled = 100 * np.log(slow_anchor / ioi) / np.log(slow_anchor / fast_anchor)
    return np.round(scaled, 5)

# Location of the raw session files and of the processed output files
DATA_PATH = '../data/'
SAVEFILE = DATA_PATH + 'response_data.csv'
TAP_SAVEFILE = DATA_PATH + 'tap_data.csv'

# Levels of each condition. The 15 IOIs are listed from slowest to fastest,
# and every run of three consecutive IOIs forms one tempo bin.
IOI_LEVELS = [1000, 918, 843, 774, 710, 652, 599, 550, 504, 463, 425, 390, 358, 329, 302]
TEMPO_BINS = [tuple(IOI_LEVELS[start:start + 3])
              for start in range(0, len(IOI_LEVELS), 3)]
PITCH_LEVELS = list(range(2, 8))
LOUDNESS_LEVELS = list(range(3))

# Ground-truth ratings: one per tempo bin (rescaled geometric mean of the
# bin's IOIs) and one per individual IOI
CORRECT_RATINGS = [rescale_ioi(ss.gmean(bin_iois)) for bin_iois in TEMPO_BINS]
CORRECT_RATINGS_FULL = [rescale_ioi(level) for level in IOI_LEVELS]

### Load Raw Data

Pavlovia saves each person's data to a separate CSV file. Here we use glob to find all the data files. We then read each data file with Pandas, check that it is a complete session (i.e., its final event is "ending"), and collect it in a list. Finally, the complete sessions are concatenated into a single data frame containing everyone's data.

In [2]:
# glob() returns paths in arbitrary (filesystem-dependent) order, so sort them
# to make the row order of the combined data frame reproducible across machines
datafiles = np.array(sorted(glob(DATA_PATH + 'I*.csv')))

# Read every session file, keeping only complete sessions
sessions = []
for path in datafiles:
    session = pd.read_csv(path)
    # A session is complete when the last event it logged is 'ending'. The
    # length check keeps a zero-row file from raising IndexError on iloc[-1].
    if 'event' in session and len(session) > 0 and session.event.iloc[-1] == 'ending':
        sessions.append(session)

# Fail with an explicit message rather than pd.concat's generic
# "No objects to concatenate" (same exception type, so callers are unaffected)
if not sessions:
    raise ValueError('No complete session files matching "I*.csv" found in ' + DATA_PATH)

# Stack all sessions into one frame with a fresh 0..N-1 index; later cells
# rely on this positional index to pair each event with the row that follows it
df = pd.concat(sessions, ignore_index=True)

### Process Spontaneous Motor Tempo Task

Separate out the spontaneous motor tempo task from each participant into a new tap_data data frame with one row per subject.

In [3]:
# Keep only the spontaneous motor tempo ('tapping_test') events. The `rt`
# column is renamed to `tap_times` for these rows.
is_tapping_test = df['event'] == 'tapping_test'
tap_data = df[is_tapping_test].rename(columns={'rt': 'tap_times'})

Convert the lists of key presses and tap times from strings into lists of integers.

In [4]:
# Key presses and tap times are stored as comma-separated strings (e.g.
# "70,74,70"). Parse each string into a list of ints; cells that are not
# strings (such as missing values) are passed through unchanged and are dealt
# with in the next step. isinstance() is used instead of an exact type
# comparison, matching the checks used elsewhere in this notebook.
for column in ('key_press', 'tap_times'):
    tap_data[column] = [
        [int(value) for value in cell.split(',')] if isinstance(cell, str) else cell
        for cell in tap_data[column]
    ]

Remove invalid key presses from the spontaneous motor tempo tapping data, keeping only taps whose key code is 70 or 74 (the F and J keys).

In [5]:
# Key codes that count as valid taps (70 = F, 74 = J)
VALID_KEY_CODES = [70, 74]

normalized_key_presses = []
normalized_tap_times = []
valid_key_presses = []
valid_tap_times = []
for keys, times in zip(tap_data['key_press'], tap_data['tap_times']):
    # Cells that were not comma-separated strings arrive here as a bare float:
    # NaN or a single numeric key code. Wrap them in one-element lists so that
    # every cell has the same list form.
    if isinstance(keys, float):
        if np.isnan(keys):
            keys, times = [np.nan], [np.nan]
        else:
            keys, times = [int(keys)], [int(times)]
    normalized_key_presses.append(keys)
    normalized_tap_times.append(times)

    # Keep only the taps made with a valid key, together with their times.
    # np.isin replaces the deprecated np.in1d (identical result for 1-D input).
    fj_mask = np.isin(keys, VALID_KEY_CODES)
    valid_key_presses.append(np.array(keys)[fj_mask])
    valid_tap_times.append(np.array(times)[fj_mask])

# Write whole columns at once instead of assigning nested lists cell-by-cell
# through .loc, whose handling of list values depends on the column dtype
tap_data['key_press'] = normalized_key_presses
tap_data['tap_times'] = normalized_tap_times
tap_data['valid_key_press'] = valid_key_presses
tap_data['valid_tap_times'] = valid_tap_times

Calculate each participant's median inter-tap interval (NaN if they made 3 or fewer valid taps).

In [6]:
# Median interval between consecutive valid taps for every row of tap_data.
# Rows with 3 or fewer valid taps get NaN.
itis = []
for times in tap_data['valid_tap_times']:
    if len(times) > 3:
        itis.append(np.median(np.diff(times)))
    else:
        itis.append(np.nan)
tap_data['pref_rate'] = itis

### Process Main Task

Get data frames containing only tone presentations and responses, respectively. Each trial produces one presentation event and one response event.

In [7]:
# Rows holding a tone presentation; the row directly after each of them is
# expected to be the matching response
pres_rows = df.index[df['event'] == 'tones']
pres = df.iloc[pres_rows].reset_index(drop=True)
resp = df.iloc[pres_rows + 1].reset_index(drop=True)

# Sanity check: every row that follows a presentation must be a response
if (resp['event'] != 'response').any():
    raise ValueError('Non-response event included in response dataframe.')

Next, read the stimulus file names to determine the pitch, IOI, and loudness of each trial.

In [8]:
# The trial conditions are read from fixed character positions of the stimulus
# file name: pitch is the character at index 17, the IOI runs from index 19 up
# to 6 characters before the end, and loudness is the 5th character from the end
stimuli = list(pres['stimulus'])
pres = pres.assign(
    pitch=[int(name[17]) for name in stimuli],
    ioi=[int(name[19:-6]) for name in stimuli],
    loudness=[int(name[-5]) for name in stimuli],
)

Add a column containing ground truth tempo ratings.

In [9]:
# Map every IOI to the 1-based number of the tempo bin that contains it
tempo_map = {ioi: bin_number
             for bin_number, bin_iois in enumerate(TEMPO_BINS, start=1)
             for ioi in bin_iois}

# Label each trial with its tempo bin and its ground-truth rating
pres = pres.assign(tempo=[tempo_map[ioi] for ioi in pres['ioi']],
                   true_score=rescale_ioi(pres['ioi']))

Finally, merge presentation and response data back into one data frame with a single row per trial. This will be easier to analyze than having presentation and response data on separate rows.

In [10]:
# Select columns of interest from presentation and response events. The
# presentation rows' `rt` column is renamed to `tap_times` so that it does not
# collide with the response rows' `rt`. Chaining .rename() (instead of renaming
# in place) produces an independent frame, so the column assignments below do
# not write into a slice of the original frame (no SettingWithCopyWarning).
pres = pres[['subject', 'pitch', 'ioi', 'tempo', 'loudness',
             'tap_condition', 'key_press', 'rt', 'true_score']].rename(columns={'rt': 'tap_times'})
resp = resp[['response', 'rt']]

# Convert tapping information from comma-separated strings to lists of ints;
# cells that are not strings (such as missing values) are left unchanged
for column in ('key_press', 'tap_times'):
    pres[column] = [
        [int(value) for value in cell.split(',')] if isinstance(cell, str) else cell
        for cell in pres[column]
    ]

# Merge presentation and response data on the row index, giving one row per trial
data = pd.merge(pres, resp, left_index=True, right_index=True)

# Add column containing the difference between the actual and the correct response
data = data.assign(error=data.response - data.true_score)

### Additional Scoring

Initialize arrays for all the new columns we will be adding to the data frame. An asterisk in the comment indicates that the value is identical for all trials within a given subject; otherwise the score will vary within subjects. Scores that only have one value per participant will also be included in the tap_data dataset (which only includes one row per subject).

In [11]:
# Arrays without a suffix have one entry per trial (row of `data`); arrays
# ending in "2" have one entry per row of `tap_data`
n_trials = len(data)
n_tap_rows = len(tap_data)

# Metadata
block, trial = (np.zeros(n_trials, dtype=int) for _ in range(2))

# Headphone test scores (*)
test_correct, test_incorrect, test_skipped = (
    np.zeros(n_trials, dtype=int) for _ in range(3))
test_correct2, test_incorrect2, test_skipped2 = (
    np.zeros(n_tap_rows, dtype=int) for _ in range(3))

# (*) Number of trials answered with 0, 50, or 100; Pearson r between ratings
# and the ground truth; intercept and slope of the subject-specific
# IOI-to-rating linear model
extremes, corr, intercept, slope = (
    np.zeros(n_trials, dtype=float) for _ in range(4))
extremes2, corr2, intercept2, slope2 = (
    np.zeros(n_tap_rows, dtype=float) for _ in range(4))

# Residual tempo rating and Cook's distance for the response on each trial
resid, cooks = (np.zeros(n_trials, dtype=float) for _ in range(2))

Perform a variety of data processing for each participant. Exclusion-related scoring includes marking the headphone test, counting how many times the participant gave an extreme response (0|50|100), and calculating the correlation between their responses and the actual tempo. We then fit the subject-specific models relating IOIs to raw ratings (Equation 3 in the manuscript) and calculate residual tempo ratings for all trials (Equation 4 in the manuscript).

In [12]:
# Define block numbers and trial numbers (these will be the same for each
# participant): 3 blocks (numbered 0-2) of 30 trials, trials numbered 1-90.
# Assigning these below raises if a participant does not have exactly 90 trials.
block_numbers = np.repeat(np.arange(3), 30)
trial_numbers = np.arange(1, 91)

# IOI of the metronome; the subject-specific model regresses ratings on
# log2(METRONOME_IOI / IOI)
METRONOME_IOI = 550


def _tempo_design(ioi):
    """Return the design matrix (constant + log2(METRONOME_IOI / ioi)) for the rating model."""
    return sm.add_constant(np.log2(METRONOME_IOI / ioi))


# Identify which rows come from the headphone test
test_tones_mask = df.event == 'headphone_test_tones'
test_response_mask = df.event == 'headphone_test_response'

# Calculate performance metrics and regression model for each participant
for subj in np.unique(data.subject):

    # Identify events from current subject in each of the three data frames
    subj_mask = data.subject == subj
    subj_mask_full = df.subject == subj
    subj_mask_tapdata = tap_data.subject == subj

    # Label trials with the blocks they are from
    block[subj_mask] = block_numbers
    trial[subj_mask] = trial_numbers

    # Isolate headphone test presentation and response data
    testpres = df.loc[subj_mask_full & test_tones_mask, :].reset_index()
    testresp = df.loc[subj_mask_full & test_response_mask, :].reset_index()

    # Convert key codes for responses to 1, 2, and 3 (key code minus 48). Then
    # determine whether 1, 2, or 3 was the correct answer based on the position
    # of 'S' in the stimulus file name
    testresp = testresp.assign(response=np.array(testresp.key_press, dtype=int) - 48,
                               answer=[s.find('S') - 28 for s in testpres.stimulus])

    # Score headphone test trials by comparing responses to the correct
    # answers; a response of 0 counts as skipped rather than incorrect
    testresp = testresp.assign(correct=testresp.response == testresp.answer,
                               incorrect=(testresp.response != testresp.answer) & (testresp.response > 0),
                               skipped=testresp.response == 0)
    n_correct = testresp.correct.sum()
    n_incorrect = testresp.incorrect.sum()
    n_skipped = testresp.skipped.sum()
    test_correct[subj_mask] = n_correct
    test_correct2[subj_mask_tapdata] = n_correct
    test_incorrect[subj_mask] = n_incorrect
    test_incorrect2[subj_mask_tapdata] = n_incorrect
    test_skipped[subj_mask] = n_skipped
    test_skipped2[subj_mask_tapdata] = n_skipped

    # Count number of times the participant responded 0|50|100
    score = np.sum(np.isin(data.loc[subj_mask, 'response'], (0, 50, 100)))
    extremes[subj_mask] = score
    extremes2[subj_mask_tapdata] = score

    # Calculate correlation between participant's responses and true relative tempo
    score = ss.pearsonr(data.loc[subj_mask, 'true_score'], data.loc[subj_mask, 'response'])[0]
    corr[subj_mask] = score
    corr2[subj_mask_tapdata] = score

    # Fit model of how the participant mapped tempo onto the scale
    fit = sm.OLS(data.loc[subj_mask, 'response'],
                 _tempo_design(data.loc[subj_mask, 'ioi'])).fit()

    # Identify outlier trials based on Cook's distance
    cooks[subj_mask] = OLSInfluence(fit).summary_frame().cooks_d

    # Refit model without outliers (Cook's distance above 4 / number of trials)
    refit_mask = subj_mask & (cooks <= 4 / subj_mask.sum())
    fit = sm.OLS(data.loc[refit_mask, 'response'],
                 _tempo_design(data.loc[refit_mask, 'ioi'])).fit()

    # fit.params is a label-indexed Series (constant first, slope second), so
    # read it by position with .iloc; plain integer keys on a labelled Series
    # are deprecated in recent pandas
    subj_intercept = fit.params.iloc[0]
    subj_slope = fit.params.iloc[1]
    intercept[subj_mask] = subj_intercept
    intercept2[subj_mask_tapdata] = subj_intercept
    slope[subj_mask] = subj_slope
    slope2[subj_mask_tapdata] = subj_slope

    # Calculate residual tempo ratings for ALL of the participant's trials
    # (outliers included) relative to the refitted model
    resid[subj_mask] = data.loc[subj_mask, 'response'] - \
        fit.predict(_tempo_design(data.loc[subj_mask, 'ioi']))

# Mark trials as tapping type NTI (0), TI-NT (1), or TI-YT (2) based on whether the participant tapped to the repeating tone
data.loc[:, 'tapped'] = np.array([isinstance(x, (str, list)) for x in data.key_press])
data.loc[:, 'tap_type'] = data.tap_condition.astype(int) + (data.tap_condition & data.tapped).astype(int)

Add all the new columns to the data frames. These will be our final, processed versions of the data.

In [13]:
# Trial-level scores, in the order the columns are added to `data`
trial_level_columns = {
    'block': block,
    'trial': trial,
    'test_correct': test_correct,
    'test_incorrect': test_incorrect,
    'test_skipped': test_skipped,
    'extreme_responses': extremes,
    'pearsonr': corr,
    'intercept': intercept,
    'slope': slope,
    'residual': resid,
    'cooks': cooks,
}
for column, values in trial_level_columns.items():
    data.loc[:, column] = values

# Scores with a single value per participant are also added to `tap_data`
tap_level_columns = {
    'test_correct': test_correct2,
    'test_incorrect': test_incorrect2,
    'test_skipped': test_skipped2,
    'extreme_responses': extremes2,
    'pearsonr': corr2,
    'intercept': intercept2,
    'slope': slope2,
}
for column, values in tap_level_columns.items():
    tap_data.loc[:, column] = values

### Save Processed Data

Save the cleaned and processed version of the data to a CSV. These are the files we will load to perform analyses.

In [14]:
# Write both processed data frames to CSV, leaving out the row index
for frame, destination in ((data, SAVEFILE), (tap_data, TAP_SAVEFILE)):
    frame.to_csv(destination, index=False)