# BerryPy

## If you have a standard PsychoPy output file (e.g., from the Builder), you can ignore the code in the notebook

In [16]:
%matplotlib inline

In [17]:
import shutil
import numpy as np
import pandas as pd
import glob
import os
import errno
import re
import matplotlib.pyplot as plt
import seaborn as sns
import sys

In [18]:
# The settings are kept in a .txt file; copy it to a .py file so that it can be imported.
shutil.copy("config_general_settings.txt", "config_general_settings.py")

# Drop any cached copy of the module first: this forces a fresh import of the
# file that was just copied instead of reusing an already loaded module.
sys.modules.pop("config_general_settings", None)
from config_general_settings import *

# source: https://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
    """Create directory `path` including missing parents (like ``mkdir -p``).

    An already existing directory is not an error. Any other OSError
    (e.g. `path` exists but is not a directory) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        alreadyThere = err.errno == errno.EEXIST and os.path.isdir(path)
        if not alreadyThere:
            raise

# directories used by the notebook: data files are read from "input",
# results, report and plots are written below "output"
dirs = ["input", "output", "output/images"]

# the loop variable is deliberately not called "dir": at notebook level that
# would replace the builtin dir() for all following cells
for dirName in dirs:
    mkdir_p(dirName)

In [19]:
# if your output files are not standard PsychoPy output files, you might want to define some extra things
# in particular, this cell allows you to modify how the participant code is identified and how the input file is read by pandas 

# if your data file does not have a column that identifies the participant, you can define here how to get the participant code from the file name
# example: myCode_2016_03_21.csv -> code below returns "myCode"
#def getSubjCode(resFileName):
#    subjCode = resFileName.split("_")[0]
#    return subjCode

# specify extension of data files (.csv, .txt)
# only files in the input directory that end with this extension are analysed
extension = ".csv"

# are your RTs in ms? (PsychoPy default: s)
# 0 = no
# 1 = yes
rtsInMs = 0
# note: if your RTs are in ms, BerryPy will convert them to s (RT column divided by 1000)
# therefore, you still need to set convertToMs to 1 if you want the output to be in ms

# specify pandas.read_csv parameters (see below for complete list)
# please note that these will need to be specified as a dictionary
# example: readCsvParams = {"index_col":False, "skiprows":6, "sep":"\t"}
# if the dictionary is left empty, the files are read with index_col=False only
readCsvParams = {}

# available parameters (v18): (filepath_or_buffer, sep=', ', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=False, error_bad_lines=True, warn_bad_lines=True, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=False, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, memory_map=False, float_precision=None)

In [20]:
# limit the floating point output of printed numpy arrays to 3 digits of precision
np.set_printoptions(precision=3)

In [21]:
# collect all data files with the configured extension from the input directory
resFiles = glob.glob(os.sep.join(["input", "*" + extension]))

# bare file names (directory removed, everything from the first "." onwards cut off);
# they are used further below as the row labels of the output DataFrame
resFileNames = [resFile.split(os.sep)[-1].split(".")[0] for resFile in resFiles]

In [22]:
# if you need to add extra columns to an input file, define the corresponding functions here
# for example, you will need to do this if you did not code switch and repeat trials in your input file
# make sure to insert extra functions further below where it says "insert extra function defined above here"

# extra functions

# create an extra column that codes the congruency of the previous trial
#def prevCongr(df):
#    df['prevCongr'] = df['realCongr'].shift(1)
#    return df



In [23]:
# replace non-word characters with underscore
def replaceNonwordChars(myString):
    """Return `myString` with each run of non-word characters collapsed into one underscore."""
    nonwordRun = re.compile(r'\W+')
    return nonwordRun.sub('_', myString)

In [24]:
# read config file
def readCfgFile(cfgFile):
    """Return the condition definitions found in a config file.

    Parameters
    ----------
    cfgFile : str
        Path of the config file.

    Returns
    -------
    list of str
        All lines of the file, in order and without line endings, except
        blank (whitespace-only) lines and lines that start with ``#``.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    # the context manager closes the file handle even if reading fails
    with open(cfgFile) as handle:
        lines = handle.read().splitlines()

    return [line for line in lines
            if line.strip() and not line.startswith('#')]

In [25]:
def createQueryStr(maskStr):
    """Translate one condition definition into a query string for DataFrame.query.

    A definition has the form ``name; column: v1,v2; otherColumn: v3``:

    * the part before the first ``;`` is the condition name;
    * every following part is ``column: values``; the values of one column are
      combined with ``|``, the columns are combined with ``&``;
    * a value may be preceded by a comparison operator in square brackets
      (``[>=]0.5``); without one, ``==`` is used;
    * values that can be parsed as a number are inserted as they are, all
      other values are wrapped in single quotes;
    * if the value part contains a ``/``, the comma-separated list before the
      slash is returned as ``errorList`` and the list after the slash is used
      as the values.

    Example: ``'switch; a: 3,5; b: 0,1'`` gives
    ``('switch', '((a==3 | a==5) & (b==0 | b==1))')``.

    Parameters
    ----------
    maskStr : str
        One definition line as returned by readCfgFile.

    Returns
    -------
    tuple
        ``(condName, queryStr)`` if no error list was defined, otherwise
        ``(condName, queryStr, errorList)`` with ``errorList`` a list of
        (unstripped) strings.

    Raises
    ------
    ValueError
        If a non-empty part after the name does not contain a ``:``.
    """
    parts = maskStr.split(';')
    condName = parts[0].strip()
    columnValueList = parts[1:]  # e.g. [' a: 3,5 ', ' b: 0,1']
    comparisonList = []
    errorList = []

    for cpv in columnValueList:

        if not cpv.strip():
            # empty part, e.g. caused by a trailing ';' -- nothing to compare
            continue

        # split at the first ':' only, so that a value may itself contain a colon
        column, value = cpv.split(':', 1)
        # column names of the data are normalised the same way when the file is read
        column = replaceNonwordChars(column.strip())

        if "/" in value:  # it's an error definition: errors / values
            accList = value.split('/')
            errorList = accList[0].split(',')
            valueList = accList[1].split(',')
        else:
            valueList = value.split(',')

        columnComparisonList = []
        for val in valueList:

            val = val.strip()

            if "[" in val:  # a comparison operator is defined, e.g. [>=]0.5
                operator, val = val.split(']')
                operator = operator[1:]  # drop the opening bracket
                val = val.strip()  # ignore whitespace between the operator and the value
            else:
                operator = "=="

            # Decide for every value on its own whether it is numeric; a flag
            # shared by all values of a column would leave a string unquoted
            # as soon as a number precedes it.
            try:
                float(val)  # also succeeds for everything int() accepts
            except ValueError:
                val = "'" + val + "'"  # it's presumably a string

            columnComparisonList.append(str(column) + operator + str(val))

        comparisonList.append('(' + ' | '.join(columnComparisonList) + ')')  # '(a==3 | a==5)'

    queryStr = '(' + ' & '.join(comparisonList) + ')'  # '((a==3 | a==5) & (b==0 | b==1))'

    if len(errorList) == 0:
        return (condName, queryStr)
    else:
        return (condName, queryStr, errorList)

In [26]:
def calculateMoct(rts, moctType, rejType, rejValue):
    """Compute mean/SD (after outlier rejection) and median/IQR for a set of RTs.

    Besides its arguments the function reads the module-level settings
    ``convertToMs``, ``tooFewTrials`` and ``tooFewTrialsNr``.

    Parameters
    ----------
    rts : pandas.Series (or numpy array)
        The RTs of one condition. They are multiplied by 1000 for the output
        if ``convertToMs`` is 1, i.e. they are treated as seconds.
    moctType : int
        Not used inside this function; the caller uses it to choose between
        the returned mean and median.
    rejType : int
        Outlier rejection applied before mean and SD are computed:
        0 = none; 1 = keep RTs closer than ``rejValue`` SDs to the mean;
        2 = sort the RTs and drop ``int(rejValue percent of the trials)`` at
        each end; 3 = keep RTs closer than ``rejValue`` scaled MADs
        (1.4826 * median absolute deviation) to the median.
    rejValue : number
        Criterion for the selected rejection type (see above).

    Returns
    -------
    tuple
        ``(theMean, theSd, theMedian, theIqr, nrRts, nrRtsNew, rtsNew,
        rtsRejected)``. Mean and SD are based on the RTs that survived the
        rejection (``rtsNew``), median and IQR on all RTs. A statistic is
        None if fewer than 2 values were available or, with ``tooFewTrials``
        set to 1, if the number of values is not above ``tooFewTrialsNr``.
        ``rtsRejected`` holds the rejected RTs (empty list if there are none
        for rejType 0 and 2).

    Raises
    ------
    ValueError
        If ``rejType`` is not 0, 1, 2 or 3.
    """

    if rejType == 1:  # SD-based
        deviations = abs(rts - np.mean(rts))
        cutoff = rejValue * np.std(rts)
        rtsNew = rts[deviations < cutoff]
        # ">=" (not ">"): an RT exactly on the cutoff is not kept, so it has to
        # show up among the rejected ones
        rtsRejected = rts[deviations >= cutoff]

    elif rejType == 2:  # %-based
        rtsSorted = np.sort(rts)
        nrSorted = len(rtsSorted)
        reject = int((rejValue / 100.0) * nrSorted)  # number of RTs dropped at each end
        if reject > 0:
            rtsNew = rtsSorted[reject:-reject]
            # the upper tail must not overlap the lower one if more than half
            # of the trials are dropped at each end
            upperStart = max(reject, nrSorted - reject)
            rtsRejected = pd.Series(np.concatenate((rtsSorted[:reject], rtsSorted[upperStart:])))
        else:
            rtsNew = rts
            rtsRejected = []

    elif rejType == 3:  # MAD
        theMedian = np.median(rts)
        deviations = abs(rts - theMedian)
        mdAbsDev = 1.4826 * np.median(deviations)
        cutoff = rejValue * mdAbsDev
        rtsNew = rts[deviations < cutoff]
        rtsRejected = rts[deviations >= cutoff]  # see the SD-based branch

    elif rejType == 0:  # no rejection
        rtsNew = rts
        rtsRejected = []

    else:
        raise ValueError("unknown rejType: %r (expected 0, 1, 2 or 3)" % (rejType,))

    nrRts = len(rts)
    nrRtsNew = len(rtsNew)

    # mean and SD: based on the RTs that survived the rejection
    if nrRtsNew >= 2:
        theMean = np.mean(rtsNew)
        theSd = np.std(rtsNew)
    else:
        theMean = None
        theSd = None

    # median and IQR: based on all RTs
    if nrRts >= 2:
        theMedian = np.median(rts)
        q75, q25 = np.percentile(rts, [75, 25])
        theIqr = q75 - q25
    else:
        theMedian = None
        theIqr = None

    if convertToMs == 1:
        if theMean is not None:
            theMean = int(np.round(theMean * 1000, decimals=0))
            theSd = int(np.round(theSd * 1000, decimals=0))
        if theMedian is not None:
            theMedian = int(np.round(theMedian * 1000, decimals=0))
            theIqr = int(np.round(theIqr * 1000, decimals=0))

    # optionally suppress statistics that are based on too few trials
    if tooFewTrials == 1:
        if nrRtsNew <= tooFewTrialsNr:
            theMean = None
            theSd = None
        if nrRts <= tooFewTrialsNr:
            theMedian = None
            theIqr = None

    return (theMean, theSd, theMedian, theIqr, nrRts, nrRtsNew, rtsNew, rtsRejected)

In [27]:
# accInfo is the PsychoPy column with the accuracy info for all trials defined in config_errors.txt
# errorList is what defines an error in this list
def calculateErrorRate(accInfo, errorList):
    """Return the percentage of entries in `accInfo` that count as errors.

    Every element of `errorList` is converted with int(); the entries of
    `accInfo` equal to it are counted. The result is rounded to one decimal.
    If `accInfo` is empty, NaN is returned.
    """
    errorCount = sum(accInfo[accInfo == int(code)].size for code in errorList)
    trialCount = accInfo.size

    if trialCount <= 0:
        return np.nan

    return np.round(float(errorCount) / trialCount * 100, decimals=1)

In [28]:
# Non-word characters in the column names of the data are replaced by underscores when a
# file is read. The column names specified in the settings therefore need the same
# replacement (e.g. response.rt -> response_rt), otherwise they would no longer match.

if participantColumn is not None:
    participantColumn = replaceNonwordChars(participantColumn)

newColumns = [replaceNonwordChars(column) for column in keepColumns] if keepColumns else []
if keepColumns:
    keepColumns = newColumns

rtColumn, accColumn = replaceNonwordChars(rtColumn), replaceNonwordChars(accColumn)

In [31]:
# Main analysis. For every result file:
#   1. read the file and normalise its column names,
#   2. determine the participant code and copy the "keepColumns" values to the output,
#   3. drop practice trials and RTs outside the configured limits,
#   4. compute per-condition RT statistics (conditions from config_rts.txt) and,
#      if requested, error rates (conditions from config_errors.txt),
#   5. collect everything in outFrame (one row per file) and in the report lines.
# Afterwards outFrame is written to ./output/output.csv.
# The print calls use a single pre-formatted argument so that they behave the
# same under Python 2 and Python 3.

outFrame = pd.DataFrame(index=resFileNames)  # initialise output DataFrame with result file names as indices
report = []  # lines of the markdown report

# fall back to "no outlier rejection" if the settings do not define rejType / rejValue
try:
    rejType
except NameError:
    rejType = 0

try:
    rejValue
except NameError:
    rejValue = 0

# resFileNames was derived from resFiles element by element, so the two lists
# can be walked in parallel and the row labels of outFrame always match
for resFile, resFileName in zip(resFiles, resFileNames):

    print("Result file: %s" % resFileName)

    if not readCsvParams:
        df = pd.read_csv(resFile, index_col=False)  # convert PsychoPy output to pandas DataFrame
    else:
        df = pd.read_csv(resFile, **readCsvParams)

    # for the column names, remove leading or trailing whitespace and replace
    # non-word characters with underscore; the same helper is applied to the
    # column names given in the settings
    df.columns = [replaceNonwordChars(column.strip()) for column in df.columns]

    ################
    # insert extra function defined above here
    # e.g.: df = prevCongr(df)
    ################

    # the participant code and the keepColumns values are taken from the first
    # row of the file (by position, so this does not depend on the index labels)
    if participantColumn is not None:
        subjCode = str(df[participantColumn].iloc[0])
    else:
        # getSubjCode has to be defined in the settings cell further up
        # (it is commented out there by default)
        subjCode = getSubjCode(resFileName)

    print("Participant code: %s" % subjCode)

    if keepColumns:
        for column in keepColumns:
            theValue = str(df[column].iloc[0])
            outFrame.loc[resFileName, column] = theValue

    if trialsLoop is not None:
        # normalise the name like the column names of the data
        theLoop = replaceNonwordChars((trialsLoop + "_thisN").strip())
        df = df[df[theLoop] >= 0]  # get rid of practice trials

    origLength = belowLength = len(df)

    print("Total number of trials: %s" % len(df))

    outFrame.loc[resFileName, "subjCode"] = subjCode  # add subject code to output
    report.append("# %s" % (resFileName))
    report.append("## Basic sanity checks")

    rtConds = readCfgFile("./config_rts.txt")

    # for non-standard output files, we need to convert to s
    if rtsInMs == 1:
        df[rtColumn] = df[rtColumn] / 1000.

    if rejectBelow == 1:
        # keep RTs above rejectBelowTime, but keep also time-outs (RT is None or is 0)
        # note that the extra brackets are required!!!
        df = df[((pd.isnull(df[rtColumn])) | (df[rtColumn] >= float(rejectBelowTime)) | (df[rtColumn] == 0))]
        belowLength = len(df)
        nrBelow = origLength - belowLength
        # a file without any trials must not end in a division by zero
        percentBelow = np.round(float(nrBelow)/origLength*100, decimals=1) if origLength > 0 else 0.0
        report.append("Percentage of trials rejected as too fast: %s" % percentBelow)
        print("Rejected %s RTs below %s sec" % (nrBelow, rejectBelowTime))

    if rejectAbove == 1:
        df = df[((pd.isnull(df[rtColumn])) | (df[rtColumn] <= float(rejectAboveTime)))]
        aboveLength = len(df)
        nrAbove = belowLength - aboveLength
        percentAbove = np.round(float(nrAbove)/origLength*100, decimals=1) if origLength > 0 else 0.0
        report.append("Percentage of trials rejected as too slow: %s" % percentAbove)
        print("Rejected %s RTs above %s sec" % (nrAbove, rejectAboveTime))

    print("Number of trials after rejecting extreme values: %s" % len(df))

    # a time-out is a trial whose RT is missing or 0
    nrTimeouts = len(df[((pd.isnull(df[rtColumn])) | (df[rtColumn] == 0))])
    if nrTimeouts > 0:
        percentTimeout = np.round(float(nrTimeouts)/len(df)*100, decimals=1)
    else:
        percentTimeout = 0
    report.append("Percentage of timeouts: %s" % percentTimeout)

    nrErrors = len(df[df[accColumn] == incorrect])
    if nrErrors > 0:
        percentError = np.round(float(nrErrors)/len(df)*100, decimals=1)
    else:
        percentError = 0
    report.append("Percentage of errors: %s" % percentError)

    if plotting == 1:
        sns.set(rc={"figure.figsize": (10, 7)})
        fig = plt.figure()
        bins = np.linspace(fromRT, toRT, nrOfBins)
        ax = sns.distplot(df[df[accColumn] == correct][rtColumn], bins, hist=True, kde=False, rug=False)
        ax.set_title("Distribution of all correct RTs")
        fig.savefig("./output/images/%s_overall.png" % resFileName)
        report.append("![](./images/%s_overall.png)" % resFileName)
        plt.close(fig)

    report.append("## RTs")

    for cond in rtConds:

        # only name and query are needed here; the slice also tolerates a
        # definition for which an error list is returned as third element
        theCond, theQuery = createQueryStr(cond)[:2]
        report.append("### %s" % theCond)

        maskedDf = df.query(theQuery)  # DataFrame with subset of trials, but all columns

        print("Number of trials in condition %s: %s" % (theCond, len(maskedDf)))

        # Same time-out definition as in the sanity checks above (RT missing
        # or 0). Both kinds have to be excluded; keeping "not null OR equal
        # to 0" would leave the zero RTs in the statistics.
        isTimeout = (pd.isnull(maskedDf[rtColumn])) | (maskedDf[rtColumn] == 0)
        maskedDfNoTimeouts = maskedDf[~isTimeout]

        nrTimeoutsCond = len(maskedDf) - len(maskedDfNoTimeouts)
        if nrTimeoutsCond > 0:
            percentTimeoutsCond = np.round(float(nrTimeoutsCond)/len(maskedDf)*100, decimals=1)
        else:
            percentTimeoutsCond = 0
        report.append("Percentage of time-outs: %s" % percentTimeoutsCond)

        maskedRts = maskedDfNoTimeouts[rtColumn]  # Series with just the info from the RT column

        theMean, theSd, theMedian, theIqr, nrRts, nrRtsNew, rtsNew, rtsRejected = calculateMoct(rts=maskedRts, moctType=moctType, rejType=rejType, rejValue=rejValue)

        if len(rtsNew) > 0:

            if plotting == 1:
                sns.set(rc={"figure.figsize": (10, 7)})
                fig = plt.figure()
                bins = np.linspace(fromRT, toRT, nrOfBins)
                ax = sns.distplot(maskedRts, bins, hist=True, kde=False, rug=True)
                ax = sns.distplot(rtsNew, bins, kde=False, rug=True)
                fig.savefig("./output/images/%s_%s.png" % (resFileName, theCond))
                report.append("![](./images/%s_%s.png)" % (resFileName, theCond))
                plt.close(fig)

            nrRejected = nrRts-nrRtsNew
            report.append("#### Mean")
            report.append("Number of trials rejected: %s" % nrRejected)
            if len(rtsRejected) > 0:
                # sort a copy: works for Series, arrays and lists alike and
                # does not modify a slice of the data in place
                rtsRejectedSorted = np.sort(np.asarray(rtsRejected))
                report.append("RTs rejected: %s" % np.round(rtsRejectedSorted, decimals=3))
            else:
                report.append("RTs rejected: 0")
            report.append("Number of trials remaining: %s" % nrRtsNew)
            report.append("Mean: %s" % theMean)
            report.append("SD: %s" % theSd)

            report.append("#### Median")
            report.append("Number of trials in this condition: %s" % nrRts)
            report.append("Median: %s" % theMedian)
            report.append("IQR: %s" % theIqr)

        if moctType == 1:
            moct = theMean
        else:
            moct = theMedian

        # .loc adds the condition column on first use
        outFrame.loc[resFileName, theCond] = moct

    if computeErrorRates == 1:

        errorConds = readCfgFile("./config_errors.txt")

        report.append("## Error rates")

        for errorCond in errorConds:

            # error definitions need the "errors/values" part, otherwise
            # createQueryStr returns only two elements and this line fails
            theCond, theQuery, errorList = createQueryStr(errorCond)
            report.append("### %s" % theCond)

            maskedDf = df.query(theQuery)

            print("Number of trials for error rate calculation in condition %s: %s" % (theCond, len(maskedDf)))

            accInfo = maskedDf[accColumn]

            errorResult = calculateErrorRate(accInfo=accInfo, errorList=errorList)
            report.append("Error rate: %s" % errorResult)

            outFrame.loc[resFileName, theCond] = errorResult

    print("")

    report.append("---")

outFrame.to_csv("./output/output.csv", index_label="resFile")
                        

Result file: f1_2014_Nov_19_1259
Participant code: f1
Total number of trials: 672
Rejected 0 RTs below 0.15 sec
Rejected 0 RTs above 2.0 sec
Number of trials after rejecting extreme values: 672

Result file: f2_2014_Nov_20_1021
Participant code: f2
Total number of trials: 672
Rejected 0 RTs below 0.15 sec
Rejected 0 RTs above 2.0 sec
Number of trials after rejecting extreme values: 672



In [30]:
# write the collected report lines to a markdown file
outfile = "./output/report.md"
try:
    ofile = open(outfile, "w")  # open in write mode
except IOError as exc:
    # report the problem and stop here: without an open file there is nothing to write
    sys.stderr.write("can't open %s: %s %s\n" % (outfile, exc.__class__, exc))
else:
    # the with block closes the file even if one of the writes fails
    with ofile:
        for entry in report:
            # two trailing spaces plus an empty line keep the entries on
            # separate lines in the rendered markdown
            ofile.write(entry + "  \n\n")

    print("done")

done
