# Realign & Homogenize ED-1, ED-2, ED-3

`2.clean_realign_homogenize_all`

Realign and merge converted data from ED-3 into the concatenated data from ED-1 and ED-2.

Differentiate, import, and reassociate memory data into the main-task trialwise dataset.

### Configuration

In [1]:
from pathlib import Path

import pandas as pd
from datetime import datetime

from _utils import clean

In [2]:
date = datetime.today().strftime('%y%m%d')

In [3]:
derivs_dir = Path('..') / 'derivatives'
allsub_dir = derivs_dir / '00.allsub'

## Pull Concatenated Task-wise data

Dataset 1

In [4]:
main_fpath_1 = allsub_dir / ('econdec-1_task-main_beh_' + date + '.csv')
frac_fpath_1 = allsub_dir / ('econdec-1_task-frac_beh_' + date + '.csv')
face_fpath_1 = allsub_dir / ('econdec-1_task-face_beh_' + date + '.csv')

In [7]:
main_df_1 = clean.smooth_columns(pd.read_csv(main_fpath_1))
frac_df_1 = clean.smooth_columns(pd.read_csv(frac_fpath_1))
face_df_1 = clean.smooth_columns(pd.read_csv(face_fpath_1))

Dataset 2

In [8]:
main_fpath_2 = allsub_dir / ('econdec-2_task-main_beh_' + date + '.csv')
frac_fpath_2 = allsub_dir / ('econdec-2_task-frac_beh_' + date + '.csv')
face_fpath_2 = allsub_dir / ('econdec-2_task-face_beh_' + date + '.csv')

In [9]:
main_df_2 = clean.smooth_columns(pd.read_csv(main_fpath_2))
frac_df_2 = clean.smooth_columns(pd.read_csv(frac_fpath_2))
face_df_2 = clean.smooth_columns(pd.read_csv(face_fpath_2))

Dataset 3

In [10]:
main_fpath_3 = allsub_dir / ('econdec-3_task-main_beh_' + date + '.csv')
frac_fpath_3 = allsub_dir / ('econdec-3_task-frac_beh_' + date + '.csv')
face_fpath_3 = allsub_dir / ('econdec-3_task-face_beh_' + date + '.csv')

In [11]:
main_df_3 = clean.smooth_columns(pd.read_csv(main_fpath_3))
frac_df_3 = clean.smooth_columns(pd.read_csv(frac_fpath_3))
face_df_3 = clean.smooth_columns(pd.read_csv(face_fpath_3))

## Note

I'm unsure whether the above (repetitive), or one of the options below (unintuitive) is a cleaner way to represent the data corpus at this stage.

still sorta repetitive:

hard to read (what's even going on here?):

## Exclude bad subjects

In [12]:
from config import exclusions

Dataset 1

In [13]:
main_df_1 = main_df_1[~main_df_1['subjnum'].isin(exclusions)]

In [14]:
len(main_df_1.subjnum.unique())

88

Dataset 2

In [15]:
main_df_2 = main_df_2[~main_df_2['subjnum'].isin(exclusions)]

In [16]:
len(main_df_2.subjnum.unique())

101

#### Study-eye

In [17]:
main_df_3 = main_df_3[~main_df_3['originalparticipant'].isin(exclusions)]

In [19]:
len(main_df_3.originalparticipant.unique())

72

#### Output

In [20]:
exclusions_dir = derivs_dir / '01.exclusions'
if not Path.exists(exclusions_dir): Path.mkdir(exclusions_dir)

In [21]:
main_df_1.to_csv(exclusions_dir / ('econdec-1_task-main_beh_' + date + '.csv'))
main_df_2.to_csv(exclusions_dir / ('econdec-2_task-main_beh_' + date + '.csv'))
main_df_3.to_csv(exclusions_dir / ('exondec-3_task-main_beh_' + date + '.csv'))

# Main task

## ED-1

In [22]:
main_df_1.head()

Unnamed: 0,subjnum,agegroup,experimentername,runnum,date,time,trialnum,trialnumbydomdist,domain,magnitude,...,probrt,confidence,confidencest,confidencert,stocknumber,bondnumber,genderjudgment,totalpayout,trueprobgood,estwithinrange?
0,100,1,kf,1,10_12,11:31:01.963000,1,1,LOSS,low,...,6.077591,8,2141471.0,3.022637,16,9,1,-6,0.3,0
1,100,1,kf,1,10_12,11:31:01.963000,2,2,LOSS,low,...,7.294263,8,2141525.0,3.695852,16,9,1,-12,0.155172,0
2,100,1,kf,1,10_12,11:31:01.963000,3,3,LOSS,low,...,7.635041,8,2141546.0,3.121775,16,9,1,-18,0.3,1
3,100,1,kf,1,10_12,11:31:01.963000,4,4,LOSS,low,...,10.879553,7,2141574.0,3.406241,16,9,1,-24,0.5,0
4,100,1,kf,1,10_12,11:31:01.963000,5,5,LOSS,low,...,16.525458,8,2141602.0,4.553061,16,9,1,-26,0.7,0


 ## ED-2

In [23]:
main_df_2.head()

Unnamed: 0,subjnum,agegroup,experimentername,runnum,date,time,trialnum,trialnumbydomdist,domain,magnitude,...,probrt,confidence,confidencest,confidencert,stocknumber,bondnumber,genderjudgment,totalpayout,trueprobgood,estwithinrange?
0,2001,1,ed,1,9_7,14:56:22.840000,1,1,GAIN,low,...,4.165232,9.0,22151.347369,2.155163,18,1,0,6,0.7,0
1,2001,1,ed,1,9_7,14:56:22.840000,2,2,GAIN,low,...,2.122263,9.0,22168.689525,0.90059,18,1,1,12,0.844828,0
2,2001,1,ed,1,9_7,14:56:22.840000,3,3,GAIN,low,...,3.157299,8.0,22185.804354,1.129109,18,1,1,14,0.927027,0
3,2001,1,ed,1,9_7,14:56:22.840000,4,4,GAIN,low,...,3.599311,8.0,22206.712546,1.060679,18,1,1,16,0.967365,0
4,2001,1,ed,1,9_7,14:56:22.840000,5,5,GAIN,low,...,2.064677,8.0,22223.815899,1.148896,18,1,1,22,0.985748,0


## ED-3

In [26]:
main_df_3 = clean.eye_cleanup(main_df_3)

Remove practice data 

In [27]:
print(clean.eye_cleanup.__doc__)


    Removes rows from a DataFrame based on predetermined indicators of extraneous data, or an indicator that the row represents unwanted practice data.
    


In [28]:
main_df_3.head()

Unnamed: 0,agegroup,accuracy,bankaccount,bypassed,confidencevalue,date,emotionresponse,estimationvalue,experimenter,facert,...,originaltrialnumber,originaltrialorder,practice,stockfractallocation,stockfractallocationtype,stockimagename,stocktext,stocktextlocation,stockvalue,trueprobability
9,1,1,-6.0,0,8.0,11041300,58,70,mm,2811,...,1.0,1,3,"(565, 540)",L,fractal12b.jpg,-$2 or -$10,"(640, 510)",-2,0.7
10,1,1,-12.0,0,8.0,11041300,58,50,mm,3289,...,1.0,2,3,"(1355, 540)",R,fractal12b.jpg,-$2 or -$10,"(1280, 510)",-10,0.5
11,1,1,-18.0,0,7.0,11041300,58,30,mm,3242,...,1.0,3,3,"(1355, 540)",R,fractal12b.jpg,-$2 or -$10,"(1280, 510)",-10,0.3
12,1,1,-20.0,0,6.0,11041300,58,50,mm,3904,...,1.0,4,3,"(1355, 540)",R,fractal12b.jpg,-$2 or -$10,"(1280, 510)",-2,0.5
13,1,1,-30.0,0,7.0,11041300,58,30,mm,1997,...,1.0,5,3,"(1355, 540)",R,fractal12b.jpg,-$2 or -$10,"(1280, 510)",-10,0.3


## Main task columns

In [30]:
new_main_columns = {'face':'facepic','runnum':'block','trialnum':'trial',
               'fracrt':'choicert','fracst':'choicest',
               'facert':'outcomert','facest':'outcomest',
               'probrt':'esttaskrt','probst':'esttaskst',
               'optionchosen':'stockchosen',
               'probgood':'estimation','trueprobgood':'trueprob'}

### ED-1

In [31]:
main_df_1 = main_df_1.rename(columns=new_main_columns)

In [32]:
main_df_1 = main_df_1.drop(['agegroup','experimentername','date','time','trialnumbydomdist',
                                  'choicest','outcomest','esttaskst',
                                  'confidencest','stocknumber','bondnumber','genderjudgment',
                                  'totalpayout','fractalchosen','estwithinrange?','confidencert'], axis=1)

In [34]:
main_df_1['study'] = main_df_1.apply(clean.label_study, axis=1)
main_df_1.columns

Index(['subjnum', 'block', 'trial', 'domain', 'magnitude', 'cueonleft',
       'cueonright', 'stockpic', 'bondpic', 'stockchosen', 'choicert',
       'stockvalue', 'facepic', 'outcomert', 'estimation', 'esttaskrt',
       'confidence', 'trueprob', 'study'],
      dtype='object')

### ED-2

In [35]:
main_df_2 = main_df_2.rename(columns=new_main_columns)

In [38]:
main_df_2 = main_df_2.drop([
    'agegroup','experimentername','date','time','trialnumbydomdist',
    'choicest','outcomest','esttaskst',
    'confidencest','stocknumber','bondnumber','genderjudgment',
    'totalpayout','fractalchosen','estwithinrange?','confidencert'
], axis=1)

In [40]:
main_df_2['study'] = main_df_2.apply(clean.label_study, axis=1)
main_df_2.columns

Index(['subjnum', 'block', 'trial', 'domain', 'magnitude', 'cueonleft',
       'cueonright', 'stockpic', 'bondpic', 'stockchosen', 'choicert',
       'stockvalue', 'facepic', 'outcomert', 'estimation', 'esttaskrt',
       'confidence', 'trueprob', 'study'],
      dtype='object')

### ED-3

In [42]:
main_df_3.columns

Index(['agegroup', 'accuracy', 'bankaccount', 'bypassed', 'confidencevalue',
       'date', 'emotionresponse', 'estimationvalue', 'experimenter', 'facert',
       'facekeypressed', 'paymentaccuracy', 'phase', 'rt', 'selection',
       'showinstruction', 'bondfractallocation', 'bondfractallocationtype',
       'bondimagename', 'bondtext', 'bondtextlocation', 'bondvalue',
       'bubblefile', 'correctfractal', 'correctfractallocation', 'domain',
       'facedomain', 'facefile', 'faceimage', 'facestockvalue', 'fracdomain',
       'fracmagnitude', 'gender', 'incorectfractal',
       'incorrectfractallocation', 'magnitude', 'oldfaceequalstrue',
       'originalblock', 'originalparticipant', 'originalparticipantnumber',
       'originalsubjectnumber', 'originaltrailnumber', 'originaltrialnumber',
       'originaltrialorder', 'practice', 'stockfractallocation',
       'stockfractallocationtype', 'stockimagename', 'stocktext',
       'stocktextlocation', 'stockvalue', 'trueprobability'],
     

In [43]:
new_eye_columns = {
    'originalparticipant':'subjnum',
    'experimenter':'experimentername',
    'stockimagename':'stockpic',
    'bondimagename':'bondpic',
    'originalblock':'block',
    'originaltrialorder':'trial',
    #'stockfractallocationtype':'cueonleft',
    #'bondfractallocationtype':'cueonright',
    'faceimage':'facepic',
    #'selection':'stockchosen',
    'rt':'choicert',
    'estimationvalue':'estimation',
    'trueprobability':'trueprob',
    'accuracy':'genderjudgment',
    'confidencevalue':'confidence',
    'facert':'outcomert'
    }

In [44]:
main_df_3 = main_df_3.rename(columns=new_eye_columns)

In [45]:
main_df_3 = main_df_3.drop([
    'practice','bankaccount','bubblefile','bondvalue','stocktext','bondtext',
    'stocktextlocation','bondtextlocation','emotionresponse','bypassed','agegroup','experimentername',
    'date','correctfractallocation','incorrectfractallocation','paymentaccuracy','phase',
    'stockfractallocation','bondfractallocation','showinstruction','gender',
    'correctfractal','incorectfractal','oldfaceequalstrue','facefile','facekeypressed',
    'originalsubjectnumber','originalparticipantnumber','originaltrialnumber','originaltrailnumber',
    'fracdomain','facedomain','fracmagnitude','facestockvalue',
    'genderjudgment'], axis=1)

In [47]:
main_df_3['study'] = main_df_3.apply(clean.label_study, axis=1)

In [48]:
for col in ('choicert','outcomert'):
    main_df_3[col] = main_df_3[col].astype(float) *.001

### Unified columns

#### final cleaning to put values in the same units, etc...

In [50]:
main_df_all = pd.concat([main_df_1, main_df_2, main_df_3])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [52]:
main_df_all['stockchosen'] = main_df_all.apply(clean.clean_stockchosen, axis=1)
main_df_all['bondpic'] = main_df_all.apply(clean.clean_bondpic, axis=1)
main_df_all['stockpic'] = main_df_all.apply(clean.clean_stockpic, axis=1)
len(main_df_all)

18786

# Fractal Memory

## ED-1

In [53]:
frac_df_1['oldfractal'] = frac_df_1.apply(clean.clean_paths, axis=1)

In [70]:
frac_lil_df_1 = frac_df_1[['subjectid','oldfractal','judgment']].sort_values(['subjectid','oldfractal'])

In [71]:
frac_lil_bond_df_1 = frac_lil_df_1.rename(columns={
    'subjectid':'subjnum','oldfractal':'bondpic','judgment':'bondmem'
})

In [72]:
frac_lil_stock_df_1 = frac_lil_df_1.rename(columns={
    'subjectid':'subjnum','oldfractal':'stockpic','judgment':'stockmem'
})

## ED-2

In [73]:
frac_df_2['oldfractal'] = frac_df_2.apply(clean.clean_paths, axis=1)

In [74]:
frac_lil_df_2 = frac_df_2[['subjectid','oldfractal','judgment']].sort_values(['subjectid','oldfractal'])

In [75]:
frac_lil_bond_df_2 = frac_lil_df_2.rename(columns={
    'subjectid':'subjnum','oldfractal':'bondpic','judgment':'bondmem'
})

In [76]:
frac_lil_stock_df_2 = frac_lil_df_2.rename(columns={
    'subjectid':'subjnum','oldfractal':'stockpic','judgment':'stockmem'
})

## ED-3

In [77]:
frac_lil_df_3 = frac_df_3[['originalparticipant','correctfractal','selection','correctfractallocation']]
frac_lil_df_3['selection'] = frac_lil_df_3.apply(clean.clean_selection, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [79]:
frac_lil_bond_df_3 = frac_lil_df_3.rename(columns={
    'originalparticipant':'subjnum',
    'correctfractal':'bondpic',
    'selection':'bondmem'
}).drop(columns='correctfractallocation')

frac_lil_stock_df_3 = frac_lil_df_3.rename(columns={
    'originalparticipant':'subjnum',
    'correctfractal':'stockpic',
    'selection':'stockmem'
}).drop(columns='correctfractallocation')

## Concatenate ED-1, ED-2, ED-3 Fractal Memory

In [80]:
frac_lil_bond_df = pd.concat([
    frac_lil_bond_df_1, frac_lil_bond_df_2, frac_lil_bond_df_3
])

frac_lil_stock_df = pd.concat([
    frac_lil_stock_df_1, frac_lil_stock_df_2, frac_lil_stock_df_3
])

# Face Memory

## ED-1

In [81]:
face_lil_df_1 = face_df_1[['subjectid','face','subjresp']]
face_lil_df_1 = face_lil_df_1.rename(columns={
    'subjectid':'subjnum','face':'facepic','subjresp':'facemem'
})

## ED-2

In [82]:
face_lil_df_2 = face_df_2[['subjectid','face','subjresp']]
face_lil_df_2 = face_lil_df_2.rename(columns={
    'subjectid':'subjnum','face':'facepic','subjresp':'facemem'
})

## ED-3

In [83]:
face_lil_df_3 = face_df_3[
    ['originalparticipant','facefile','selection']
].rename(columns={
    'originalparticipant':'subjnum',
    'facefile':'facepic',
    'selection':'facemem'
})

## Concatenate ED-1, ED-2, ED-3

In [84]:
face_lil_df = pd.concat([
    face_lil_df_1, face_lil_df_2, face_lil_df_3
])

# Output

In [85]:
main_df_all = main_df_all.merge(frac_lil_bond_df, how='left')
main_df_all = main_df_all.merge(frac_lil_stock_df, how='left')
main_df_all = main_df_all.merge(face_lil_df, how='left')
# unified_main_frame[['subjnum','stockpic','bondpic','stockmem','bondmem']]

In [90]:
trials=[]
for s in range(len(main_df_all.subjnum.unique())):
    for t in range(1,73):
        trials.append(t)

In [91]:
blocks=[]
for s in range(len(main_df_all.subjnum.unique())):
    for b in range(1,13):
        for x in range(6):
            blocks.append(b)

# Check size
Final merged DataFrame compared to expected number of blocks & trials:

In [93]:
print(len(blocks))
print(len(trials))
print(len(main_df_all))

18792
18792
18786


In [99]:
assert len(blocks) == len(trials)
assert len(trials) == len(main_df_all)

AssertionError: 

6 ED3 subjects are missing a trial so the trial and block numbers won't match up if they are added here.

Should move this step to early cleaning immediately after extraction.

In [None]:
counts = main_df_all.groupby('subjnum').count()['block']
counts[counts < 72]

In [None]:
main_df_all['trial'] = pd.Series(trials)
main_df_all['block'] = pd.Series(blocks)

# Output

ONly when all data is fully aligned and homogenized.

**ALL** cleaning steps should be done before this point.

In [None]:
homog_dir = derivs_dir / '02.homogenized'
if not Path.exists(homog_dir):
    Path.mkdir(homog_dir)

In [None]:
fpath = homog_dir  / ('econdec-full_task-main_beh_' + date + '.csv')
main_df_all.to_csv(fpath, index=False)

In [100]:
len(main_df_all.subjnum.unique())

261

For reference:

```
final_columns=['study','subjnum','trial','block','domain','dom',
               'estimation','trueprob','estdiff','valestdiff','valestdiffvalid',
               'choicert','choicerta3sd','choicerti3sd','choicemed12v3','choicemed123'
               'esttaskrt','esttaskrta3sd','esttaskrti3sd',
               'outcomert','outcomerta3sd','outcomerti3sd','outcomemed12','outcomemed123'
               'stockchosen','waschoiceoptimal','optimalchoiceshouldhavebeen',
               'magnitude','stockvalue','absstockval','b4choiceprobability',
               'stockpic','bondpic','facepic','stockmemresp','bondmemresp',
               'studymedchoice','studysplitchoice','studymedoutcome','studysplitoutcome',
               'primemedchoice','primesplitchoice','primemedoutcome','primesplitoutcome']
               ```