Merge REDCap out-of-scanner Face Name Recall variables with in-scanner data pulled from IntraDB

In [None]:
import os, datetime
import csv
import sys
import shutil
from openpyxl import load_workbook
import pandas as pd
from io import BytesIO
import numpy as np
import subprocess
from scipy import stats
from ccf.box import LifespanBox
from ccf.config import LoadSettings
from ccf.redcap import RedcapTable 
config = LoadSettings()
import Crosswalk.cleanupfuncs as cleanupfuncs


In [None]:
# Session-wide settings: Box client, run date stamp and output locations.
box = LifespanBox(cache='./')
verbose = True
snapshotdate = format(datetime.datetime.today(), '%m_%d_%Y')
pathout = "./prepped/hca"
racepath = "./prepped"
scannerpath = ''

# The Rosetta (a.k.a. Inventory) file will have all the NDA vars and pedids.
extrainfo = config['rosetta']['hca']['filename']

# REDCap events that are kept from both the inventory and the REDCap pull.
eventlist = ['visit_%d_arm_1' % visit for visit in (1, 2, 3)] + ['visit_arm_1']


In [None]:
# Load the Rosetta/inventory file and keep only the rows usable downstream.
print(extrainfo)
inventory = pd.read_csv(extrainfo)

# Rows without an NDA age cannot be cast to int below. Take an explicit copy so
# the column assignments act on an independent frame instead of a filtered
# slice (avoids pandas' SettingWithCopyWarning).
inventory = inventory.loc[inventory.nda_age.notnull()].copy()
inventory['nda_interview_date'] = pd.to_datetime(inventory.nda_interview_date).dt.strftime('%m/%d/%Y')
inventory['nda_age'] = inventory.nda_age.round(0).astype(int)

# Keep only the selected REDCap events ...
inventory = inventory.loc[inventory.redcap_event_name.isin(eventlist)]
# ... and rows whose IntraDB value contains 'STG'. na=False makes a missing
# IntraDB value count as "no match" instead of putting NaN into the mask.
inventory = inventory.loc[inventory.IntraDB.str.contains('STG', na=False)]

Fullinventory = inventory.copy()
print("Inventory w select events should be 1798:", Fullinventory.shape)

In [None]:
# Identifier/demographic columns that accompany every NDA record.
#
# The columns deliberately keep their inventory names here: the later merge
# uses 'REDCap_id' / 'redcap_event_name', and the column selection after that
# merge still refers to 'nda_age', 'nda_interview_date', 'pseudo_guid',
# 'subject' and 'M/F' before mapping them to the NDA names.
#
# The previous `.rename({...})` call was given no `columns=` keyword, so it
# targeted the row index and never renamed anything; it has been removed so
# the code no longer suggests the columns are renamed at this point.
ndar = Fullinventory[['REDCap_id', 'nda_age', 'nda_interview_date',
                      'pseudo_guid', 'subject', 'M/F',
                      'redcap_event_name', 'redcap_event']]

In [None]:
# Pull the Face Name recall fields and ids from the REDCap HCA project.
# Field order: ids, counterbalance assignments, then for each counterbalance
# form (cb1, cb2) the ten answer fields, each followed by its "_other" field.
fieldlist = ['id', 'subject_id', 'counterbalance_v1', 'counterbalance_v2']
for form in ('cb1', 'cb2'):
    for item in range(1, 11):
        answer = '%s_%da' % (form, item)
        fieldlist.extend([answer, answer + '_other'])

hca = RedcapTable.get_table_by_name('hcpa').get_frame(fieldlist)
# Restrict to the events of interest.
in_selected_event = hca.redcap_event_name.isin(eventlist)
hca = hca.loc[in_selected_event]


In [None]:
# Column groups: ids + counterbalance assignments + cb1 items, then cb2 items.
cb1cols = ['id', 'subject_id', 'redcap_event_name', 'counterbalance_v1', 'counterbalance_v2']
cb1cols = cb1cols + [name for name in hca.columns if 'cb1' in name]
cb2cols = [name for name in hca.columns if 'cb2' in name]

# One frame per visit, each carrying both counterbalance forms.
visit_cols = cb1cols + cb2cols
hca1 = hca.loc[hca.redcap_event_name == 'visit_1_arm_1', visit_cols]
hca2 = hca.loc[hca.redcap_event_name == 'visit_2_arm_1', visit_cols]
# This will be empty because counterbalance assignment happened in V1.
hca2.counterbalance_v2.value_counts(dropna=False)


In [None]:
# hca1 (V1 data) is ready as-is; hca2 (V2 data) needs its counterbalance
# assignment carried over from the V1 rows.

# These variables are empty on the V2 rows (see the value_counts check in the
# previous cell), so drop them before merging the V1 assignment back in.
hca2 = hca2.drop(columns=['counterbalance_v1', 'counterbalance_v2', 'subject_id'])
has_v2_assignment = hca1.counterbalance_v2.notnull()
hcacb2 = hca1.loc[has_v2_assignment, ['id', 'counterbalance_v2']]
hca2 = hcacb2.merge(hca2, on=['id'], how='left')


In [None]:
# Stack the V1 counterbalance forms: rows assigned form 1 take their answers
# from the cb1_* fields, rows assigned form 2 from the cb2_* fields. Both are
# renamed to the common f<N>_recall / f<N>_other names before concatenation.
v1_by_form = {}
for form_no in (1, 2):
    v1_renames = {'counterbalance_v1': 'version'}
    for item in range(1, 11):
        v1_renames['cb%d_%da' % (form_no, item)] = 'f%d_recall' % item
        v1_renames['cb%d_%da_other' % (form_no, item)] = 'f%d_other' % item
    assigned = hca1.counterbalance_v1 == float(form_no)
    v1_by_form[form_no] = hca1.loc[assigned].rename(columns=v1_renames)

hca1cb1 = v1_by_form[1]
hca1cb2 = v1_by_form[2]
hca1all = pd.concat([hca1cb1, hca1cb2]).drop(columns=['counterbalance_v2'])


In [None]:
# Stack the V2 counterbalance forms, keyed on the V2 assignment: form 1 rows
# use the cb1_* fields, form 2 rows the cb2_* fields, both renamed to the
# common f<N>_recall / f<N>_other names.
v2_by_form = {}
for form_no in (1, 2):
    v2_renames = {'counterbalance_v2': 'version'}
    for item in range(1, 11):
        v2_renames['cb%d_%da' % (form_no, item)] = 'f%d_recall' % item
        v2_renames['cb%d_%da_other' % (form_no, item)] = 'f%d_other' % item
    assigned = hca2.counterbalance_v2 == float(form_no)
    v2_by_form[form_no] = hca2.loc[assigned].rename(columns=v2_renames)

hca2cb1 = v2_by_form[1]
hca2cb2 = v2_by_form[2]
hca2all = pd.concat([hca2cb1, hca2cb2])
# Rows without an event name had a V2 assignment but no matching V2 record
# in the left merge that built hca2.
hca2all = hca2all.loc[hca2all.redcap_event_name.notnull()]


In [None]:
# Combine both visits and keep only rows where the first face was answered.
hcacat = pd.concat([hca1all, hca2all])
first_face_answered = hcacat.f1_recall.notnull()
hcacat = hcacat.loc[first_face_answered].drop(columns=['subject_id'])

In [None]:
# Attach the NDA variables. The left merge keeps every inventory row, whether
# or not it has REDCap recall data.
hcat = ndar.merge(
    hcacat,
    how='left',
    left_on=['REDCap_id', 'redcap_event_name'],
    right_on=['id', 'redcap_event_name'],
)
hcat.shape

In [None]:
# Do NOT drop rows without a version here: every subject has to be kept until
# the end, when subjects are finally removed.

# Nullable integer dtype so that missing values survive the cast.
hcat['version'] = hcat['version'].astype('Int64')
for recall_col in ['f%d_recall' % face for face in range(1, 11)]:
    hcat[recall_col] = hcat[recall_col].astype('Int64')

# Strip quote characters from these free-text answers and cap them at 10 chars.
for other_col in ('f7_other', 'f3_other', 'f9_other'):
    unquoted = hcat[other_col].str.replace('"', '').str.replace("'", '')
    hcat[other_col] = unquoted.str[0:10]

# Keep the NDA variables plus the recall items, then switch to the NDA names.
face_cols = []
for face in range(1, 11):
    face_cols += ['f%d_recall' % face, 'f%d_other' % face]
keep_cols = ['nda_age', 'nda_interview_date', 'pseudo_guid', 'subject',
             'redcap_event', 'M/F', 'version'] + face_cols
nda_names = {
    'pseudo_guid': 'subjectkey',
    'subject': 'src_subject_id',
    'nda_age': 'interview_age',
    'nda_interview_date': 'interview_date',
    'M/F': 'sex',
    'version': 'version_form',
}
hcat = hcat[keep_cols].rename(columns=nda_names)
print(hcat.shape)

# Rows with an empty version_form are intentionally kept as well: all the NDA
# variables are still needed.
print(hcat.shape)

In [None]:
# Display the column names of the REDCap recall table for inspection.
hcat.columns

## Add IntraDB staging variables, i.e., the recall stats for each in-scanner FaceName recall

In [None]:
# Read the per-visit FaceName stats files and the task-analysis summary.
facedir = "/Users/petralenzini/work/Behavioral/Lifespan/NDA_submissions/NDA_submissions/facename"
F1 = pd.read_csv(facedir + "/FaceStats_HCA_V1.txt")
F2 = pd.read_csv(facedir + "/FaceStats_HCA_V2.txt")
PreReqs = pd.read_csv(facedir + "/pcp_summary_CCF_HCA_STG_TaskAnalysis.csv")
PreReqs = PreReqs.rename(columns={'entityLabel': 'MR_ID'})

# Stack both visits, then keep only sessions that also appear in the summary.
F = pd.concat([F1, F2], axis=0)
print(F.shape)
F = PreReqs[['MR_ID']].merge(F, on='MR_ID', how='inner', indicator=True)
F._merge.value_counts()


In [None]:
# Report the sizes of the per-visit inputs and of the merged frame.
for frame in (F1, F2, F):
    print(frame.shape)

# MR_ID is split on '_': the first piece is stored as the subject, the second
# as the redcap_event.
mr_id_parts = F['MR_ID'].str.split('_', expand=True)
F['subject'] = mr_id_parts[0]
F['redcap_event'] = mr_id_parts[1]
F.to_csv('Ftest.csv', index=False)

In [None]:
# Rows whose version is "[]" are not empty: they correspond to wide files that have quotes between lines,
# i.e. `brackets` has data, but needs its version from REDCap.

# `empties` has a version and validating image data, so the user might be able to reconstruct the info.
# These rows have REDCap data but no stats.

In [None]:
# Keep one row per MR session.
print(F.shape)
F = F.drop_duplicates(subset='MR_ID')
print(F.shape)

# Set aside the stats rows with no counterbalance (version recorded as "[]").
version_is_brackets = F.version == "[]"
brackets = F.loc[version_is_brackets]
F = F.loc[~version_is_brackets]
print(F.shape)
brackets

In [None]:
# Stats columns: four summary averages, then one column per face (f1..f10) for
# each block (mem1, mem2, rec1, rec2); all response columns come first,
# followed by all reaction-time columns.
statscols = ['avg_mem_resp', 'ave_mem_rt', 'avg_rec_resp', 'avg_rec_rt']
for measure in ('resp', 'rt'):
    for block_name in ('mem1', 'mem2', 'rec1', 'rec2'):
        statscols += ['f%d_%s_%s' % (face, block_name, measure) for face in range(1, 11)]

# Rows where every stats column is missing are set aside in `empties`;
# `Ft` keeps the rows that have at least one stat.
all_stats_missing = F[statscols].isnull().all(axis=1)
empties = F[all_stats_missing]
Ft = F.dropna(subset=statscols, how='all')
Ft.shape
empties.shape

In [None]:
# Compare the sizes of the non-empty IntraDB stats table and the REDCap recall table.
print("IntraDB counts:",Ft.shape)
print("REDCap RECall:",hcat.shape)
# Frequency of each REDCap counterbalance version (value_counts skips missing values by default).
hcat.version_form.value_counts()

In [None]:
hcat.version_form.value_counts(dropna=False)
# Show the rows that have no REDCap counterbalance version.
print(hcat.loc[hcat.version_form.isnull()])

# Only keep facename data for subjects whose scans passed the prereqs and
# whose stats files are non-empty; an empty REDCap side is fine (left merge).
IntRed = Ft.merge(
    hcat,
    how='left',
    left_on=['subject', 'redcap_event'],
    right_on=['src_subject_id', 'redcap_event'],
)
print("IntRed:", IntRed.shape)

In [None]:
# Drop records that have neither a REDCap version nor an IntraDB version.
print("1", IntRed.shape)
print("2. Data in either Redcap or IntraDB")
missing_both = IntRed.version_form.isnull() & IntRed.version.isna()
IntRedSlim = IntRed.loc[~missing_both].copy()
print("2", IntRedSlim.shape)

# Sanity check: no completely empty record may be left.
still_missing_both = IntRedSlim.version_form.isnull() & IntRedSlim.version.isna()
print("should be empty:", IntRedSlim.loc[still_missing_both].shape)

# The old index is kept as an 'index' column.
IntRedSlim = IntRedSlim.reset_index()
print(IntRedSlim.shape)

In [None]:
# REDCap-derived columns: the counterbalance version followed by the recall
# score and free-text "other" answer for each of the ten faces. The list is
# generated so that no column is listed twice (the hand-written list repeated
# f1_recall / f1_other, which put a duplicate label into recallvars).
redcapvars = ['version_form'] + [
    f'f{face}_{suffix}' for face in range(1, 11) for suffix in ('recall', 'other')
]
# Integer-like columns (version + recall scores) and free-text columns.
recallvars = [col for col in redcapvars if 'recall' in col or 'version_form' in col]
stringvars = [col for col in redcapvars if 'other' in col]

In [None]:
# Partition the records by how the REDCap version relates to the IntraDB one:
#   fixit1 - REDCap form 1 with IntraDB 'CB2', or REDCap form 2 with 'CB1'
#   fixit2 - no REDCap version at all
#   stayit - everything else
redcap_form = IntRedSlim.version_form
scan_version = IntRedSlim.version
version_mismatch = ((redcap_form == 1) & (scan_version == 'CB2')) | ((redcap_form == 2) & (scan_version == 'CB1'))
no_redcap_form = redcap_form.isnull()

stayit = IntRedSlim.loc[~(version_mismatch | no_redcap_form)].copy()
fixit1 = IntRedSlim.loc[version_mismatch].copy()
fixit2 = IntRedSlim.loc[no_redcap_form].copy()
for part in (IntRedSlim, stayit, fixit1, fixit2):
    print(part.shape)


In [None]:
# Blank out the REDCap values of the mismatched records: first the free-text
# columns together with the version, then the recall columns.
for cols_to_blank in (stringvars + ['version_form'], recallvars):
    fixit1[cols_to_blank] = ''

In [None]:
# Look for subjects whose V1 and V2 used the same counterbalance
# (CB1 twice or CB2 twice).
print("startcount", stayit.shape)
stayit2 = stayit.copy()
compare_cols = ['version', 'version_form', 'subject']
v1peeps = stayit2.loc[stayit2.redcap_event == 'V1', compare_cols]
v2peeps = stayit2.loc[stayit2.redcap_event == 'V2', compare_cols]

# Pair each subject's V1 record with their V2 record (scan merge).
v1v2 = v1peeps.merge(v2peeps, on='subject', how='inner')
print(v1v2.shape)

print(v1v2.shape)
v1v2.head()
print("the guys that accidentally got the same Facename Scan")
same_scan_version = v1v2.version_x == v1v2.version_y
a = v1v2.loc[same_scan_version]
print(a)

print("the guys that accidentally got the same REDCap")
form_v1 = v1v2.version_form_x.astype(str)
form_v2 = v1v2.version_form_y.astype(str)
# Compared as strings; pairs whose V2 form renders as '<NA>' are not counted.
b = v1v2.loc[(form_v1 == form_v2) & (form_v2 != '<NA>')]
print(b)

# Nothing to remove.

In [None]:
# REMEMBER:
# - `brackets` rows (version == "[]") are not empty: they correspond to wide
#   files that have quotes between lines, i.e. they have data but need their
#   version from REDCap.
# - `empties` rows have a version and validating image data, so the user might
#   be able to reconstruct the info; they have REDCap data but no stats.

# ADD BACK THE STRAGGLERS
bfind = pd.merge(
    brackets.drop(columns='_merge'),
    hcat,
    left_on=['subject', 'redcap_event'],
    right_on=['src_subject_id', 'redcap_event'],
    how='inner',
)
# Rebuild the scan version ("CB1"/"CB2") from the REDCap form number. Rows
# without a REDCap form are left missing rather than being labelled with a
# bogus "CB<NA>" / "CBnan" string.
has_redcap_form = bfind['version_form'].notna()
bfind['version'] = ("CB" + bfind['version_form'].astype('str')).where(has_redcap_form)

bempties = pd.merge(
    empties.drop(columns='_merge'),
    hcat,
    left_on=['subject', 'redcap_event'],
    right_on=['src_subject_id', 'redcap_event'],
    how='inner',
)

In [None]:
# Concatenate all the pieces and display the result with the NDA identifier
# columns first, to check that it worked.
result_parts = [stayit, fixit1, fixit2, bfind, bempties]
IntRedSlimFixed = pd.concat(result_parts, axis=0)
firstcols = ['subjectkey', 'src_subject_id', 'sex', 'interview_date', 'interview_age']
othercols = [name for name in IntRedSlimFixed.columns if name not in firstcols]
# NOTE(review): the reordered frame is only displayed, not assigned back, so
# IntRedSlimFixed keeps its concat column order - confirm that is intended.
IntRedSlimFixed[firstcols + othercols]


In [None]:
# Write out the csv for validation.
filePath = pathout + '/facename01.csv'

# Start from a clean file, because the writer below opens it in append mode.
if os.path.exists(filePath):
    os.remove(filePath)
else:
    print("Can not delete the file as it doesn't exists")

# Bookkeeping columns that must not be written to the output. The REDCap
# 'version_form' is dropped so that the IntraDB 'version' can take its name.
helper_cols = ['version_form', 'file_name', 'subject', 'redcap_event', '_merge', 'index', 'MR_ID']

with open(filePath, 'a') as f:
    # The "facename,1" line precedes the CSV header (presumably the NDA
    # structure name/version line - confirm against the NDA template).
    f.write("facename,1\n")
    # rename() needs the `columns=` keyword: a bare dict is applied to the row
    # index, which left the 'version' column un-renamed in the output file.
    (IntRedSlimFixed
        .drop(columns=helper_cols)
        .rename(columns={'version': 'version_form'})
        .to_csv(f, index=False))


In [None]:
# Columns handed to the integer cleanup: the in-scanner response columns for
# every block (mem1, mem2, rec1, rec2) and face (f1..f10) ...
tointlist = []
for block_name in ('mem1', 'mem2', 'rec1', 'rec2'):
    tointlist.extend('f%d_%s_resp' % (face, block_name) for face in range(1, 11))

# ... and the REDCap recall columns. The original list names f1..f10_recall
# twice; that is reproduced here so the call receives the exact same varlist.
# NOTE(review): the repetition looks redundant - confirm and drop if so.
recall_once = ['f%d_recall' % face for face in range(1, 11)]
tointlist2 = recall_once + recall_once

cleanupfuncs.integercleanup(structure='facename01', filePath="./prepped/hca/", varlist=tointlist + tointlist2)