In [None]:
import os
import pandas
import difflib
import numpy as np

In [None]:
# working directory: the 'stuff' folder directly under the current working directory
base_dir = os.getcwd()
wdir = os.path.join(base_dir, 'stuff')

In [None]:
# load data: read the AV1451 spreadsheet from wdir and order its rows
# by RID first, then by EXAMDATE
adni_csv = os.path.join(wdir, 'UCBERKELEYAV1451_11_14_17.csv')
adnidf = pandas.read_csv(adni_csv).sort_values(by=['RID', 'EXAMDATE'])

In [None]:
# identify target columns
# HINT: you may want to use the data dictionary to convert the columns to a more
# comprehensible format -- otherwise the method employed in this notebook won't really work

# 'CTX_' columns, leaving out any whose name mentions 'SIZE'
ctx_cols = []
for col in adnidf.columns:
    if 'CTX_' in col and 'SIZE' not in col:
        ctx_cols.append(col)

# columns naming one of the structures below, again leaving out the 'SIZE' ones
hits = ['AMYGDALA', 'CAUDATE', 'HIPPOCAMPUS', 'PALLIDUM', 'PUTAMEN', 'THALAMUS', 'ACCUMBENS']
subcols = []
for col in adnidf.columns:
    if 'SIZE' in col:
        continue
    if any(name in col for name in hits):
        subcols.append(col)

# RID goes first, followed by the two groups of target columns
goodcols = ['RID'] + ctx_cols + subcols


In [None]:
# make new spreadsheet with just the target columns
nsdf = adnidf.loc[:, goodcols].copy()
# discard the column sitting at position 0
nsdf = nsdf.drop(nsdf.columns[0], axis=1)
# overwrite the name of the 7th column from the end; every other name is kept
renamed = nsdf.columns.tolist()
renamed[-7] = 'RIGHT_ACCUMBENS_AREA'
nsdf.columns = renamed

In [None]:
# (rows, columns) of the target-column table, shown for reference
nsdf.shape

In [None]:
# keep only the rows that have no missing value in any column, then show the new shape
nsdf = nsdf.dropna()
nsdf.shape

In [None]:
# load the spreadsheet containing the label-ROI map for the volumetric DKT atlas
# (read without a header row; the two column names are assigned afterwards)
labels_csv = os.path.join(wdir, 'dst_labels.csv')
ldf = pandas.read_csv(labels_csv, header=None)
ldf.columns = ['label', 'ROI']
ldf.head()

In [None]:
# subcortical targets...
# (rows at positions 70 through 78 of the ROI column)
ldf['ROI'].iloc[70:79]

In [None]:
# add hemispheric info
def _tagged(prefix, rois):
    """Upper-case each ROI name, strip its spaces and put `prefix` plus a space in front."""
    return [prefix + ' ' + roi.upper().replace(' ', '') for roi in rois]


roi_names = ldf['ROI']
lhctx = _tagged('LH', roi_names.iloc[:31])
lsc = _tagged('LEFT', roi_names.iloc[31:39])
rhctx = _tagged('RH', roi_names.iloc[39:70])
rsc = _tagged('RIGHT', roi_names.iloc[70:78])
# the four groups together cover the first 78 rows, in row order
ldf.loc[ldf.index[:78], 'newlabs'] = lhctx + lsc + rhctx + rsc

In [None]:
# use difflib to find the best matches for each column
# Only the first 78 rows of ldf carry a 'newlabs' value, so only those are matched;
# rows are taken by position, the same way 'newlabs' was written.
n_labelled = 78
candidates = nsdf.columns.tolist()
best_matches = []
for roi in ldf['newlabs'].iloc[:n_labelled]:
    # get_close_matches lists candidates best-first; an empty list means that
    # nothing was similar enough, which is recorded as NaN
    close = difflib.get_close_matches(roi, candidates)
    best_matches.append(close[0] if close else np.nan)
# Assign the whole column at once as object dtype so it can hold both strings and
# NaN; building it cell by cell could start it out as a float column and then
# depend on dtype upcasting when the first string arrives. Rows beyond the
# labelled ones are left as NaN by index alignment.
ldf['matching_lab'] = pandas.Series(best_matches,
                                    index=ldf.index[:n_labelled],
                                    dtype=object)

In [None]:
# deal with the accumbens...
# (row position, label) pairs applied in order: two rows are pointed at the
# accumbens columns by hand and two rows have their match blanked out
accumbens_fixes = [
    (31, 'LEFT_ACCUMBENS_AREA'),
    (33, np.nan),
    (70, 'RIGHT_ACCUMBENS_AREA'),
    (72, np.nan),
]
for pos, lab in accumbens_fixes:
    ldf.loc[ldf.index[pos], 'matching_lab'] = lab

In [None]:
# get rid of the ROIs that seem to be missing from the volumetric atlas..
# (any column whose name contains one of these substrings)
missing_tags = ('POLE', 'BANKS', 'UNKNOWN')
to_drop = [col for col in nsdf.columns if any(tag in col for tag in missing_tags)]
nsdf = nsdf.drop(to_drop, axis=1)
nsdf.shape

In [None]:
# reindex the spreadsheet
# columns are laid out in the row order of ldf, using every non-missing matched label
matched_labels = ldf['matching_lab'].dropna().values
nsdf = nsdf.reindex(columns=matched_labels)

In [None]:
# now deal with that weird unidentifiable ROI...
# two placeholder names are slotted in so that they end up at positions 33 and 72
# of the final column list
final_cols = nsdf.columns.tolist()
final_cols.insert(33, 'MISSING1')
final_cols.insert(72, 'MISSING2')

In [None]:
# last step!
# lay the columns out as listed in final_cols; names in that list that are not
# existing columns come out as all-NaN columns
nsdf = nsdf.reindex(final_cols, axis='columns')

In [None]:
# how did we do?
# pair each final column name with the atlas ROI at the same position
[(col, roi) for col, roi in zip(nsdf.columns, ldf['ROI'])]