In [28]:
import pandas as pd
import numpy as np

In [29]:
# onedrive = 'F:/onedrive/'
onedrive = "C:/Users/jokit/OneDrive/"
path = f'{onedrive}Phd workshop/fmri/tadpole_challenge/'
data = pd.read_csv(f'{path}TADPOLE_D1_D2.csv')
D3 = pd.read_csv(f'{path}TADPOLE_D3.csv')
v1 = data['D1'].values
v2 = data['D2'].values
D1 = data.loc[v1 == 1,:]
D2 = data.loc[v2 == 1,:]

  interactivity=interactivity, compiler=compiler, result=result)


In [30]:
cols = D3.columns.values
subD1 = D1[cols]
subD1['D'] = np.repeat('D1',len(subD1))

subD2 = D2[cols]
subD2['D'] = np.repeat('D2',len(subD2))

D3['D'] = np.repeat('D3',len(D3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Begin preprocessing the data

In [31]:
allData = pd.concat([subD1,subD2,D3],axis=0)
visc = allData['VISCODE'].values
age = allData['AGE'].values
for i,v in enumerate(visc):
    if v.startswith('m'):
        inc = float(v.split('m')[1])/12.
        age[i] += inc
allData['AGE'] = age

In [32]:
# infer the categorical features
import re
prefix = '(^\d+(\.\d+)?$)'
onehotMask = [False if re.match(prefix,str(x)) else True for x in allData.iloc[0,:].values]
contMask = [not x for x in onehotMask]

In [33]:
# predicting adas13
onehotCols = ['PTGENDER','PTETHCAT','PTRACCAT','PTMARRY']
contCols = ['AGE','PTEDUCAT','Hippocampus',\
            'WholeBrain','Entorhinal','Fusiform','MidTemp']

In [34]:
allData = allData.loc[~np.isnan(allData['ADAS13'].values),:]
# lots of the records are duplicated.
allData = allData.drop_duplicates(subset=['RID','AGE'])

## preprocessing for LGPR

In [9]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
contData = allData[contCols].values
onehotData = allData[onehotCols].values

scaler = StandardScaler()
imp = IterativeImputer(max_iter=30, random_state=0)
dataScaler = scaler.fit_transform(contData)
dataScaler = imp.fit_transform(dataScaler)

In [10]:
def categorizeFeature(x):
    idmap = {each:i for i,each in enumerate(set(x))}
    if len(idmap) > 1:
        return np.array([idmap[each] for each in x]).reshape(-1,1)
    return None

In [11]:
colnames = []
cateRes = None
for each in range(onehotData.shape[1]):
    if cateRes is None:
        r = categorizeFeature(onehotData[:,each])
        if r is not None:
            cateRes = r
            colnames.append(onehotCols[each])
    else:
        r = categorizeFeature(onehotData[:,each])
        if r is not None:
            cateRes = np.concatenate([cateRes,r],axis=1)
            colnames.append(onehotCols[each])
cateCols = np.arange(len(colnames))
colnames.extend(contCols)
colnames = np.array(colnames)
contCC = np.arange(len(cateCols),len(colnames))

## preprocessing for other algorithms

In [35]:
colnames = []
onehotModel = OneHotEncoder()
cateRes = onehotModel.fit_transform(onehotData.astype(str)).toarray()
for i in range(len(onehotModel.categories_)):
    colnames.extend([onehotCols[i]+'_'+str(x) for x in onehotModel.categories_[i]])
colnames.extend(contCols)
colnames = np.array(colnames)

In [36]:
idCol = allData['RID'].values
oidCol = allData['AGE'].values
X = np.concatenate([cateRes,dataScaler],axis=1)
y = allData['ADAS13'].values

In [20]:
# format a small dataset containing only several individuals
from collections import Counter
ct = Counter(idCol)
top = 50
targetIds = set([x[0] for x in ct.most_common()[:top]])
mask = [x in targetIds for x in idCol]
idCol = idCol[mask]
oidCol = oidCol[mask]
X = X[mask]
y = y[mask]

In [21]:
from collections import defaultdict

def splitTrainTest(ids, time, prop=.7, seed = 19):
    np.random.seed(seed)
    mask = np.array([np.random.uniform() < prop for x in ids])
    
    trainIdx, trainTime = ids[mask], time[mask]
    testIdx,testTime = ids[~mask],time[~mask]
    trainDict = defaultdict(list)
    testDict = defaultdict(list)
    
    for i,m in enumerate(mask):
        if m:
            trainDict[ids[i]].append((time[i],i))
        else:
            testDict[ids[i]].append((time[i],i))
    for k,v in trainDict.items():
        trainV = [x[0] for x in v]
        trainVid = [x[1] for x in v]
        tv = testDict.get(k,None)
        if tv is not None:
            testV = [x[0] for x in tv]
            testVid = [x[1] for x in tv]
            maxId = np.argmax(trainV)
            minId = np.argmin(testV)
            while trainV[maxId] > testV[minId]:
                tmp = trainV[maxId]
                trainV[maxId] = testV[minId]
                testV[minId] = tmp
                tmp = trainVid[maxId]
                trainVid[maxId] = testVid[minId]
                testVid[minId] = tmp
                maxId = np.argmax(trainV)
                minId = np.argmin(testV)
            trainDict[k] = [(x,y) for x,y in zip(trainV,trainVid)]
            testDict[k] = [(x,y) for x,y in zip(testV,testVid)]
    train = []
    test = []
    for k,v in trainDict.items():
        train.extend([x[1] for x in v])
    for k,v in testDict.items():
        test.extend([x[1] for x in v])
    return np.array(train), np.array(test)

def getIndvFixFeature(ids,x):
    dt = defaultdict(list)
    for i,k in enumerate(ids):
        dt[k].append(x[i])
    noChange = np.repeat(True,x.shape[1])
    for k,v in dt.items():
        v = np.array(v)
        for i in range(v.shape[1]):
            if np.sum(v[:,i] - v[0,i]) != 0:
                noChange[i] = False
    return noChange

from scipy import io
def generate(seed = 19,density = 0.7,name = 'tadpole'):
    trainIdx, testIdx = splitTrainTest(idCol, X[:,np.where(colnames == 'AGE')[0]].reshape(-1),density,seed)
    if name == 'tadpoleLGPR':
        io.savemat(f'../tadpoleLGPR_{seed}',{'trainId':idCol[trainIdx],'trainOid':oidCol[trainIdx],'trainX':X[trainIdx],'trainY':y[trainIdx],
                                  'testId':idCol[testIdx],'testOid':oidCol[testIdx],'testX':X[testIdx],'testY':y[testIdx],'colname':colnames,
                                        'cateCols':cateCols, 'contCols':contCC})
    else:
        io.savemat(f'../tadpole_{seed}',{'trainId':idCol[trainIdx],'trainOid':oidCol[trainIdx],'trainX':X[trainIdx],'trainY':y[trainIdx],
                                  'testId':idCol[testIdx],'testOid':oidCol[testIdx],'testX':X[testIdx],'testY':y[testIdx],'colname':colnames})

In [27]:
generate(seed=14,name='tadpole')