# Data Preprocessing

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Importing the dataset with library pandas
dataset = pd.read_csv('TADPOLE_InputData.csv')
labels_train = pd.read_csv('TADPOLE_TargetData_train.csv')
labels_test = pd.read_csv('TADPOLE_TargetData_test.csv')

# Tally the data types of all data columns, and then separate them according to dtype.
dtypeCounts = dataset.dtypes.value_counts()  # Number of columns per dtype. Turns out to be only 'float64' and 'object'.
# Work on explicit copies: assigning into the frames returned by select_dtypes is what
# raises pandas' SettingWithCopyWarning.
numDataset = dataset.select_dtypes(include=['float']).copy()
objDataset = dataset.select_dtypes(include=['object']).copy()

# Some numerical columns end up in objDataset because a few entries carry non-numerical
# symbols, such as '>1300'. They are converted to floats by keeping only the first number
# (integer or decimal) found in each string entry.
columnsTofix = ['ABETA_UPENNBIOMK9_04_19_17','TAU_UPENNBIOMK9_04_19_17','PTAU_UPENNBIOMK9_04_19_17','COMMENT_UPENNBIOMK9_04_19_17']


def _first_number(entry):
    """Return the first number found in a string entry as a float.

    Non-string entries (floats, NaN) are returned unchanged. A string that
    contains no digits becomes NaN instead of raising.
    """
    if not isinstance(entry, str):
        return entry
    match = re.search(r'\d+\.?\d*', entry)  # escaped dot: only a literal decimal point is accepted
    if match is None:
        return np.nan
    return float(match.group(0))


for column in columnsTofix:
    # Whole-column assignment avoids the positional-vs-label mix-up of per-cell writes.
    objDataset[column] = pd.to_numeric(objDataset[column].apply(_first_number), errors='coerce')  # cast each column to float type
numDataset = pd.concat([numDataset, objDataset[columnsTofix]], axis=1)  # Move this section of data to numDataset
objDataset = objDataset.drop(columnsTofix, axis=1)  # Remove this section from objDataset


# Columns with mixed, not-yet-understood content; they are excluded from objDataset.
badColumns = [
    'update_stamp_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'update_stamp_UCSFFSX_11_02_15_UCSFFSX51_08_01_16',
    'update_stamp_UCBERKELEYAV45_10_17_16',
    'update_stamp_DTIROI_04_30_14',
]
objDataset = objDataset.drop(labels=badColumns, axis=1)  # drop them column-wise

''' The badColumns are strange:  
    They have mixed data types, some entries (e.g. 33:37.0) seem to be timing, 
    but the others (e.g. 428e+4), seem to be float with the same repeating values.
    What do they mean? Before understanding them, I leave them out of objDataset.
'''

''' The following code was used to detect the problematic columns (columnsTofix and badColumns) mentioned above
for column in objDataset:
    if sum(objDataset[column].apply(type) == float) -  sum(pd.isnull(objDataset[column])) > 0: # returns true if there is at least one entry that is float but not 'NaN'. 'NaN' are excluded since they are float, but not really numerical.
        print(column)
'''

# Pull date columns out of objDataset, which is meant to hold categorical data only.
date_pattern = '[0-9]+/[0-9]+/[0-9]+'
for column in objDataset.columns:  # each `column` is a column label
    date_like_entries = objDataset[column].str.match(date_pattern)
    if date_like_entries.sum() > 0:  # at least one entry matches the date pattern
        # Parse the whole column; entries that do not parse become NaT.
        objDataset[column] = pd.to_datetime(objDataset[column], format="%m/%d/%y", errors='coerce')
dateDataset = objDataset.select_dtypes(include=['datetime64'])  # the columns converted above
objDataset = objDataset.drop(dateDataset.columns, axis=1)  # leave only non-date columns behind

# Count of non-NaN entries per column of numDataset: besides imputing, we want to know how
# much gets imputed, especially for columns that start out very sparse.
nonnanNumDataset = numDataset.count()
# Histogram of the non-NaN fraction per column. The denominator is the actual number of rows
# (previously hard-coded as 8717, presumably the row count of the original CSV).
temp = nonnanNumDataset / float(len(numDataset))
temp.hist(bins=50)
numX = numDataset.values  # ndarray is easier to work with for the sklearn imputer, see below

# Imputing missing data in numX
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0, verbose=0, copy=True)  # impute numerical columns
numX2 = imp.fit_transform(numX)

# Columns of numX that hold no observed value at all. With axis=0 the Imputer discards such
# columns on transform, so numX2 has one column fewer for each index listed here.
# The previous approach flagged columns whose nansum was 0 and then removed "false positives"
# found in numX2, but those indices live in numX2's (shifted) column space, so the wrong
# entries could be removed or list.remove could raise.
nan_index = [int(i) for i in np.where(pd.isnull(numX).all(axis=0))[0]]

# Informational only: imputed columns that happen to sum to zero (indices refer to numX2).
nan_index2 = [int(i) for i in np.where(np.nansum(numX2, axis=0) == 0)[0]]

# Names of the columns that survived imputation, in numX2 column order.
dropped_columns = set(nan_index)
num_cat_names = [name for i, name in enumerate(numDataset.columns) if i not in dropped_columns]
assert len(num_cat_names) == numX2.shape[1], \
    'column names and imputed matrix are out of sync'

# Map each surviving column name to its imputed column.
cat_name_to_num_dict = {}
for i, name in enumerate(num_cat_names):
    cat_name_to_num_dict[name] = numX2[:, i]


# Encoding categorical data. Missing entries ('NaN') are first replaced by the literal
# category 'Null', which gets its own label like any other category.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
objDataset = objDataset.fillna('Null')  # 'Null' has not appeared anywhere else in objDataset.

# Label-encode every column once, into a fresh integer array. Previously the codes were
# written into two arrays obtained from `.values`, which may be views of the same buffer
# (and of objDataset itself), so the frame could be silently overwritten with integer codes.
raw_categories = objDataset.values
label_codes = np.empty(raw_categories.shape, dtype=np.int64)
lab = LabelEncoder()
for i in range(raw_categories.shape[1]):
    label_codes[:, i] = lab.fit_transform(raw_categories[:, i])
catX_pre_ohe = label_codes.astype(np.float64)

# One-hot encoding: a column with three categories 0, 1, 2 becomes three indicator
# columns 100, 010, 001.
ohe = OneHotEncoder(categorical_features = 'all')
catX = ohe.fit_transform(label_codes).toarray()


# Slice the one-hot matrix back into per-column groups. Column i owns (max label + 1)
# consecutive indicator columns.
cat_to_ohe_dict = {}
last_index = 0
for i in range(catX_pre_ohe.shape[1]):
    col_span = catX_pre_ohe[:, i].max()
    splice_index = int(last_index + col_span + 1)
    cat_to_ohe_dict[i] = catX[:, last_index:splice_index]
    last_index = splice_index

# Map category (column) name to its block of one-hot columns.
cat_name_to_cat_dict = {}
cat_names = objDataset.columns
for i in range(len(cat_names)):
    cat_name_to_cat_dict[cat_names[i]] = cat_to_ohe_dict[i]
    


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
def _days_since_first_visit(raw_dates, patient_rows):
    """Convert visit dates to days elapsed since each patient's earliest valid date.

    Parameters
    ----------
    raw_dates : pandas Series of datetimes (NaT where parsing failed).
    patient_rows : mapping of patient id -> row labels belonging to that patient,
        as produced by ``DataFrame.groupby(...).groups``. The labels are also used
        as positions in the result, so a default 0..n-1 index is assumed.

    Returns
    -------
    numpy float array of length ``raw_dates.size``. Rows with an unparseable date,
    or belonging to a patient without any valid date, stay NaN.
    """
    relative_days = np.full(raw_dates.size, np.nan)
    for rows in patient_rows.values():
        patient_dates = raw_dates[rows]
        if patient_dates.empty:
            continue
        first_date = patient_dates.min()
        if pd.isnull(first_date):  # no valid date at all for this patient
            continue
        relative_days[np.asarray(rows)] = (patient_dates - first_date).dt.days.values
    return relative_days


# Work on labels_train: convert dates to number of days relative to the initial date, and then bind to the rest of data and do imputation
idxPTIDTable_train = pd.concat([pd.DataFrame(labels_train.index.values),labels_train['PTID_Key']],axis=1)
dic_train = idxPTIDTable_train.groupby('PTID_Key').groups
dateY_train_raw = pd.to_datetime(labels_train['Date'],format="%m/%d/%y",errors='coerce')
dateY_train = _days_since_first_visit(dateY_train_raw, dic_train)
y_train = np.concatenate((dateY_train[:,None],labels_train.iloc[:,1:].values),axis=1)
# NOTE: this refits the imputer object created for the input features.
imp = imp.fit(y_train)
y_train = imp.transform(y_train)


# Work on labels_test: same treatment; the test file stores dates as YYYY-MM-DD
idxPTIDTable_test = pd.concat([pd.DataFrame(labels_test.index.values),labels_test['PTID_Key']],axis=1)
dic_test = idxPTIDTable_test.groupby('PTID_Key').groups
dateY_test_raw = pd.to_datetime(labels_test['Date'],format="%Y-%m-%d",errors='coerce')
dateY_test = _days_since_first_visit(dateY_test_raw, dic_test)
y_test = np.concatenate((dateY_test[:,None],labels_test.iloc[:,1:].values),axis=1)
imp = imp.fit(y_test)
y_test = imp.transform(y_test)
yAttributes = labels_test.columns.values
yAttributes = np.delete(yAttributes, 1)  # drop the second column name

In [4]:
# Training labels: drop column 0 (the days-since-first-visit feature), then map each
# remaining label name to its column.
y_train_sub_dates = y_train[:, 1:]
cat_name_to_y_train = {
    cat: y_train_sub_dates[:, position]
    for position, cat in enumerate(labels_train.columns[1:])
}

# Test labels: same layout as the training labels.
y_test_sub_dates = y_test[:, 1:]
cat_name_to_y_test = {
    cat: y_test_sub_dates[:, position]
    for position, cat in enumerate(labels_test.columns[1:])
}

# Validation set: read straight from the CSV (no imputation applied here).
labels_valid = pd.read_csv('TADPOLE_PredictTargetData_valid.csv')
y_valid_ids = labels_valid.iloc[:, 1].values  # second column of the validation file

# Map every column name except the first to that column's raw values.
cat_name_to_y_valid = {
    cat: labels_valid.iloc[:, position].values
    for position, cat in enumerate(labels_valid.columns[1:], 1)
}


In [6]:
def get_last_patient_visit(S):
    """Map each patient id to the row index of its last appearance.

    Parameters
    ----------
    S : mapping that contains the key 'PTID_Key', whose value is a sequence of
        patient ids (one per row).

    Returns
    -------
    dict of ``int(patient id) -> index of the last row holding that id``. Keys are
    inserted in order of first appearance.

    A single pass is enough: later rows simply overwrite earlier ones, which avoids
    scanning the whole id column once per row.
    """
    hist_dict = {}
    for row_index, pt_id in enumerate(S['PTID_Key']):
        hist_dict[int(pt_id)] = row_index
    return hist_dict


In [7]:
def make_last_visit(dataset, last_visit_dict):
    """Condense a dataset to one row per patient (the patient's last visit).

    Parameters
    ----------
    dataset : array-like indexed by row; may be 1-D (one value per visit) or
        2-D (one row of values per visit).
    last_visit_dict : dict of patient id -> row index of that patient's last visit.

    Returns
    -------
    numpy float array with one entry per key of ``last_visit_dict``, in the dict's
    iteration order, and the same trailing shape as ``dataset``.
    """
    # Derive the per-row shape explicitly instead of probing shape[1] inside a bare
    # `except`, which would also have hidden unrelated errors.
    row_shape = tuple(np.shape(dataset)[1:])
    last_visit_dataset = np.zeros((len(last_visit_dict),) + row_shape)
    for out_row, last_visit_index in enumerate(last_visit_dict.values()):
        last_visit_dataset[out_row] = dataset[last_visit_index]
    return last_visit_dataset
# Patient id -> last row index, for the input features and for each label set.
X_patient_last_visit = get_last_patient_visit(cat_name_to_num_dict)
y_train_last_visit, y_test_last_visit, y_valid_last_visit = [
    get_last_patient_visit(label_columns)
    for label_columns in (cat_name_to_y_train, cat_name_to_y_test, cat_name_to_y_valid)
]

# Empty container with one row per training patient; filled in by a later cell.
n_train_patients = len(y_train_last_visit)
last_visit_y_train = np.zeros((n_train_patients, y_train_sub_dates.shape[1]))

# One row (last visit) per patient for each label set.
lv_y_train = make_last_visit(y_train_sub_dates, y_train_last_visit)
last_visit_y_test = make_last_visit(y_test_sub_dates, y_test_last_visit)
last_visit_y_valid = make_last_visit(y_valid_ids, y_valid_last_visit)

In [8]:
# Condense the training labels to one visit per patient id by copying each patient's
# last-visit row into last_visit_y_train.
for index_y, last_visit_index in enumerate(y_train_last_visit.values()):
    last_visit_y_train[index_y] = y_train_sub_dates[last_visit_index]
index_y = len(y_train_last_visit)  # number of rows written

# For the input features only the number of patients is recorded.
index_X = len(X_patient_last_visit)

In [9]:
def modify_X_to_y(X, X_last, y_last):
    """Build a copy of X with one row per patient of ``y_last``, in the same order.

    Parameters
    ----------
    X : 2-D numpy array whose first column holds the patient id of each row.
    X_last : dict of patient id -> last row index in X. Accepted for interface
        compatibility; it is not consulted.
    y_last : dict keyed by patient id; only its keys (and their order) are used.

    Returns
    -------
    2-D float array of shape ``(len(y_last), X.shape[1])`` where row k is the
    FIRST row of X whose id column equals the k-th key of ``y_last``.
    NOTE(review): if the last visit was intended, ``X_last`` should be used here —
    confirm with the author before changing.

    Raises
    ------
    ValueError if a patient id of ``y_last`` does not occur in X.
    """
    y_PT_ID = y_last.keys()
    modified_X = np.zeros((len(y_PT_ID), X.shape[1]))

    # One pass over the id column instead of a linear list search per patient.
    first_row_of = {}
    for row, row_id in enumerate(X[:, 0].tolist()):
        first_row_of.setdefault(row_id, row)

    for index, pt_id in enumerate(y_PT_ID):
        if pt_id not in first_row_of:
            raise ValueError('patient id %r is not present in X' % (pt_id,))
        modified_X[index] = X[first_row_of[pt_id]]
    return modified_X

In [10]:
#Ventricles_norm is computed as "Ventricles" divided by "ICV"
def DXCHANGE_to_diagnosis(DXCHANGE_col):
    """One-hot encode DXCHANGE codes as (healthy, MCI, AD) indicator rows.

    Codes 1, 7, 9 mark a healthy control; 2, 4, 8 mark MCI; 3, 5, 6 mark an
    Alzheimer's diagnosis. Each entry is truncated with ``int`` before lookup.
    Any other code leaves its row as all zeros.

    Returns a float array of shape ``(len(DXCHANGE_col), 3)``.
    """
    code_groups = ((1, 7, 9), (2, 4, 8), (3, 5, 6))  # healthy, MCI, AD
    column_of_code = {}
    for column, codes in enumerate(code_groups):
        for code in codes:
            column_of_code[code] = column

    n_rows = len(DXCHANGE_col)
    expanded_DXCHANGE = np.zeros((n_rows, 3))
    for row in range(n_rows):
        column = column_of_code.get(int(DXCHANGE_col[row]))
        if column is not None:
            expanded_DXCHANGE[row, column] = 1
    return expanded_DXCHANGE

def diagnosis_to_classes(y_diagnosis_cols):
    """Collapse one-hot diagnosis rows into a single class column.

    Rows equal to [1,0,0], [0,1,0] and [0,0,1] become 0, 1 and 2 respectively.
    A row matching none of these patterns keeps the initial value 0.

    Returns a float array of shape ``(y_diagnosis_cols.shape[0],)``.
    """
    patterns = (np.array([1, 0, 0]), np.array([0, 1, 0]), np.array([0, 0, 1]))
    n_rows = y_diagnosis_cols.shape[0]
    condensed_classes = np.zeros((n_rows,))
    for row in range(n_rows):
        for class_id, pattern in enumerate(patterns):
            if (y_diagnosis_cols[row] == pattern).all():
                condensed_classes[row] = class_id
                break
    return condensed_classes
def classes_to_diagnosis(y_diagnosis_col):
    """Expand a class column (0-healthy, 1-MCI, 2-AD) into one-hot rows.

    This is the inverse direction of ``diagnosis_to_classes``: class 0 becomes
    [1,0,0], class 1 becomes [0,1,0] and class 2 becomes [0,0,1]. Any other
    value yields an all-zero row.

    Returns a float array of shape ``(y_diagnosis_col.shape[0], 3)``.
    """
    n_rows = y_diagnosis_col.shape[0]
    condensed_classes = np.zeros((n_rows, 3))
    for row in range(n_rows):
        label = y_diagnosis_col[row]
        for class_id in (0, 1, 2):
            if label == class_id:
                condensed_classes[row, class_id] = 1
                break
    return condensed_classes
                                                            
    

In [11]:
# Imputed DXCHANGE column of the input features, expanded to one-hot diagnosis rows.
# NOTE(review): exp_DXCHANGE is not referenced again in the cells shown here.
dxchange = cat_name_to_num_dict['DXCHANGE']
exp_DXCHANGE = DXCHANGE_to_diagnosis(dxchange)
# Columns 1..3 of y_train_sub_dates are read as the one-hot diagnosis columns
# (presumably CN / MCI / AD - confirm against the label file header) and collapsed
# to a single class id per visit.
y_train_condensed_classes = diagnosis_to_classes(y_train_sub_dates[:,1:4])

# Keep only each training patient's last visit.
y_train_lv_condensed_classes = make_last_visit(y_train_condensed_classes, y_train_last_visit)

In [12]:
# Build one feature matrix per label set, with one row per patient in the order of that
# set's last-visit dict. modify_X_to_y matches patients on column 0 of numX2; the
# X_patient_last_visit argument is passed but not read by modify_X_to_y.
X_mod_train = modify_X_to_y(numX2, X_patient_last_visit, y_train_last_visit)#X input w/ corresponding patient ids as train set
X_mod_test = modify_X_to_y(numX2, X_patient_last_visit, y_test_last_visit)#X input w/ corresponding pt ids as test
X_mod_valid = modify_X_to_y(numX2, X_patient_last_visit, y_valid_last_visit)#X input w/ corresponding pt ids as valid



# Implementing Regression Models

##### MMSE

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Features: every column except column 0, which modify_X_to_y treats as the patient id.
X = X_mod_train[:,1:] 
# Target: column 6 of the last-visit training labels (presumably MMSE, per this
# section's heading - confirm against the label file header).
y_mmse = lv_y_train[:,6]

# NOTE(review): despite the name `MMSE_lin_reg`, this is a LogisticRegression, i.e. each
# distinct target value is treated as a class. C=1e5 means very weak regularisation.
MMSE_lin_reg = LogisticRegression(C=1e5)
MMSE_lin_reg.fit(X,y_mmse)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [32]:
# Predict with the fitted MMSE model for the test and validation patients
# (column 0, the patient id, is excluded from the features).
MMSE_test_predict = MMSE_lin_reg.predict(X_mod_test[:,1:])
MMSE_valid_predict = MMSE_lin_reg.predict(X_mod_valid[:,1:])

##### Diagnosis

In [16]:
# Diagnosis classifier: logistic regression (C=1e5, i.e. very weak regularisation) on the
# training features, with the last-visit class ids produced by diagnosis_to_classes
# (0, 1 or 2) as target.
logreg_diagnosis = LogisticRegression(C=1e5)
# Column 0 (patient id) is excluded from the features.
X = X_mod_train[:,1:] 
y = y_train_lv_condensed_classes
logreg_diagnosis.fit(X, y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Test set: predicted class per patient, its one-hot expansion, and the per-class
# probabilities from the same model.
diagnosis_predict_test = logreg_diagnosis.predict(X_mod_test[:,1:])
expanded_cols_test = classes_to_diagnosis(diagnosis_predict_test)
expanded_prob_test = logreg_diagnosis.predict_proba(X_mod_test[:,1:])

# Validation set: same three outputs.
diagnosis_predict_valid = logreg_diagnosis.predict(X_mod_valid[:,1:])
expanded_cols_valid = classes_to_diagnosis(diagnosis_predict_valid)
expanded_prob_valid = logreg_diagnosis.predict_proba(X_mod_valid[:,1:])

##### ADAS13

In [18]:
# NOTE(review): a LogisticRegression is used here as well, so every distinct target
# value is treated as a class label.
logreg_adas13 = LogisticRegression(C=1e5)
# Column 0 (patient id) is excluded from the features.
X = X_mod_train[:,1:] 
# Target: column 4 of the last-visit training labels (presumably ADAS13, per this
# section's heading - confirm against the label file header).
y = lv_y_train[:,4]
logreg_adas13.fit(X, y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [19]:
# Predict with the fitted ADAS13 model for the test and validation patients
# (patient id column excluded).
predicted_adas13_test=logreg_adas13.predict(X_mod_test[:,1:])
predicted_adas13_valid=logreg_adas13.predict(X_mod_valid[:,1:])

##### Ventricle Norms

In [20]:
from sklearn.linear_model import LinearRegression
# Column 0 (patient id) is excluded from the features.
X = X_mod_train[:,1:] 
# Target: column 5 of the last-visit training labels (presumably the normalised
# ventricle volume, per this section's heading - confirm against the label file header).
y = lv_y_train[:,5]

# Ordinary least squares; normalize=True asks sklearn to normalise the regressors
# before fitting.
vn_lin_reg = LinearRegression(normalize=True)
vn_lin_reg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [21]:
# Linear-regression predictions for the test and validation patients; these are
# unconstrained and are post-processed by floor_vn_values to replace negative values.
vn_predict_test = vn_lin_reg.predict(X_mod_test[:,1:])
vn_predict_valid = vn_lin_reg.predict(X_mod_valid[:,1:])

In [22]:
def get_col_index(category):
    """Return the position of ``category`` within the module-level ``num_cat_names`` list.

    Raises ValueError (from ``list.index``) when the name is not present.
    """
    column_position = num_cat_names.index(category)
    return column_position

In [23]:
def floor_vn_values(vn_prediction, X_corr):
    """Replace negative ventricle-norm predictions with a value taken from the features.

    The linear model can output negative values. For each such row the prediction
    is replaced by ``Ventricles / ICV`` read from the first row of ``X_corr`` whose
    id (column 0) equals that row's id. Other predictions are copied unchanged.

    Parameters
    ----------
    vn_prediction : sequence of predicted values, one per row of ``X_corr``.
    X_corr : 2-D array with the patient id in column 0; the 'Ventricles' and 'ICV'
        columns are located with ``get_col_index``.

    Returns
    -------
    1-D float array of length ``len(vn_prediction)``.
    """
    vent_index = get_col_index('Ventricles')
    icv_index = get_col_index('ICV')
    patient_ids = X_corr[:, 0]
    mod_vn_predict = np.zeros((len(vn_prediction),))
    for i in range(len(patient_ids)):
        predicted = vn_prediction[i]
        if not predicted < 0:
            mod_vn_predict[i] = predicted
            continue
        # Negative prediction: fall back to the ratio stored in the feature matrix.
        source_row = X_corr[list(patient_ids).index(patient_ids[i])]
        mod_vn_predict[i] = source_row[vent_index] / source_row[icv_index]
    return mod_vn_predict

In [24]:
# Replace negative ventricle-norm predictions (see floor_vn_values) for both sets.
floored_vn_predict_test = floor_vn_values(vn_predict_test, X_mod_test)
floored_vn_predict_valid = floor_vn_values(vn_predict_valid, X_mod_valid)

# Creating Prediction CSV

In [36]:
# Each prediction matrix is laid out as:
#   patient id | 3 diagnosis columns | ADAS13 | ventricle norm | MMSE
# The "_prob" variants carry class probabilities instead of one-hot diagnosis columns.
test_score_columns = (predicted_adas13_test, floored_vn_predict_test, MMSE_test_predict)
valid_score_columns = (predicted_adas13_valid, floored_vn_predict_valid, MMSE_valid_predict)

# Test set
predicted_y_test = np.column_stack((X_mod_test[:, 0], expanded_cols_test) + test_score_columns)
predicted_y_test_prob = np.column_stack((X_mod_test[:, 0], expanded_prob_test) + test_score_columns)

# Validation set
predicted_y_valid = np.column_stack((X_mod_valid[:, 0], expanded_cols_valid) + valid_score_columns)
predicted_y_valid_prob = np.column_stack((X_mod_valid[:, 0], expanded_prob_valid) + valid_score_columns)

In [37]:
def _rows_for_visits(visit_patient_ids, predicted_patient_ids):
    """For every visit, return the row of the per-patient prediction matrix to copy.

    ``predicted_patient_ids`` is the id column of a prediction matrix; the first row
    carrying a given id is the one used. Raises ValueError when a visit refers to a
    patient without a prediction.
    """
    first_row_of = {}
    for row, patient_id in enumerate(predicted_patient_ids):
        first_row_of.setdefault(patient_id, row)
    rows = []
    for patient_id in visit_patient_ids:
        if patient_id not in first_row_of:
            raise ValueError('no prediction available for patient id %r' % (patient_id,))
        rows.append(first_row_of[patient_id])
    return np.array(rows, dtype=int)


# Fill in every visit row with the prediction made for that visit's patient. The second
# column of each label file is used as the patient id. The output width follows the
# prediction matrices instead of a hard-coded 7, and ids are resolved with one dictionary
# lookup per visit rather than a list scan.
predicted_y_test_ids = list(predicted_y_test[:, 0])
test_rows = _rows_for_visits(labels_test.iloc[:, 1].values, predicted_y_test_ids)
filled_out_y_test = np.asarray(predicted_y_test, dtype=float)[test_rows]
filled_out_y_test_prob = np.asarray(predicted_y_test_prob, dtype=float)[test_rows]

predicted_y_valid_ids = list(predicted_y_valid[:, 0])
valid_rows = _rows_for_visits(labels_valid.iloc[:, 1].values, predicted_y_valid_ids)
filled_out_y_valid = np.asarray(predicted_y_valid, dtype=float)[valid_rows]
filled_out_y_valid_prob = np.asarray(predicted_y_valid_prob, dtype=float)[valid_rows]


In [38]:
# Prepend the visit date column to the filled-out predictions. The guard makes the cell safe
# to re-run: the dates are only added while the matrix still has the width of the
# prediction matrix, so a second execution does not stack another date column.
if filled_out_y_test.shape[1] == predicted_y_test.shape[1]:
    filled_out_y_test = np.column_stack( (labels_test['Date'], filled_out_y_test) )
if filled_out_y_valid.shape[1] == predicted_y_valid.shape[1]:
    filled_out_y_valid = np.column_stack( (labels_valid['Date'], filled_out_y_valid) )

In [39]:
# Write the predictions to data frames (reusing the label files' column names) and export
# them as CSV files.
import os

baseline_test_data_df = pd.DataFrame(data=filled_out_y_test, columns=labels_test.columns)
baseline_validation_data_df = pd.DataFrame(data=filled_out_y_valid, columns=labels_valid.columns)
output_folder = 'results/'
output_test = 'TADPOLE_baseline_test.csv'
output_validation = 'TADPOLE_baseline_valid.csv'
# to_csv does not create missing directories, so make sure the output folder exists first.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)
baseline_test_data_df.to_csv(os.path.join(output_folder, output_test), index=False)
baseline_validation_data_df.to_csv(os.path.join(output_folder, output_validation), index=False)