In [1]:
def PTLump(data,key):
    ptid_roster = data['PTID_Key'].astype(int)
    ptid_roster = ptid_roster.dropna(how='all') 
    ptid_roster = ptid_roster.unique()
    tbl = pd.concat([pd.DataFrame(data['PTID_Key'].index.values),data['PTID_Key'].astype(int)],axis=1)
    dic = tbl.groupby('PTID_Key').groups
    rowIdx = []
    for i in ptid_roster:
        allDates = data[key].iloc[dic[i]]
        print(i,allDates.idxmax())
        rowIdx.append(allDates.idxmax()) 
    reducX = data.iloc[rowIdx,:]
    reducX = reducX.reset_index(drop=True)
    return reducX

In [2]:
# Logistic Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Importing the dataset with library pandas
dataset = pd.read_csv('TADPOLE_InputData.csv')
labels_train = pd.read_csv('TADPOLE_TargetData_train.csv')
labels_test = pd.read_csv('TADPOLE_TargetData_test.csv')
target = pd.read_csv('TADPOLE_PredictTargetData_valid.csv')

# Drop meaningless rows and columns. A good practice is to drop rows before columns.
dataset = dataset[~np.isnan(dataset['PTID_Key'])] # Drop patients with no ID, since they cannot be used for learning or prediction. [] slices the rows in dataframe.
dataset = dataset.dropna(axis=1, how='all') # Drop empty columns in dataset      

#Sort all datasets by ascending patient ID.
dataset = dataset.sort_values('PTID_Key')
labels_train = labels_train.sort_values('PTID_Key')
labels_test = labels_test.sort_values('PTID_Key')
# Reindex rows
dataset = dataset.reset_index(drop=True)
labels_train = labels_train.reset_index(drop=True)
labels_test = labels_test.reset_index(drop=True)

# These columns are time stamps that do not carry particular meaning, so dropped for now.
badColumns = ['update_stamp_UCSFFSL_02_01_16_UCSFFSL51ALL_08_01_16',
    'update_stamp_UCSFFSX_11_02_15_UCSFFSX51_08_01_16',
    'update_stamp_UCBERKELEYAV45_10_17_16',
    'update_stamp_DTIROI_04_30_14']
dataset = dataset.drop(badColumns,axis=1) # Remove this section from objDataset

# Unfortunately, some numerical columns contain non-numerical character such as '>' in  '>1300'.
# The strategy is to convert these columns to floats by extracting only numbers. For example, '>1300' goes to 1300
columnsObjToNum = ['ABETA_UPENNBIOMK9_04_19_17','TAU_UPENNBIOMK9_04_19_17','PTAU_UPENNBIOMK9_04_19_17','COMMENT_UPENNBIOMK9_04_19_17']
for column in columnsObjToNum:
    colIdx = dataset.columns.get_loc(column)
    rowIdx = np.where(dataset[column].apply(type).values == str)[0] # Find all str type elements in each column, which may or may not contain non-numerical characters such as '<' or '>'.
    for row in rowIdx: # iterate through each row of string type element in the column
        dataset.iloc[row,colIdx] = float(re.sub("[^0-9.]","",dataset[column].values[row])) # Find the float/int number in the string, and cast to float type. 
dataset[columnsObjToNum] = dataset[columnsObjToNum].astype(float) # cast each column to float type

# Convert date columns to date format in dataset, since they are currently imported as object columns
for column in dataset: # variable 'column' is a string
    if dataset[column].dtype == 'object' and dataset[column].str.match('[0-9]+/[0-9]+/[0-9]+').sum() > 0: # returns true if this column contains at least one string that matches date format.
        dataset[column] = pd.to_datetime(dataset[column],format="%m/%d/%y",errors='coerce') # convert string to date


# Some numerical columns contain only one/few possible values, which are more likely to be categorical than numerical features.
# As a result, such columns are converted to objective dtype. e.g. some column with only -4 and nan.
columnsNumToCat = []
for column in dataset:
    psbVal = dataset[column].unique()
    if psbVal.dtype == 'float64' and psbVal.size <= 20:
        columnsNumToCat.append(column)
        validRowIdx = dataset[column].notnull()
        dataset[column].loc[validRowIdx] = dataset[column].loc[validRowIdx].astype(str)
#dataset[columnsNumToCat] = dataset[columnsNumToCat].astype(object) # cast to object columns

# Tally the data types of all data columns, and then separate them according to dtype.
dtypeCounts = dataset.dtypes.value_counts(); # Count the number of columns for each data type. Turns out to be only 'float64' and 'object'.
numDataset = dataset.select_dtypes(include=['float'])
objDataset = dataset.select_dtypes(include=['object'])
dateDataset = dataset.select_dtypes(include=['datetime64']) # select dates from objDataset for variable dateDataset

# Count the number of nan's in each column to get an idea of how sparse each column is. It is very likely to drop sparse columns unless they are highly correlated to results.
nonNanCounts_num = numDataset.count() # returns the count of non-NaN entries for each column in numDataset, since not only we want to impute, we want to know how many we impute, especially for columns with very sparse initial data.
#temp = numDataset.count()/8715
#temp.hist(bins=50)

''' The folloing codes were used to detect the problematic columns (columnsTofix and badColumns)mentioned above
for column in objDataset:
    if sum(objDataset[column].apply(type) == float) -  sum(pd.isnull(objDataset[column])) > 0: # returns true if there is at least one entry that is float but not 'NaN'. 'NaN' are excluded since they are float, but not really numerical.
        print(column)
'''

# Imputing missing data in numDataset
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values = 'NaN', strategy = 'mean',axis = 0) # impute numerical columns
imp = imp.fit(numDataset)
numX = imp.transform(numDataset) # Extract data from numData as numX in dtype ndarray
#numAttributes = numDataset.columns.values # Names of columns in numX. Executed after imputation since all NaN columns are dropped.

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
objDataset.loc[-1,:] = np.repeat(np.nan,objDataset.shape[1]) # Append one NaN to the end of each column so that NaN must be a class for each column.
objDataset = objDataset.fillna(value=' ') # LabelEncoder does not work with NaN, so NaN is converted to a space ' ', which is always sorted as the first class by LabelEncoder.
le = LabelEncoder()
catClasses = np.array([]) # initiate an empty list of attribute names
catAttributes = np.array([]) # initiate an empty list of attribute names
for i in range(objDataset.shape[1]):
    objDataset.iloc[:,i] = le.fit_transform(objDataset.iloc[:,i]) # encode column i
    classes = le.classes_ # All the labels in column i, including ' ', which was translated from NaN
    classes[0] = 'NaN' # Replace ' ' with the attribute of the column
    catClasses = np.append(catClasses,classes) # Append column attribute followed by all its labels to catAttribute
    catAttributes = np.append(catAttributes, np.repeat(objDataset.columns.values[i],classes.size))
    
tups = [catAttributes,catClasses]
attrTups = list(zip(*tups)) 
enc = OneHotEncoder(categorical_features = 'all') # Based on numerical categories in objDataset, encode it to one in n-class features. e.g. 0 => 0, 0, 0, ...; 1 => 0, 1, 0, ...; 2 => 0, 0, 1, 0, ...
catDataset = pd.DataFrame(enc.fit_transform(objDataset).toarray()) # Dtype: csr_matrix => numpy array => dataframe
catDataset = catDataset.iloc[:-1,:] # Remove the last line full of NaN added earlier
objDataset = objDataset.iloc[:-1,:] # Remove the last line full of NaN added earlier
multiIdx = pd.MultiIndex.from_tuples(attrTups, names = ['Attribute','Class']) # Construct a two-level column names in the format of catDataset[Attribute][Class]
catDataset.columns = multiIdx 

# Convert dates to numerical, as relative days since the first date in the column for the same patient.
# First build a dictionary between PTID_Key (patient ID) and line indices. THis will also be useful for lumping data.
idxPTIDTable = pd.concat([pd.DataFrame(np.array(range(0,numDataset.shape[0]))),numDataset['PTID_Key']],axis=1)
dic = idxPTIDTable.groupby('PTID_Key').groups

# Each entry of dateX is the number of days since the first date of the same patient in the same column.
dateX = np.zeros(dateDataset.shape)
dateX[:] = np.nan
for key, value in dic.items():
    for i in range(dateDataset.shape[1]): # iterate through each date column
        allDates = dateDataset.iloc[value,i] # Get all dates in column i about patient with ID 'key'
        firstDate = allDates.min()
        validDateIdx = np.where(~pd.isnull(allDates))[0] # indices of all non-NaT dates
        # if not pd.isnull(firstDate): # There is at least one valid date in this column, which is the earliest date of this column
        for j in range(len(validDateIdx)):
            dateX[value[j],i] = (allDates[value[j]]-firstDate).days

# Repacking datasets for ease of inspection
dateDataset = pd.DataFrame(dateX, columns = dateDataset.columns.values) 
numDataset = pd.DataFrame(numX, columns = numDataset.columns.values) # numX is after imputation.
Data = pd.concat([numDataset,dateDataset,catDataset], axis=1) #numDataset is imputed, catDataset does not need imputation since NaN is a class, dateDataset is not imputed.
XAttributes = Data.columns.values

# Work on labels_train: convert dates to number of days relative to the initial date, and then bind to the rest of data and do imputation
idxPTIDTable_train = pd.concat([pd.DataFrame(labels_train.index.values),labels_train['PTID_Key']],axis=1)
dic_train = idxPTIDTable_train.groupby('PTID_Key').groups
dateY_train_raw = pd.to_datetime(labels_train['Date'],format="%m/%d/%y",errors='coerce')
dateY_train = np.zeros(labels_train['Date'].size)
dateY_train[:] = np.nan
for key, value in dic_train.items():
    allDates = dateY_train_raw[value] # Get all dates in column i about patient with ID 'key'
    firstDate = allDates.min()
    validDateIdx = np.where(~pd.isnull(allDates))[0]
    for j in range(len(validDateIdx)):
        dateY_train[value[j]] = (allDates[value[j]]-firstDate).days
labels_train['Date'] = dateY_train

# On Training labels: Impute missing data based on most frequent value for each individual patient
imp = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
global_means = labels_train.iloc[:,2:].mean() # will be used when the whole column is missing
for key, value in dic_train.items():
        # Get the most frequent value in each column of each patient
        subLbl = labels_train.iloc[value,:]
        modes = subLbl.mode().iloc[0,:]
        diag = subLbl.iloc[:,2]*4+subLbl.iloc[:,3]*2+subLbl.iloc[:,4]
        diag_mode = diag.mode()
        # If the most frequent element of a column is a nan, that means all elemetns are nan in that column, so we will have to impute missing data in that column from input X.
        if diag_mode.size == 0: # If there is no diagnostic result, impute with the most typical situation
            modes['CN_Diag'] = 0
            modes['MCI_Diag'] = 1
            modes['AD_Diag'] = 0 
        else:
            b = bin(int(diag_mode.iloc[0]))[2:].zfill(3)
            modes['CN_Diag'] = b[0]
            modes['MCI_Diag'] = b[1]
            modes['AD_Diag'] = b[2] 
    
        if np.isnan(modes['ADAS13']):
            modes['ADAS13'] = global_means['ADAS13']
        if np.isnan(modes['Ventricles_Norm']):
            modes['Ventricles_Norm'] = global_means['Ventricles_Norm']
        if np.isnan(modes['MMSE']):
            modes['MMSE'] = global_means['MMSE']

        # impute missing data with either the most frequent item in this column for this patient or population mode if no record is found for this patient.    
        y_train_individual = labels_train.iloc[value,:].append(modes)
        imp = imp.fit(y_train_individual.values[:,2:]) 
        temp = imp.transform(y_train_individual.values[:,2:])
        labels_train.iloc[value,2:] = temp[:-1,:]

colNames = labels_train.columns.tolist()
newColNames = [colNames[1],colNames[0]]+ colNames[2:]
labels_train = labels_train[newColNames]
y_train = labels_train.values
yAttributes = labels_train.columns.values

# Work on labels_test: convert dates to number of days relative to the initial date, and then bind to the rest of data and do imputation
idxPTIDTable_test = pd.concat([pd.DataFrame(labels_test.index.values),labels_test['PTID_Key']],axis=1)
dic_test = idxPTIDTable_test.groupby('PTID_Key').groups
dateY_test_raw = pd.to_datetime(labels_test['Date'],format="%Y-%m-%d",errors='coerce')
dateY_test = np.zeros(dateY_test_raw.size)
dateY_test[:] = np.nan
for key, value in dic_test.items():
    allDates = dateY_test_raw[value] # Get all dates in column i about patient with ID 'key'
    firstDate = allDates.min()
    validDateIdx = np.where(~pd.isnull(allDates))[0]
    for j in range(len(validDateIdx)):
        dateY_test[value[j]] = (allDates[value[j]]-firstDate).days
labels_test['Date'] = dateY_test
colNames = labels_test.columns.tolist()
newColNames = [colNames[1],colNames[0]]+ colNames[2:] # switch PTID_Key to the first column
labels_test = labels_test[newColNames]

# Build a row index lookup table for each patient from Data to labels_train/labels_test
ptid_roster = Data['PTID_Key']
ptid_roster = ptid_roster.dropna(how='all') 
ptid_roster = ptid_roster.unique()
ptid_roster.sort()
ptid_roster = ptid_roster.astype(int)
ptid_train = labels_train['PTID_Key'].unique()
ptid_train.sort()
ptid_test = labels_test['PTID_Key'].unique()
ptid_test.sort()

# Separate input data for traning and test sets 
X_train_rowIdx = []
for i in ptid_train:
    X_train_rowIdx.extend(dic[i].tolist())
    
X_test_rowIdx = []
for i in ptid_test:
    X_test_rowIdx.extend(dic[i].tolist())
    

X_train = Data.iloc[X_train_rowIdx,:]
X_train = X_train.reset_index(drop=True)
X_test = Data.iloc[X_test_rowIdx,:]
X_test = X_test.reset_index(drop=True)


# Lumping
X_train_zipped = PTLump(X_train,'EXAMDATE')
y_train_zipped = PTLump(labels_train,'Date')
   


'''

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
'''


# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
XtrainScaled = sc.fit_transform(X_train_zipped[numDataset.columns.values].values)
#X_train_zipped = sc.fit_transform(numDataset.iloc[X_train_rowIdx,:].values)
#X_test_zipped = sc.transform(numDataset.iloc[X_test_rowIdx,:].values)

# Fitting Logistic Regression to the Training set
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state = 0)
#classifier.fit(XtrainScaled, y_train_zipped['CN_Diag'])

# Predicting the Test set results
#y_pred = classifier.predict(X_test)

'''
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
'''

reducX = np.zeros([ptid_train.size, X_train.shape[1]])
for i in range(ptid_train.size):
    reducX[i,:] = X_train.iloc[X_train['EXAMDATE'].iloc[dic_train[ptid_train[i]]].idxmax(),:].values





  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


6 0
8 14
18 18
21 22
22 25
25 26
26 29
32 31
34 35
36 36
40 38
44 39
47 40
49 44
53 57
55 69
57 79
59 81
60 89
61 96
71 97
72 110
77 114
79 119
82 127
84 142
97 146
105 150
110 160
111 166
112 170
113 181
117 184
118 192
119 204
123 206
126 207
129 211
130 214
133 219
134 221
135 225
140 228
141 231
142 233
143 238
146 240
147 245
150 255
155 259
160 263
161 264
163 266
164 277
166 285
171 294
172 298
175 310
176 322
177 327
178 328
181 329
186 330
189 333
190 334
191 336
195 341
196 344
197 356
199 360
202 367
204 375
205 376
206 379
207 385
208 390
213 398
215 399
219 403
220 412
223 417
232 418
233 419
236 422
237 435
243 439
246 442
248 443
249 448
254 458
255 462
258 466
261 469
263 471
272 474
273 477
274 482
276 483
278 489
279 491
282 493
286 497
288 498
293 501
295 506
299 507
301 513
305 520
306 526
310 530
311 532
315 538
316 549
322 558
323 563
325 566
331 580
337 585
343 597
344 601
345 603
346 608
350 621
355 626
359 629
360 630
362 639
364 646
367 652
369 659
371 668
382

1120 1626
1123 1629
1124 1632
1128 1634
1132 1641
1135 1643
1136 1647
1139 1648
1140 1653
1142 1658
1144 1659
1153 1664
1157 1669
1159 1670
1161 1676
1163 1677
1164 1680
1166 1685
1170 1692
1175 1693
1177 1696
1180 1703
1181 1708
1184 1711
1185 1713
1187 1720
1190 1722
1196 1725
1197 1729
1198 1734
1200 1736
1201 1740
1202 1746
1204 1748
1208 1755
1209 1760
1212 1764
1213 1769
1217 1773
1220 1774
1223 1780
1225 1784
1228 1786
1229 1790
1231 1793
1233 1795
1234 1800
1236 1801
1238 1806
1239 1811
1243 1815
1246 1825
1249 1827
1251 1830
1253 1835
1254 1840
1260 1841
1261 1843
1263 1849
1264 1853
1265 1859
1269 1864
1270 1868
1271 1874
1273 1877
1282 1880
1284 1883
1285 1887
1286 1892
1288 1897
1289 1899
1293 1903
1294 1904
1296 1909
1297 1911
1302 1915
1303 1920
1306 1921
1307 1927
1308 1932
1309 1933
1315 1937
1320 1939
1329 1940
1330 1949
1337 1952
1340 1953
1343 1954
1345 1957
1350 1959
1353 1966
1358 1968
1363 1969
1366 1976
1367 1983
1368 1987
1369 1989
1371 1994
1374 2000
1378 2001


In [23]:
print(Y_category_list)

['PTID_Key', 'EXAMDATE', 'Date', 'CN_Diag', 'MCI_Diag', 'AD_Diag', 'ADAS13', 'Ventricles_Norm', 'MMSE']


In [4]:
#Build List of all the Categories in numX
X_category_list = list(numDataset.columns)
X_category_list.insert(1, 'EXAMDATE')
numX_with_date = np.insert(numX, 1, dateX[:,0], axis=1)

X_cat_to_dataset_dict = {}
for i in range(len(X_category_list)):
    X_cat_to_dataset_dict[X_category_list[i]] = numX_with_date[:,i]
    
Y_train_cat_to_dataset_dict = {}
Y_test_cat_to_dataset_dict = {}
Y_category_list = list(yAttributes)
#Y_category_list.insert(1, 'EXAMDATE')
for i in range(len(Y_category_list)):
    Y_test_cat_to_dataset_dict[Y_category_list[i]] = labels_test.values[:,i]
    Y_train_cat_to_dataset_dict[Y_category_list[i]] = y_train[:,i]

In [5]:
#input S must be dictionary w/ 'PTID_Key' as one of keys
#maps patient ids tomost recent (last appearing index in matrix )
# returns dictionary with
# key: PT_ID         val: last index row index
def get_last_patient_visit(S):
    hist_dict={}
    for pt_id in S['PTID_Key']: #get indexes of last patient visit per patient id
        pt_id_appearances = np.where(S['PTID_Key']==pt_id)
        hist_dict[int(pt_id)] = np.max(pt_id_appearances)
    return hist_dict

#condense dataset into 1 visit per patient (last visit for each patient)
def make_last_visit(dataset, last_visit_dict):
    try:
        last_visit_dataset = np.zeros( (len(last_visit_dict.keys()), dataset.shape[1]) )
    except:
        last_visit_dataset = np.zeros( (len(last_visit_dict.keys()), ))
    ind = 0
    for k in last_visit_dict.keys():
        last_visit_index = last_visit_dict[k]
        last_visit_dataset[ind] = dataset[last_visit_index]
        ind+=1
    return last_visit_dataset

#X and y are np arrays, X/y_last is dict of PTID -> last row index in their respective datasets
#converts X so that it has same # of patients as y and each row corresponds to the same patient
def modify_X_to_y(X, X_last, y_last): 
    y_PT_ID = y_last.keys()
    #print(y_PT_ID)
    modified_X = np.zeros((len(y_PT_ID), X.shape[1]))
    X_PTID_Index = list(X[:,0]) #index here corresponds to vector's index in matrix
    index = 0
    for pt_id in y_PT_ID:
        #print pt_id
        corr_X_index = X_PTID_Index.index(pt_id) #PT_ID -> X_index = PT_ID-1
        modified_X[index] = X[corr_X_index]
        index+=1
    return modified_X

#Ventricles_norm is computed as "Ventricles" divided by "ICV"
#DXCHANGE = {1, 7, 9} encodes healthy control, DXCHANGE = {2, 4, 8} encodes MCI, and 
#DXCHANGE = {3, 5, 6} encodes Alzheimer's diagnosis.
def DXCHANGE_to_diagnosis(DXCHANGE_col):
    expanded_DXCHANGE = np.zeros((len(DXCHANGE_col), 3))
    healthy = set([1,7,9])
    mci = set([2,4,8])
    ad = set([3,5,6])
    for i in range(len(DXCHANGE_col)):
        if int(DXCHANGE_col[i]) in healthy:
            new_row = np.array([1,0,0])
            expanded_DXCHANGE[i] = new_row
        elif int(DXCHANGE_col[i]) in mci:
            new_row = np.array([0,1,0])
            expanded_DXCHANGE[i] = new_row
        
        elif int(DXCHANGE_col[i]) in ad:
            new_row = np.array([0,0,1])
            expanded_DXCHANGE[i] = new_row
    return expanded_DXCHANGE


def diagnosis_to_classes(y_diagnosis_cols):
    condensed_classes = np.zeros((y_diagnosis_cols.shape[0],))
    for i in range(len(condensed_classes)):
        check_equivalence_healthy = y_diagnosis_cols[i] == np.array([1,0,0])
        check_equivalence_mci = y_diagnosis_cols[i] == np.array([0,1,0])
        check_equivalence_ad = y_diagnosis_cols[i] == np.array([0,0,1])
        if check_equivalence_healthy.all() == True:
            condensed_classes[i] = 0
        elif check_equivalence_mci.all() == True:
            condensed_classes[i] = 1
        elif check_equivalence_ad.all() == True:
            condensed_classes[i] = 2        
    return condensed_classes
#converts one hot encoding of diagnosis to one column w/ classes 0-healthy, 1-MCI, 2-AD
def classes_to_diagnosis(y_diagnosis_col):
    condensed_classes = np.zeros((y_diagnosis_col.shape[0], 3))
    for i in range(len(condensed_classes)):
        #check_equivalence_healthy = y_diagnosis_cols[i] == np.array([1,0,0])
        #check_equivalence_mci = y_diagnosis_cols[i] == np.array([0,1,0])
        #check_equivalence_ad = y_diagnosis_cols[i] == np.array([0,0,1])
        if y_diagnosis_col[i] == 0:
            condensed_classes[i] = np.array([1,0,0])
        elif y_diagnosis_col[i] == 1:
            condensed_classes[i] = np.array([0,1,0])
        elif y_diagnosis_col[i] == 2:
            condensed_classes[i] = np.array([0,0,1])     
    return condensed_classes
                                                            
    

In [6]:
#get dictionary for last visits for corresponding datasets
X_last_visit_dict = get_last_patient_visit(X_cat_to_dataset_dict)
y_train_last_visit_dict = get_last_patient_visit(Y_train_cat_to_dataset_dict)
y_test_last_visit_dict = get_last_patient_visit(Y_test_cat_to_dataset_dict)

#modify datasets so that there is only one visit per ptid
X_last_visit_dataset = make_last_visit(numX_with_date, X_last_visit_dict)
y_train_last_visit_dataset = make_last_visit(y_train, y_train_last_visit_dict)
y_test_last_visit_dataset = make_last_visit(labels_test.values, y_test_last_visit_dict)

In [7]:
X_train = modify_X_to_y(numX_with_date, X_last_visit_dict, y_train_last_visit_dict)
X_test = modify_X_to_y(numX_with_date, X_last_visit_dict, y_test_last_visit_dict)

# Scaling Features

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train[:,1:])
X_test_scaled = sc.fit_transform(X_test[:,1:])

# Implementing Support Vector Classifier/Regression

###### MMSE

In [None]:
from sklearn import svm

svr_mmse = svm.SVR()
y_mmse = y_train_last_visit_dataset[:,7]
svr_mmse.fit(X_train_scaled, y_mmse)
#

In [44]:
mmse_test_prediction = svr_mmse.predict(X_test_scaled)

In [46]:
for i in range(len(mmse_test_prediction)):
    print('PTID',X_test[i][0])
    print(mmse_test_prediction[i])

PTID 513.0
27.2322940105
PTID 514.0
26.7009823928
PTID 5.0
27.2438093595
PTID 1025.0
24.5538684067
PTID 1551.0
28.2712887907
PTID 222.0
26.1453181985
PTID 1041.0
27.9094976686
PTID 20.0
26.8049258414
PTID 534.0
29.5167700441
PTID 1563.0
27.0952070355
PTID 540.0
27.0684472064
PTID 30.0
26.9688295851
PTID 35.0
28.2475773014
PTID 41.0
26.0641393088
PTID 42.0
26.1230177963
PTID 1543.0
28.2168355229
PTID 1069.0
27.301508991
PTID 1070.0
28.1544653642
PTID 1071.0
29.5485186994
PTID 179.0
27.5985449471
PTID 565.0
29.4616543064
PTID 54.0
26.751499508
PTID 56.0
25.7927792922
PTID 571.0
27.9634050935
PTID 574.0
24.1029826603
PTID 65.0
26.0681336031
PTID 1602.0
29.7173937505
PTID 67.0
26.5235805291
PTID 1477.0
26.7795128576
PTID 1606.0
27.6459525193
PTID 73.0
29.0134451949
PTID 74.0
25.4065957606
PTID 1099.0
28.5959299952
PTID 1100.0
27.5356616314
PTID 1613.0
27.3701958018
PTID 1103.0
28.3232660774
PTID 593.0
27.1113398323
PTID 83.0
28.1830143222
PTID 598.0
27.3173959253
PTID 599.0
26.639691824
PT

###### Ventricle Norms

In [48]:
svr_vn = svm.SVR()
y_vn = y_train_last_visit_dataset[:,6]
svr_vn.fit(X_train_scaled, y_vn)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [49]:
vn_test_prediction = svr_vn.predict(X_test_scaled)

In [50]:
for i in range(len(vn_test_prediction)):
    print('PTID',X_test[i][0])
    print(vn_test_prediction[i])

PTID 513.0
0.0412216
PTID 514.0
0.0412216
PTID 5.0
0.0412216
PTID 1025.0
0.0412216
PTID 1551.0
0.0412216
PTID 222.0
0.0412216
PTID 1041.0
0.0412216
PTID 20.0
0.0412216
PTID 534.0
0.0412216
PTID 1563.0
0.0412216
PTID 540.0
0.0412216
PTID 30.0
0.0412216
PTID 35.0
0.0412216
PTID 41.0
0.0412216
PTID 42.0
0.0412216
PTID 1543.0
0.0412216
PTID 1069.0
0.0412216
PTID 1070.0
0.0412216
PTID 1071.0
0.0412216
PTID 179.0
0.0412216
PTID 565.0
0.0412216
PTID 54.0
0.0412216
PTID 56.0
0.0412216
PTID 571.0
0.0412216
PTID 574.0
0.0412216
PTID 65.0
0.0412216
PTID 1602.0
0.0412216
PTID 67.0
0.0412216
PTID 1477.0
0.0412216
PTID 1606.0
0.0412216
PTID 73.0
0.0412216
PTID 74.0
0.0412216
PTID 1099.0
0.0412216
PTID 1100.0
0.0412216
PTID 1613.0
0.0412216
PTID 1103.0
0.0412216
PTID 593.0
0.0412216
PTID 83.0
0.0412216
PTID 598.0
0.0412216
PTID 599.0
0.0412216
PTID 88.0
0.0412216
PTID 1113.0
0.0412216
PTID 602.0
0.0412216
PTID 1627.0
0.0412216
PTID 1435.0
0.0412216
PTID 605.0
0.0412216
PTID 98.0
0.0412216
PTID 99.0
0

###### Diagnosis

In [None]:
svc_diagnosis = svm.SVC()
y_diagnosis_classes = diagnosis_to_classes(y_train_last_visit_dataset[,2:4])

###### ADAS13

In [51]:
svr_ADAS13 = svm.SVR()
y_ADAS13 = y_train_last_visit_dataset[:,5]
svr_ADAS13.fit(X_train_scaled, y_ADAS13)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [57]:
y_test_ADAS13_prediction = svr_ADAS13.predict(X_test_scaled)

In [58]:
#print(type(X_train_scaled))
for i in range(len(y_test_ADAS13_prediction)):
    print('PTID',X_test[i][0])
    print(y_test_ADAS13_prediction[i])

PTID 513.0
13.1496651442
PTID 514.0
15.348961018
PTID 5.0
14.830980211
PTID 1025.0
18.935580203
PTID 1551.0
11.8789111936
PTID 222.0
16.555288512
PTID 1041.0
12.8041201033
PTID 20.0
15.0574384225
PTID 534.0
7.83465091618
PTID 1563.0
14.8442819661
PTID 540.0
13.8180092543
PTID 30.0
14.4740569017
PTID 35.0
10.7317903175
PTID 41.0
15.6862456198
PTID 42.0
16.6781584538
PTID 1543.0
12.1316058589
PTID 1069.0
13.813876066
PTID 1070.0
10.7844255332
PTID 1071.0
7.61754337693
PTID 179.0
13.1222693787
PTID 565.0
8.37131602171
PTID 54.0
15.1411835703
PTID 56.0
16.8967081258
PTID 571.0
12.6578652485
PTID 574.0
19.1628710344
PTID 65.0
17.1208472203
PTID 1602.0
8.20697883022
PTID 67.0
15.3795894505
PTID 1477.0
14.3345216403
PTID 1606.0
12.8951089596
PTID 73.0
11.1444429242
PTID 74.0
17.8256060102
PTID 1099.0
9.93218584503
PTID 1100.0
13.1571138088
PTID 1613.0
14.9300869791
PTID 1103.0
11.2563939449
PTID 593.0
14.5361334563
PTID 83.0
12.3957677614
PTID 598.0
13.8550923817
PTID 599.0
15.3127643673
PTID