## Imports

In [51]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [52]:
# Load both OASIS CSV files from the working directory:
# the longitudinal table first, then the cross-sectional one.
long_df, cross_df = (
    pd.read_csv(csv_name)
    for csv_name in ('oasis_longitudinal.csv', 'oasis_cross-sectional.csv')
)

In [53]:
# Preview the first five rows of the longitudinal table.
long_df.head(n=5)

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [54]:
# Preview the first five rows of the cross-sectional table.
cross_df.head(n=5)

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
3,OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
4,OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


In [55]:
# Report (rows, columns) for each of the two loaded tables.
print(f'Longitudinal df shape: {long_df.shape}')
print(f'Cross-sectional df shape: {cross_df.shape}')


Longitudinal df shape: (373, 15)
Cross-sectional df shape: (436, 12)


In [56]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
cross_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      436 non-null    object 
 1   M/F     436 non-null    object 
 2   Hand    436 non-null    object 
 3   Age     436 non-null    int64  
 4   Educ    235 non-null    float64
 5   SES     216 non-null    float64
 6   MMSE    235 non-null    float64
 7   CDR     235 non-null    float64
 8   eTIV    436 non-null    int64  
 9   nWBV    436 non-null    float64
 10  ASF     436 non-null    float64
 11  Delay   20 non-null     float64
dtypes: float64(7), int64(2), object(3)
memory usage: 41.0+ KB
None


In [57]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
long_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    int64  
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(5), int64(5), object(5)
memory usage: 43.8+ KB
None


## Data Preprocessing

In [58]:
# Keep only the rows recorded at visit 1 and renumber the index from 0.
long_df = (
    long_df[long_df['Visit'].eq(1)]
    .reset_index(drop=True)
)

In [59]:
# Remove the identifier, handedness and visit-bookkeeping columns.
unused_columns = ['Subject ID', 'Hand', 'MRI ID', 'Visit', 'MR Delay']
long_df = long_df.drop(columns=unused_columns)

In [60]:
# Give the 'M/F' column a more descriptive label.
long_df = long_df.rename({'M/F': 'Male/Female'}, axis='columns')

In [62]:
# Binary-encode the two text columns:
#   Group       -> 1 for 'Demented', 0 for every other value
#   Male/Female -> 1 for 'M',        0 for every other value
long_df['Group'] = long_df['Group'].eq('Demented').astype('int64')
long_df['Male/Female'] = long_df['Male/Female'].eq('M').astype('int64')

In [63]:
# Check the first five rows after encoding.
long_df.head(n=5)

Unnamed: 0,Group,Male/Female,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,1,1,75,12,,23.0,0.5,1678,0.736,1.046
2,0,0,88,18,3.0,28.0,0.0,1215,0.71,1.444
3,0,1,80,12,4.0,28.0,0.0,1689,0.712,1.039
4,1,1,71,16,,28.0,0.5,1357,0.748,1.293


In [64]:
# Count the missing values in each column of the longitudinal table.
print('NaNs in longitudinal df:')
long_df.isnull().sum()

NaNs in longitudinal df:


Group          0
Male/Female    0
Age            0
EDUC           0
SES            8
MMSE           0
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

## Handle NaNs: Median Imputation (SES column)

In [65]:
# Replace the missing SES entries with the median of the SES column;
# all other columns are left untouched.
long_df = long_df.fillna({'SES': long_df['SES'].median()})

In [66]:
# Inspect the first ten rows to confirm that SES no longer contains NaN.
long_df.head(n=10)

Unnamed: 0,Group,Male/Female,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,1,1,75,12,2.0,23.0,0.5,1678,0.736,1.046
2,0,0,88,18,3.0,28.0,0.0,1215,0.71,1.444
3,0,1,80,12,4.0,28.0,0.0,1689,0.712,1.039
4,1,1,71,16,2.0,28.0,0.5,1357,0.748,1.293
5,0,0,93,14,2.0,30.0,0.0,1272,0.698,1.38
6,1,1,68,12,2.0,27.0,0.5,1457,0.806,1.205
7,1,0,66,12,3.0,30.0,0.5,1447,0.769,1.213
8,0,0,78,16,2.0,29.0,0.0,1333,0.748,1.316
9,0,0,81,12,4.0,30.0,0.0,1230,0.715,1.427


In [67]:
# Re-count the missing values per column after the SES imputation.
long_df.isnull().sum()

Group          0
Male/Female    0
Age            0
EDUC           0
SES            0
MMSE           0
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

In [68]:
# Display the (rows, columns) of the preprocessed longitudinal table.
long_df.shape

(150, 10)