In [1]:
# Imports
import numpy as np
import pandas as pd
# Render pandas floats with thousands separators and three decimal places.
pd.options.display.float_format = '{:,.3f}'.format

import matplotlib.pyplot as plt
import seaborn as sns
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline

from tqdm.notebook import tqdm
# Print NumPy arrays on lines up to 200 characters wide, with floats formatted to three decimals.
np.set_printoptions(linewidth=200, suppress=True, formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [2]:
# Read the OASIS table (oasis_3.csv) from local disk, report its dimensions,
# and preview the first rows.
# Alternative location (presumably for running from Google Colab / Drive):
# rootdir = '/content/drive/MyDrive/Extracurriculars/Summer_2021/Polygence/Data/OASIS/csv_files/oasis_3.csv'
rootdir = '/hdd/Polygence/Data/OASIS/csv_files/oasis_3.csv'
df = pd.read_csv(filepath_or_buffer=rootdir)
print(df.shape)
df.head(n=5)

(2168, 22)


Unnamed: 0,Subject,MR ID,id,Age,M/F,dx1,mmse,cdr,apoe,TOTAL_HIPPOCAMPUS_VOLUME,...,rhCortexVol,CortexVol,SubCortGrayVol,TotalGrayVol,SupraTentorialVol,lhCorticalWhiteMatterVol,rhCorticalWhiteMatterVol,CorticalWhiteMatterVol,L.SurfArea,R.SurfArea
0,OAS30001,OAS30001_MR_d3132,OAS30001_Freesurfer53_d3132,73.0,F,Cognitively normal,30.0,0.0,23.0,6861.9,...,178031.559,359975.258,48400.0,491102.258,773671.599,174372.329,173244.012,347616.342,67598.1,67185.8
1,OAS30001,OAS30001_MR_d0129,OAS30001_Freesurfer53_d0129,65.0,F,Cognitively normal,30.0,0.0,23.0,7678.9,...,187528.786,379446.18,50687.0,517683.18,810585.114,184600.488,182662.445,367262.933,70168.1,69483.8
2,OAS30001,OAS30001_MR_d2430,OAS30001_Freesurfer53_d2430,71.0,F,Cognitively normal,30.0,0.0,23.0,7105.9,...,178872.68,357784.49,49058.0,487405.49,777931.271,175955.969,178172.813,354128.782,67905.7,68000.2
3,OAS30001,OAS30001_MR_d0757,OAS30001_Freesurfer53_d0757,67.0,F,Cognitively normal,29.0,0.0,23.0,7648.2,...,177566.875,362040.151,50071.0,500699.151,799341.921,185224.78,188151.99,373376.77,69142.3,68558.8
4,OAS30002,OAS30002_MR_d2345,OAS30002_Freesurfer53_d2345,73.0,M,Cognitively normal,29.0,0.0,34.0,7833.2,...,230240.533,457342.036,56773.0,607473.036,1051713.751,239168.338,245361.377,484529.716,83138.1,85742.3


In [3]:
# Data pre-processing
# Cleans the loaded OASIS frame and turns 'Diagnosis' into a binary integer label.
# Every step rebinds `df`, so run this cell once on the freshly loaded frame.
df = df.dropna(axis=1, how='all') # Drop any empty columns
df = df.dropna(axis=0, how='any') # Drop any rows with empty values
df = df.rename(columns={'id':'Freesurfer ID', 'dx1':'Diagnosis',
                        'TOTAL_HIPPOCAMPUS_VOLUME':'TotalHippocampusVol'}) # Rename columns
# df = df.drop_duplicates(subset='Subject', keep='first') # Keep only the first visit; this is possible because
#                                                         # df is sorted by age
# NOTE(review): the head() preview of the raw file lists one subject at ages 73, 65, 71, 67,
# so the rows do not appear to be age-sorted; sort by Subject/Age before enabling the line above.
df = df.reset_index(drop=True) # Reset the index

# Swap the columns at positions 2 and 4. In the previewed file these are 'MR ID' and 'Age';
# the positions shift if an all-empty column was dropped above.
cols = df.columns.tolist()
cols[2], cols[4] = cols[4], cols[2]
df = df[cols]

# Binary label from the clinical dementia rating: 0 when cdr < 0.5, otherwise 1.
# Built in one vectorised step so the column is a real integer column; assigning 0/1
# into the original string column through masked .loc writes leaves it as object dtype.
# Rows with a missing cdr were already removed by the row-wise dropna above.
df = df.assign(Diagnosis=(df['cdr'] >= 0.5).astype(int))

df = df.drop(['MR ID', 'Freesurfer ID', 'cdr', 'M/F'], axis=1) # Drop categorical and redundant columns
print(df.shape)

(1956, 18)


In [4]:
# Row labels of every record whose 'Diagnosis' is 0, in their current row order.
is_label_zero = (df['Diagnosis'] == 0).to_numpy()
index = df.index[is_label_zero]

In [5]:
# Remove the first 37 rows (in current row order) whose 'Diagnosis' is 0.
# NOTE(review): the notebook does not record why 37 was chosen — confirm and document the reason.
# The result is rebound instead of using inplace=True, so the drop is explicit in the cell's data flow.
N_LABEL_ZERO_ROWS_TO_DROP = 37
df = df.drop(index=index[:N_LABEL_ZERO_ROWS_TO_DROP])

In [6]:
# Show (rows, columns) of the current frame.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(1919, 18)

In [7]:
# Bucket 'Age' into three bands using the edges 40, 60, 80 and 100.
age_edges = [40, 60, 80, 100]
df['age_bins'] = pd.cut(df['Age'], bins=age_edges)

In [8]:
# Number of rows per 'Diagnosis' label.
diagnosis_counts = df['Diagnosis'].value_counts()
diagnosis_counts

0    1495
1     424
Name: Diagnosis, dtype: int64

In [9]:
# Number of rows falling in each age band.
age_bin_counts = df['age_bins'].value_counts()
age_bin_counts

(60, 80]     1356
(40, 60]      315
(80, 100]     248
Name: age_bins, dtype: int64

In [10]:
# Read the ADNI patients table (patients.csv), report its dimensions, and preview the first rows.
# From this cell onward `df` refers to this table; the frame previously held in `df` is replaced.
rootdir = '/hdd/Polygence/Data/ADNI/patients.csv'
df = pd.read_csv(filepath_or_buffer=rootdir)
print(df.shape)
df.head(n=5)

(5375, 12)


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I431036,941_S_4764,EMCI,F,85,30,MRI,MPRAGE,Original,6/16/2014,DCM,8/07/2021
1,I307653,941_S_4764,EMCI,F,83,22,MRI,MPRAGE,Original,6/01/2012,DCM,8/07/2021
2,I376064,941_S_4764,EMCI,F,84,28,MRI,MPRAGE,Original,6/10/2013,DCM,8/07/2021
3,I325119,941_S_4764,EMCI,F,83,24,MRI,MPRAGE,Original,8/20/2012,DCM,8/07/2021
4,I294080,941_S_4420,EMCI,M,81,22,MRI,MPRAGE,Original,3/28/2012,DCM,8/07/2021


In [11]:
# Number of rows per value of the 'Group' column.
group_counts = df['Group'].value_counts()
group_counts

CN      1521
MCI     1275
EMCI    1224
AD       718
LMCI     637
Name: Group, dtype: int64

In [12]:
# Bucket 'Age' into three bands using the edges 40, 60, 80 and 100.
age_edges = [40, 60, 80, 100]
df['age_bins'] = pd.cut(df['Age'], bins=age_edges)

In [13]:
# Number of rows falling in each age band.
age_bin_counts = df['age_bins'].value_counts()
age_bin_counts

(60, 80]     3935
(80, 100]    1269
(40, 60]      171
Name: age_bins, dtype: int64