### Dementia Prediction

Using MRI data from patients with and without Alzheimer's disease, develop a model that can predict whether a patient is demented.

Based on: https://www.kaggle.com/ruslankl/dementia-prediction-w-tree-based-models

In [192]:
import pandas as pd
import numpy as np
from functools import reduce
import seaborn as sns

# Load both OASIS CSV exports into dataframes, indexed by their ID columns
cs_df = pd.read_csv(
    'Projects/DementiaPrediction/data/oasis_cross-sectional.csv',
    index_col='ID',
)
long_df = pd.read_csv(
    'Projects/DementiaPrediction/data/oasis_longitudinal.csv',
    index_col=['Subject ID', 'MRI ID'],
)

# Convert object-dtype columns to the category dtype in place.
# NOTE: set_columns_to_category_dtype is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
set_columns_to_category_dtype(cs_df)
set_columns_to_category_dtype(long_df)

In [193]:
# Preview the first rows of the cross-sectional dataframe
cs_df.head()

Unnamed: 0_level_0,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


In [194]:
# Preview the first rows of the longitudinal dataframe
long_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
Subject ID,MRI ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [195]:
# Summary statistics for the cross-sectional dataframe
cs_df.describe()

Unnamed: 0,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
count,436.0,235.0,216.0,235.0,235.0,436.0,436.0,436.0,20.0
mean,51.357798,3.178723,2.490741,27.06383,0.285106,1481.919725,0.79167,1.198894,20.55
std,25.269862,1.31151,1.120593,3.69687,0.383405,158.740866,0.059937,0.128682,23.86249
min,18.0,1.0,1.0,14.0,0.0,1123.0,0.644,0.881,1.0
25%,23.0,2.0,2.0,26.0,0.0,1367.75,0.74275,1.11175,2.75
50%,54.0,3.0,2.0,29.0,0.0,1475.5,0.809,1.19,11.0
75%,74.0,4.0,3.0,30.0,0.5,1579.25,0.842,1.28425,30.75
max,96.0,5.0,5.0,30.0,2.0,1992.0,0.893,1.563,89.0


In [196]:
# Summary statistics for the longitudinal dataframe
long_df.describe()

Unnamed: 0,Visit,MR Delay,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
count,373.0,373.0,373.0,373.0,354.0,371.0,373.0,373.0,373.0,373.0
mean,1.882038,595.104558,77.013405,14.597855,2.460452,27.342318,0.290885,1488.128686,0.729568,1.195461
std,0.922843,635.485118,7.640957,2.876339,1.134005,3.683244,0.374557,176.139286,0.037135,0.138092
min,1.0,0.0,60.0,6.0,1.0,4.0,0.0,1106.0,0.644,0.876
25%,1.0,0.0,71.0,12.0,2.0,27.0,0.0,1357.0,0.7,1.099
50%,2.0,552.0,77.0,15.0,2.0,29.0,0.0,1470.0,0.729,1.194
75%,2.0,873.0,82.0,16.0,3.0,30.0,0.5,1597.0,0.756,1.293
max,5.0,2639.0,98.0,23.0,5.0,30.0,2.0,2004.0,0.837,1.587


In [293]:
from functools import reduce
def get_data_quality_report(df: pd.DataFrame):
    """
    Create data quality report for both continuous and categorical features

    Both reports have one column per analysed feature and one row per metric.
    The 'count' row holds the total number of rows in ``df`` (nulls included),
    and the percentage rows are rendered as two-decimal strings.

    :param df: dataframe to analyze
    :return: data quality report for continuous features, data quality report
        for categorical features, and the columns of ``df`` that ended up in
        neither report (in column order)
    :rtype: pd.DataFrame, pd.DataFrame, list
    """

    def as_row(series):
        # Turn a named Series into a one-row frame labelled with the Series name
        return series.to_frame().T

    def format_pct_rows(report, row_labels):
        # Render the given rows as two-decimal strings. The frame is cast to
        # object first so strings can be stored next to the numeric rows.
        if report.shape[1] == 0:
            return report
        report = report.astype(object)
        for label in row_labels:
            report.loc[label] = report.loc[label].apply(lambda x: "%.2F" % x)
        return report

    '''
    COMMON DQR CHARACTERISTICS
    '''
    # Total number of rows in dataset
    num_rows = df.shape[0]

    # Number of records as nulls
    nulls = df.isnull().sum()
    nulls.name = 'nulls'

    # Percent of records as nulls
    nulls_pct = nulls / num_rows
    nulls_pct.name = 'nulls pct'

    # Number of unique values in each column
    cardinality = df.nunique()
    cardinality.name = 'cardinality'

    '''
    CREATE DQR FOR NUMERIC/CONTINUOUS FEATURES
    '''
    # Baseline describe function (filters out non-numeric fields).
    # NOTE: per the pandas docs, describe() falls back to summarising the
    # object/categorical columns when the frame has no numeric column at all.
    continuous_dqr = df.describe()
    continuous_cols = list(continuous_dqr.columns)

    # Report the total row count rather than describe()'s non-null count,
    # then add the supplemental rows
    continuous_dqr.loc['count'] = num_rows
    continuous_dqr = pd.concat([
        continuous_dqr,
        as_row(nulls[continuous_cols]),
        as_row(nulls_pct[continuous_cols]),
        as_row(cardinality[continuous_cols]),
    ])

    '''
    CREATE DQR FOR CATEGORICAL FEATURES
    '''
    categorical_cols = [
        col_name
        for col_name in df.select_dtypes(include='category').columns
        if col_name != df.index.name
    ]

    # Calculate mode data for each column and aggregate results into single dataframe
    if categorical_cols:
        categorical_mode = pd.concat(
            [get_mode_and_second_mode(df[col]) for col in categorical_cols],
            axis=1,
        )
    else:
        # No categorical features: keep the usual row labels with zero columns
        categorical_mode = pd.DataFrame(
            index=['mode', 'mode count', 'mode pct', '2nd mode', '2nd mode count', '2nd mode pct']
        )

    # Aggregate all categorical data quality rows
    categorical_dqr = pd.concat([
        as_row(pd.Series(num_rows, index=categorical_cols, name='count')),
        categorical_mode,
        as_row(nulls[categorical_cols]),
        as_row(nulls_pct[categorical_cols]),
        as_row(cardinality[categorical_cols]),
    ])

    '''
    Apply formatting
    '''
    continuous_dqr = format_pct_rows(continuous_dqr, ['nulls pct'])
    categorical_dqr = format_pct_rows(categorical_dqr, ['nulls pct', 'mode pct', '2nd mode pct'])

    '''
    Identify columns that were not listed as continuous or categorical
    '''
    classified_cols = set(continuous_cols + categorical_cols)
    error_cols = [col_name for col_name in df.columns if col_name not in classified_cols]

    return continuous_dqr, categorical_dqr, error_cols


def get_mode_and_second_mode(s: pd.Series):
    """
    Returns mode, second mode if available, and counts/percentage for both

    Percentages are relative to the full length of the series (``s.size``), so
    null entries count towards the denominator. Values that never occur are
    ignored, and any rank that is not available (no second distinct value, or
    no non-null value at all) is reported as NaN.

    :param s: pd.Series to analyze
    :return: pd.DataFrame containing mode and 2nd mode information, as a single
        column named after ``s`` with rows 'mode', 'mode count', 'mode pct',
        '2nd mode', '2nd mode count' and '2nd mode pct'
    :rtype: pd.DataFrame
    """
    counts = s.value_counts()
    # value_counts() on a categorical series also lists unused categories with
    # a count of zero; those must not be reported as a (second) mode.
    counts = counts[counts > 0]

    def describe_rank(rank):
        # [value, count, share of all rows] for the rank-th most frequent value
        if rank >= counts.size:
            return [np.nan, np.nan, np.nan]
        value_count = counts.iloc[rank]
        return [counts.index[rank], value_count, value_count / s.size]

    mode_df = pd.DataFrame(
        {
            s.name: describe_rank(0) + describe_rank(1)
        },
        index=['mode', 'mode count', 'mode pct', '2nd mode', '2nd mode count', '2nd mode pct']
    )

    return mode_df


def set_columns_to_category_dtype(df: pd.DataFrame, cols=None):
    """
    Convert columns of a dataframe to the 'category' dtype, in place

    :param df: dataframe to modify
    :param cols: names of the columns to convert; when None, every object-dtype
        column whose name differs from the index name is converted
    :return: None
    """
    targets = cols
    if targets is None:
        # Nothing requested explicitly: pick up the object-dtype columns
        is_object = df.dtypes == 'object'
        targets = [name for name in df.dtypes[is_object].index if name != df.index.name]

    for name in targets:
        df[name] = df[name].astype('category')

In [199]:

# Display the first rows of the cross-sectional dataframe again
cs_df.head()

Unnamed: 0_level_0,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


In [295]:
# Build the data quality reports for the cross-sectional data; the third
# value lists the columns that landed in neither report
continuous, categorical, error = get_data_quality_report(cs_df)
# Display the report for the continuous features
continuous

Unnamed: 0,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
count,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0,436.0
mean,51.3578,3.17872,2.49074,27.0638,0.285106,1481.92,0.79167,1.19889,20.55
std,25.2699,1.31151,1.12059,3.69687,0.383405,158.741,0.059937,0.128682,23.8625
min,18.0,1.0,1.0,14.0,0.0,1123.0,0.644,0.881,1.0
25%,23.0,2.0,2.0,26.0,0.0,1367.75,0.74275,1.11175,2.75
50%,54.0,3.0,2.0,29.0,0.0,1475.5,0.809,1.19,11.0
75%,74.0,4.0,3.0,30.0,0.5,1579.25,0.842,1.28425,30.75
max,96.0,5.0,5.0,30.0,2.0,1992.0,0.893,1.563,89.0
nulls,0.0,201.0,220.0,201.0,201.0,0.0,0.0,0.0,416.0
nulls pct,0.0,0.46,0.5,0.46,0.46,0.0,0.0,0.0,0.95


In [287]:
def two_decimal_precision(x):
    """Format a single number as a string with two decimal places."""
    return "%.2F" % x
# NOTE: DataFrame.apply hands each column to the function as a whole Series
# rather than cell by cell, so the %-format cannot convert it to a float and
# raises the TypeError recorded in this cell's output.
categorical.loc[['mode pct','nulls pct']].apply(two_decimal_precision)

TypeError: ("cannot convert the series to <class 'float'>", 'occurred at index Hand')