# CSC 510 Portfolio Project

## Loading and Filtering Initial Data

In [40]:
import pandas as pd
import numpy as np
import os
import shutil
import imageio as iio
import cv2 as cv
import json
import math

def _from_cwd(relative_path):
    """Join ``relative_path`` onto the current working directory."""
    return os.path.join(os.getcwd(), relative_path)


# Show which directory the data paths below resolve against.
print(os.getcwd())

# BRFSS: source file (read with pd.read_sas) and the filtered CSV derived from it.
brfss_data_filepath = _from_cwd("Data/LLCP2020.XPT")
brfss_filtered_filepath = _from_cwd("Data/brfss_filtered_data.csv")

# OASIS: original cross-sectional CSV and its filtered counterpart.
oasis_csv_filepath = _from_cwd("Data/oasis_cross-sectional.csv")
oasis_csv_filtered_filepath = _from_cwd("Data/oasis_filtered.csv")

def round_dec(num, places = 2):
    """Round ``num`` to ``places`` decimal places (default 2).

    Uses the built-in ``round`` instead of scaling and truncating. Truncation
    drops digits rather than rounding them, and the scaled product can land
    just below the intended integer (``0.29 * 100`` is ``28.999999999999996``),
    which turns 0.29 into 0.28.

    Args:
        num: Real number to round; converted with ``float()`` first.
        places: Number of digits to keep after the decimal point.

    Returns:
        The rounded value as a ``float``.

    Raises:
        ValueError: If ``num`` is NaN.
        OverflowError: If ``num`` is infinite.
    """
    value = float(num)

    # Non-finite input is rejected explicitly so that callers which treat
    # these errors as "no value available" behave the same for NaN and inf.
    if math.isnan(value):
        raise ValueError("cannot round NaN")
    if math.isinf(value):
        raise OverflowError("cannot round infinity")

    return round(value, places)


/Users/karlestes/Documents/Grad School/CSC 510 - Foundations of Artificial Intelligence/Portfolio Project


### BRFSS

In [None]:
# RUN TO GET CLEAN BRFSS DATA
# Parses the full source file with pd.read_sas, then keeps only the columns
# used by the analysis. Reading the file may take a while.

# Columns retained for analysis, in the order they appear in the output frame.
brfss_columns = [
    '_STATE', 'DISPCODE', 'SEXVAR', 'GENHLTH', 'PHYSHLTH',
    'MENTHLTH', 'POORHLTH', 'EXERANY2', 'SLEPTIM1', 'CVDINFR4',
    'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCSCNCR', 'CHCOCNCR',
    'CHCCOPD2', 'HAVARTH4', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4',
    'PREGNANT', 'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK',
    'DIFFDRES', 'DIFFALON', 'SMOKE100', 'USENOW3', 'FALL12MN',
    'HIVRISK5', 'TOLDHEPC', 'HAVEHEPB', 'CIMEMLOS', 'CDASSIST',
    'CDSOCIAL', 'ACEDEPRS', 'ACEHURT1', 'ACETOUCH', 'ACETTHEM',
    'ACEHVSEX', '_RACE', '_AGE_G', '_BMI5CAT', '_RFDRHV7',
]

print("Reading SAS Data")
df = pd.read_sas(brfss_data_filepath)
print("SAS Data Loaded")

print("Removing Unecessary Columns Data")
df_filtered = df[brfss_columns]
print("Slices of Data Gathered")
print("data shape: ", df_filtered.shape)

In [None]:
# Temp copy of original filtered data: an independent in-memory snapshot of
# df_filtered, stored in filtered_copy so the frame can be restored later.
filtered_copy = df_filtered.copy()

In [None]:
# Load temp copy: reset df_filtered to a fresh copy of the filtered_copy
# snapshot, discarding any changes made to df_filtered since it was taken.
df_filtered = filtered_copy.copy()

In [None]:
# Data Filtering
# Cleans df_filtered in four steps: keep DISPCODE == 1100 rows that have the
# required columns, fill missing answers with a placeholder code, collapse
# numeric answers into ordinal bins, and settle the PREGNANT column.

# Columns that must be present; rows missing any of them are dropped.
dropna_categories = ['SEXVAR','_AGE_G','DEAF','BLIND','USENOW3','CIMEMLOS','_RACE','_BMI5CAT']
# (column, code) pairs: missing values in `column` are replaced with `code`,
# the label used for don't know/not sure.
dontknow_categories = [('PHYSHLTH', 77),('MENTHLTH',77),('POORHLTH',77),('EXERANY2',7),('CVDINFR4',7),('CVDCRHD4',7),('CVDSTRK3',7),('ASTHMA3',7),('CHCSCNCR',7),
                       ('CHCOCNCR',7),('CHCCOPD2',7),('HAVARTH4',7),('ADDEPEV3',7),('CHCKDNY2',7),('DIABETE4',7),('DECIDE',7),('DIFFWALK',7),('DIFFDRES',7),
                       ('DIFFALON',7),('SMOKE100',7),('HIVRISK5',7),('TOLDHEPC',7),('HAVEHEPB',7),('ACEDEPRS',7),('ACEHURT1',7),('ACETOUCH',7),('ACETTHEM',7),
                       ('ACEHVSEX',7),('_RFDRHV7',9),('SLEPTIM1',77),('FALL12MN',77),('GENHLTH',7)]
# Numeric columns that are recoded into ordinal bins below.
categorical_categories = ['PHYSHLTH','MENTHLTH','POORHLTH','SLEPTIM1','FALL12MN']

# All selected columns except DECIDE, CIMEMLOS, CDASSIST and CDSOCIAL; rows
# still missing any of these after the fills are dropped at the end.
non_cd_categories = ['_STATE','DISPCODE','SEXVAR','GENHLTH','PHYSHLTH','MENTHLTH','POORHLTH','EXERANY2','SLEPTIM1','CVDINFR4','CVDCRHD4',
                  'CVDSTRK3','ASTHMA3','CHCSCNCR','CHCOCNCR','CHCCOPD2','HAVARTH4','ADDEPEV3','CHCKDNY2','DIABETE4','PREGNANT',
                  'DEAF','BLIND','DIFFWALK','DIFFDRES','DIFFALON','SMOKE100','USENOW3',
                  'FALL12MN','HIVRISK5','TOLDHEPC','HAVEHEPB',
                  'ACEDEPRS','ACEHURT1','ACETOUCH','ACETTHEM','ACEHVSEX','_RACE','_AGE_G','_BMI5CAT','_RFDRHV7']

# Bin boundaries per column. Each (low, high) pair selects low <= value < high,
# except the last pair of a list, which also includes its upper bound.
# Matching rows are recoded to the 1-based position of their pair; values
# outside every pair are left unchanged.
health_day_ranges = [(1, 7), (7, 14), (14, 30)]
bin_ranges = {
    'PHYSHLTH': health_day_ranges,
    'MENTHLTH': health_day_ranges,
    'POORHLTH': health_day_ranges,
    'SLEPTIM1': [(1, 6), (6, 12), (12, 18), (18, 24)],
    'FALL12MN': [(1, 10), (10, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 76)],
}

# Remove rows missing necessary info
print("Initial data shape: ", df_filtered.shape)
print("Removing rows missing necessary datapoints")
df_filtered = df_filtered[df_filtered.DISPCODE == 1100] # Ensure completed interviews only
# .copy() gives an independent frame, so the in-place assignments below never
# target a temporary slice (avoids pandas' chained-assignment warning).
df_filtered = df_filtered.dropna(axis='rows',subset=dropna_categories).copy()
print("Data shape after removing responses with necessary missing data: ", df_filtered.shape)

# Replace blank data with Don't Know/Not Sure
print("Replacing missing data points with corresponding label for don't know/not sure")

for column,value in dontknow_categories:
    df_filtered[column] = df_filtered[column].fillna(value)

# Convert categorical data
for column in categorical_categories:
    values = df_filtered[column]
    ranges = bin_ranges.get(column, [])

    # Build every mask from the untouched values before writing anything, so a
    # freshly written bin number can never be picked up by a later range.
    masks = []
    for position, (low, high) in enumerate(ranges):
        is_last = position == len(ranges) - 1
        below_high = (values <= high) if is_last else (values < high)
        masks.append((values >= low) & below_high)

    for i,mask in enumerate(masks):
        df_filtered.loc[mask, column] = (i+1)

# Handling Pregnant Variable
# The masks are built from df_filtered itself, so this cell does not need the
# unfiltered frame in memory and the mask index always matches the rows.
df_filtered.loc[df_filtered.SEXVAR == 1, 'PREGNANT'] = 2 # All 'M' listed as not pregnant
# List anyone 49 and older as not pregnant
# NOTE(review): confirm which age band _AGE_G code 4 starts at.
df_filtered.loc[((df_filtered._AGE_G >= 4) & (df_filtered.PREGNANT != 1)), 'PREGNANT'] = 2
df_filtered['PREGNANT'] = df_filtered['PREGNANT'].fillna(7) # Fill remaining missing values with don't know/not sure

df_filtered = df_filtered.dropna(subset=non_cd_categories)

print("Data shape after removing responses with any remaining missing data: ", df_filtered.shape)


In [None]:
# Save filtered data to csv at brfss_filtered_filepath.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column — confirm that is intended.
print("Saving Filtered Data")
df_filtered.to_csv(brfss_filtered_filepath)
print("Filtered Data Written to CSV")

In [41]:
# RUN TO LOAD FILTERED DATA
# Reads the CSV at brfss_filtered_filepath into df_filtered, replacing any
# frame currently bound to that name.
print("Loading filtered data from csv")
df_filtered = pd.read_csv(brfss_filtered_filepath)
print("Filtered data loaded")

Loading filtered data from csv
Filtered data loaded


### OASIS

In [None]:
# Load Original Oasis CSV and get clean filtered data:
# read oasis_csv_filepath, drop every row whose 'CDR' value is missing, and
# write the result to oasis_csv_filtered_filepath.

print("Loading original CSV")
oasis_df = pd.read_csv(oasis_csv_filepath)
print("Oasis CSV Shape: ", oasis_df.shape)
print("Filtering non-CDR labeled subjects")
oasis_df_filtered = oasis_df.dropna(subset=['CDR'])
print("Oasis CSV Filtered Shape: ", oasis_df_filtered.shape)
# Save filtered data
# NOTE(review): the row index is written as an extra column (no index=False).

print("Saving filtered Oasis CSV")
oasis_df_filtered.to_csv(oasis_csv_filtered_filepath)

In [None]:
# Load Filtered Oasis CSV from oasis_csv_filtered_filepath into
# oasis_df_filtered.
oasis_df_filtered = pd.read_csv(oasis_csv_filtered_filepath)

In [None]:
# Copy one coronal, transverse and sagittal image per filtered subject into
# the matching planar subfolder.

print("Copying all MRI images of CDR patients to corresponding planar subfolders")

coronal_path = os.path.join(os.getcwd(), "Data/MRI/coronal/")
transverse_path = os.path.join(os.getcwd(), "Data/MRI/transverse/")
sagittal_path = os.path.join(os.getcwd(), "Data/MRI/sagittal")

n3_substr = "_mpr_n3_anon_111_t88_gfc_"
n4_substr = "_mpr_n4_anon_111_t88_gfc_"
n5_substr = "_mpr_n5_anon_111_t88_gfc_"
n6_substr = "_mpr_n6_anon_111_t88_gfc_"

# File-name variants, tried in this order until one copies successfully.
variant_substrs = [n3_substr, n4_substr, n5_substr, n6_substr]

# (plane name used in messages, file-name suffix, destination folder)
planes = [
    ("coronal", "cor_110.gif", coronal_path),
    ("transverse", "tra_90.gif", transverse_path),
    ("sagittal", "sag_95.gif", sagittal_path),
]

# Create the destinations up front. shutil.copy treats a non-existent `dst`
# without a trailing slash as a file name, so a missing folder would otherwise
# become a single file that each copy overwrites.
for _, _, destination in planes:
    os.makedirs(destination, exist_ok=True)

# Set membership keeps the per-folder check cheap.
subject_ids = set(oasis_df_filtered['ID'].values)

for disc_num in range(1, 13): # Look through all twelve disc folders (disc1..disc12)
    disc_path = os.path.join(os.getcwd(), "Data/MRI/disc" + str(disc_num))
    if not os.path.isdir(disc_path):
        # Report and continue so one absent disc does not abort the whole run.
        print(f"ERROR: Could not find disc folder {disc_path}")
        continue

    for folder in os.listdir(disc_path): # Check all subject folder names
        if folder not in subject_ids: # Skip folders not in the filtered subject list
            continue

        subject_data_path = os.path.join(disc_path, folder, "PROCESSED/MPRAGE/T88_111")

        for plane, suffix, destination in planes:
            for substr in variant_substrs:
                candidate = subject_data_path + "/" + folder + substr + suffix
                try:
                    shutil.copy(src=candidate, dst=destination)
                except OSError:
                    # This variant is missing or unreadable; try the next one.
                    continue
                break
            else:
                # No variant could be copied for this plane.
                print(f"ERROR: Could not find {plane} image for subject {folder}")

print("Images copied to planar subfolders")

In [39]:
# List all image sizes in each folder: collects the distinct frame shapes
# found in each planar folder and prints one set per plane.
coronal_path = os.path.join(os.getcwd(), "Data/MRI/coronal/")
transverse_path = os.path.join(os.getcwd(), "Data/MRI/transverse/")
sagittal_path = os.path.join(os.getcwd(), "Data/MRI/sagittal")

paths = [coronal_path,transverse_path,sagittal_path]
sizes = []

for path in paths:
    shapes = set()
    for image in os.listdir(path):
        try:
            # The context manager closes the reader (and its file handle) once
            # the frames have been inspected.
            with iio.get_reader(os.path.join(path,image)) as im:
                for frame in im:
                    shapes.add(frame.shape)
        except ValueError:
            # Entries that cannot be read as images are reported and skipped.
            print(f"Value Error trying to read image {image}")

    sizes.append(shapes)

print("Coronal image sizes: ", sizes[0])
print("Transverse image sizes: ", sizes[1])
print("Sagittal image sizes: ", sizes[2])
        

Value Error trying to read image .DS_Store
Value Error trying to read image .DS_Store
Value Error trying to read image .DS_Store
Coronal image sizes:  {(176, 176)}
Transverse image sizes:  {(208, 176)}
Sagittal image sizes:  {(176, 208)}


In [None]:
# Resize all coronal images to 176 by 176 and convert all images from .gif to jpg
# Removes each .gif file once its .jpg has been written
coronal_path = os.path.join(os.getcwd(), "Data/MRI/coronal/")
transverse_path = os.path.join(os.getcwd(), "Data/MRI/transverse/")
sagittal_path = os.path.join(os.getcwd(), "Data/MRI/sagittal")

paths = [coronal_path, transverse_path, sagittal_path]

for path in paths:
    for image in os.listdir(path):
        # Only .gif sources are converted. Without this guard a second run
        # would read an already converted .jpg, write it back under the same
        # name and then delete it. Other entries are skipped silently.
        if not image.lower().endswith(".gif"):
            continue

        source = os.path.join(path, image)
        target = os.path.join(path, image[:-3] + "jpg")
        converted = False

        try:
            with iio.get_reader(source) as im:
                for frame in im:
                    if path == coronal_path:
                        frame = cv.resize(frame, (176,176), interpolation=cv.INTER_AREA)
                    # If a file holds several frames, each write replaces the
                    # previous one, so the last frame is what remains.
                    converted = bool(cv.imwrite(target, frame))
        except ValueError:
            print(f"Value Error trying to read image {image}")
            continue

        # Delete the source only after the reader is closed and the .jpg was
        # actually written, so a failed write never loses the original.
        if converted:
            os.remove(source)

## BRFSS Precomputations

In [45]:
# RUN TO LOAD FILTERED DATA
# Reads the CSV at brfss_filtered_filepath into brfss_filtered, the frame the
# precomputation cells below work on.
print("Loading filtered data from csv")
brfss_filtered = pd.read_csv(brfss_filtered_filepath)
print("Filtered data loaded")

Loading filtered data from csv
Filtered data loaded


In [46]:
# Data Column for CD Decline
# Adds an integer 'CD' column to brfss_filtered:
#   1 (reported) - DECIDE == 1, CIMEMLOS == 1, and CDASSIST and CDSOCIAL each in 1-4
#   2 (possible) - any of DECIDE, CIMEMLOS, CDASSIST, CDSOCIAL is 7 or 9
#   0 (none)     - everything else
# "Possible" takes precedence over the other two labels.
answered_codes = [1, 2, 3, 4]
unsure_codes = [7, 9]

reported_cd = (
    (brfss_filtered.DECIDE == 1)
    & brfss_filtered.CDASSIST.isin(answered_codes)
    & brfss_filtered.CDSOCIAL.isin(answered_codes)
    & (brfss_filtered.CIMEMLOS == 1)
)
possible_cd = brfss_filtered[['DECIDE', 'CIMEMLOS', 'CDASSIST', 'CDSOCIAL']].isin(unsure_codes).any(axis=1)

# np.select takes the first matching condition, so listing possible_cd first
# gives it priority. Building the integer column in one step avoids writing
# ints into a boolean column through repeated .loc assignments.
brfss_filtered['CD'] = np.select([possible_cd, reported_cd], [2, 1], default=0)

yes_count = np.count_nonzero(brfss_filtered['CD'].values == 1)
maybe_count = np.count_nonzero(brfss_filtered['CD'].values == 2)
no_count = np.count_nonzero(brfss_filtered['CD'].values == 0)

print(f"Reported CD Count: {yes_count}\nPosisble CD Count: {maybe_count}\nNo CD Count: {no_count}")

Reported CD Count: 1246
Posisble CD Count: 847
No CD Count: 59286


In [47]:
# Load Data/conversions.json into the `conversions` dict.
# The with-block closes the file handle as soon as the JSON has been parsed.
with open(os.path.join(os.getcwd(), "Data/conversions.json")) as conversions_file:
    conversions = json.load(conversions_file)

print(conversions.keys())


dict_keys(['_STATE', 'DISPCODE', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'EXERANY2', 'SLEPTIM1', 'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD2', 'HAVARTH4', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'PREGNANT', 'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'USENOW3', 'FALL12MN', 'HADHYST2', 'HIVRISK5', 'TOLDHEPC', 'HAVEHEPB', 'CIMEMLOS', 'CDASSIST', 'CDSOCIAL', 'ACEDEPRS', 'ACEHURT1', 'ACETOUCH', 'ACETTHEM', 'ACEHVSEX', '_RACE', '_AGE_G', '_BMI5CAT', '_RFDRHV7'])


In [48]:
# Precompute Probability Tables
# For every column in non_cd_categories, cross-tabulate its values against
# 'CD', append probability rows/columns, and save the table as
# Data/tables/<column>.csv.
non_cd_categories = ['_STATE','DISPCODE','SEXVAR','GENHLTH','PHYSHLTH','MENTHLTH','POORHLTH','EXERANY2','SLEPTIM1','CVDINFR4','CVDCRHD4',
                  'CVDSTRK3','ASTHMA3','CHCSCNCR','CHCOCNCR','CHCCOPD2','HAVARTH4','ADDEPEV3','CHCKDNY2','DIABETE4','PREGNANT',
                  'DEAF','BLIND','DIFFWALK','DIFFDRES','DIFFALON','SMOKE100','USENOW3',
                  'FALL12MN','HIVRISK5','TOLDHEPC','HAVEHEPB',
                  'ACEDEPRS','ACEHURT1','ACETOUCH','ACETTHEM','ACEHVSEX','_RACE','_AGE_G','_BMI5CAT','_RFDRHV7']

# Values the 'CD' column takes; they become the crosstab's column labels.
cd_classes = [0, 1, 2]

# (new column name, existing table column whose 'Total' is the denominator)
probability_columns = [('P_D', 'Total'), ('P_D_ZERO', 0), ('P_D_ONE', 1), ('P_D_TWO', 2)]

tables_dir = os.path.join(os.getcwd(), "Data/tables")
os.makedirs(tables_dir, exist_ok=True)  # make sure the output folder exists

for column in non_cd_categories:
    temp_df = brfss_filtered[[column, 'CD']].copy()

    print(f"\nBeginning likelihood table for {column}")

    # Apply laplace: for every conversions key whose numeric value occurs in
    # the data, add one pseudo-observation per CD class so no count is zero.
    # NOTE(review): keys that never occur in the data get no pseudo-rows and
    # so no table row — confirm that is intended.
    print("Applying Laplace")
    observed_values = set(temp_df[column].unique())
    pseudo_rows = []
    for key in conversions[column].keys():
        try:
            n = float(key)
        except ValueError:
            print(f"Could not proccess key: {key}")
            continue
        if n in observed_values:
            pseudo_rows.extend({column: n, 'CD': i} for i in cd_classes)

    # One concat instead of one DataFrame.append per pseudo-row: append copies
    # the whole frame each time and no longer exists in pandas 2.
    if pseudo_rows:
        temp_df = pd.concat([temp_df, pd.DataFrame(pseudo_rows)], ignore_index=True)

    print("Calling pd.crosstab()")
    table = pd.crosstab(temp_df[column], temp_df['CD'], margins=True, margins_name='Total')

    # Add row for P(hypothesis): each CD class's share of all observations.
    # The 'Total' cell of that row is left blank.
    print("Adding probability rows and columns")
    grand_total = table.at['Total', 'Total']
    table.loc['P_H'] = [round_dec(table.at['Total', cd] / grand_total, 6) for cd in cd_classes] + ['']

    # Add probability columns: every row's count divided by the 'Total' of the
    # matching column. All values are computed before any column is attached,
    # and each list receives exactly one entry per row (a blank when the cell
    # cannot be divided, e.g. the empty 'Total' cell of the 'P_H' row).
    # NOTE(review): for the 'P_H' row the P_D_ZERO/ONE/TWO entries are the P_H
    # value divided by a count — confirm whether they should be blank instead.
    denominators = {label: table.at['Total', label] for _, label in probability_columns}
    computed = {}
    for name, label in probability_columns:
        values = []
        for _, row in table.iterrows():
            try:
                values.append(round_dec(row[label] / denominators[label], 6))
            except (TypeError, ValueError, ArithmeticError):
                values.append('')
        computed[name] = values

    for name, values in computed.items():
        table[name] = values

    print(f"Writing table {column} to csv")
    table.to_csv(os.path.join(tables_dir, (column + ".csv")))
    print("Table saved")



Beginning likelihood table for _STATE
Applying Laplace
Could not proccess key: ?
Calling pd.crosstab()
Adding probability rows and columns
Writing table _STATE to csv
Table saved

Beginning likelihood table for DISPCODE
Applying Laplace
Calling pd.crosstab()
Adding probability rows and columns
Writing table DISPCODE to csv
Table saved

Beginning likelihood table for SEXVAR
Applying Laplace
Could not proccess key: ?
Calling pd.crosstab()
Adding probability rows and columns
Writing table SEXVAR to csv
Table saved

Beginning likelihood table for GENHLTH
Applying Laplace
Could not proccess key: ?
Calling pd.crosstab()
Adding probability rows and columns
Writing table GENHLTH to csv
Table saved

Beginning likelihood table for PHYSHLTH
Applying Laplace
Could not proccess key: ?
Calling pd.crosstab()
Adding probability rows and columns
Writing table PHYSHLTH to csv
Table saved

Beginning likelihood table for MENTHLTH
Applying Laplace
Could not proccess key: ?
Calling pd.crosstab()
Adding pro