# LAC DHS Data Cleaning

In [1]:
# Name of the folder, relative to the user's home directory, that holds the cloned repos.
repo_dir = "Repos"   # Set this to be where your GitHub repos are located.
# Load the IPython autoreload extension and select its mode 2.
# (Comments are kept off the magic lines: a line magic receives the rest of its line as arguments.)
%load_ext autoreload
%autoreload 2

# Put <home>/<repo_dir>/eye-ai-ml at the front of the module search path so
# Python can find the modules for the model.
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [2]:
# Prerequisites

import json
import os
import re
from eye_ai.eye_ai import EyeAI
import pandas as pd
import numpy as np
from pathlib import Path, PurePath,PosixPath
import logging


# Configure the root logger to emit INFO and above as "timestamp - level - message".
# force=True removes handlers already attached to the root logger, so re-running
# this cell in a live kernel still applies the configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [3]:
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'


# Authenticate against the catalog host: start the Globus native-app login flow
# (no local server, no browser) only when there is no existing session for it.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

2024-06-26 00:49:20,560 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-06-26 00:49:20,561 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


In [4]:
# Variables to configure the rest of the notebook.

cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets; passed to EyeAI as cache_dir.
working_dir = '/data'    # Directory in which to place output files for later upload; passed to EyeAI as working_dir.

configuration_rid="2-C6B8" # RID of the configuration record handed to EA.execution_init.

In [5]:
# Create the EyeAI client for the configured host and catalog, using the cache
# and working directories chosen in the configuration cell.
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

2024-06-26 00:49:20,601 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-06-26 00:49:20,602 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [6]:
# @title Initiate an Execution
# Initialise the execution from the configuration record. Later cells read
# `assets_paths` and `execution_rid` from the returned object.
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
# Bare last expression so the notebook displays the resolved configuration.
configuration_records.model_dump()

2024-06-26 00:49:21,052 - INFO - File [/data/alyciaqiu/EyeAI_working/Execution_Metadata/Execution_Config-lac_data_cleaning.json] transfer successful. 0.65 KB transferred. Elapsed time: 0:00:00.000072.
2024-06-26 00:49:21,053 - INFO - Verifying SHA256 checksum for downloaded file [/data/alyciaqiu/EyeAI_working/Execution_Metadata/Execution_Config-lac_data_cleaning.json]
2024-06-26 00:49:21,079 - INFO - Configuration validation successful!
2024-06-26 00:49:26,745 - INFO - File [/data/alyciaqiu/EyeAI_working/Execution_Assets/LACDHS_All_Data_to_2024-03-03.csv] transfer successful. 106.88 MB transferred at 21.98 MB/second. Elapsed time: 0:00:04.862632.


{'caching_dir': PosixPath('/data'),
 'working_dir': PosixPath('/data/alyciaqiu/EyeAI_working'),
 'vocabs': {'Workflow_Type': [{'name': 'lac_data_cleaning', 'rid': '2-C6BC'}],
  'Execution_Asset_Type': [{'name': 'cleaned_data', 'rid': '2-C6BE'}]},
 'execution_rid': '2-C6C8',
 'workflow_rid': '2-C6BG',
 'bag_paths': [],
 'assets_paths': [PosixPath('/data/alyciaqiu/EyeAI_working/Execution_Assets/LACDHS_All_Data_to_2024-03-03.csv')],
 'configuration_path': PosixPath('/data/alyciaqiu/EyeAI_working/Execution_Metadata/Execution_Config-lac_data_cleaning.json')}

In [7]:
# Helper Functions

# Function to convert English numbers to Arabic numerals
def convert_word_to_num(text):
    """
    Replace the English number words "one" through "ten" with Arabic numerals.

    Matching is case-insensitive and limited to whole words, so "One" becomes
    "1" while a word that merely contains a number word (e.g. "someone") is
    left untouched.

    Args:
    text (str): Text that may contain English number words.

    Returns:
    str: The text with every standalone number word replaced by its numeral.
    """
    number_words = ("one", "two", "three", "four", "five",
                    "six", "seven", "eight", "nine", "ten")
    # A word's 1-based position in the tuple is the numeral it stands for.
    for numeral, word in enumerate(number_words, start=1):
        whole_word = re.compile(rf"\b{word}\b", re.IGNORECASE)
        text = whole_word.sub(str(numeral), text)
    return text

# Define a function to replace the time frame values
def replace_time_frame(value):
    """
    Converts time descriptions into ISO 8601 duration format.

    Number words are converted to digits first ("two weeks" -> "2 weeks").
    Recognised forms, matched case-insensitively at the start of the string:
      "<n> Year"  / "<n> Years"   -> "P<n>Y"
      "<n> Month" / "<n> Months"  -> "P<n>M"
      "<n> Week"  / "<n> Weeks"   -> "P<n>W"
      "0" or "Same day..."        -> "P0D"
      "1" or "Next Day..."        -> "P1D"
    Any text outside the matched "<n> <unit>" part is kept as-is, and values
    that match none of the forms are returned unchanged.

    Args:
    value (str): String describing a time frame. Non-string values (e.g. None
        or float NaN used for missing cells) are returned unchanged.

    Returns:
    str: ISO 8601 formatted duration string, or the input value when it is
        not a recognised time frame.
    """
    # Missing cells arrive as None / float NaN; the regex functions below only
    # accept strings, so pass anything else through untouched instead of raising.
    if not isinstance(value, str):
        return value

    value = convert_word_to_num(value)

    # (unit word, ISO 8601 designator). The optional trailing "s" is accepted
    # for every unit so that plurals do not leave a stray "s" behind
    # (e.g. "2 Years" must become "P2Y", not "P2Ys").
    units = (("Year", "Y"), ("Month", "M"), ("Week", "W"))
    for unit, designator in units:
        pattern = rf"(\d+) {unit}s?"
        if re.match(pattern, value, re.IGNORECASE):
            return re.sub(pattern, rf"P\g<1>{designator}", value, flags=re.IGNORECASE)

    if value == "0" or re.match(r"Same day", value, re.IGNORECASE):
        return "P0D"
    if value == "1" or re.match(r"Next Day", value, re.IGNORECASE):
        return "P1D"
    return value

def to_01(value):
    """
    Converts true/false and yes/no descriptions to binary (0, 1) representations.

    Comparison is case-insensitive and works on the string form of the input,
    so booleans (True/False) are handled as well as "Yes"/"No"/"True"/"False".

    Args:
    value (str or bool): Input representing a binary condition.

    Returns:
    int, np.nan or str: 1 for true/yes, 0 for false/no, np.nan for "unknown"
        and for missing inputs (None, NaN, pd.NA). Any other input is returned
        as its lower-cased string form so the caller can convert it further.
    """
    # Detect real missing values BEFORE stringifying: str(None) is "None", so a
    # comparison against None after the conversion can never succeed.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return np.nan

    normalized = str(value).lower()
    if normalized in ("true", "yes"):
        return 1
    if normalized in ("false", "no"):
        return 0
    if normalized == "unknown":
        return np.nan
    return normalized

def to_FM(value):
    """
    Abbreviate a gender label to a one-letter code.

    Only the exact strings "Female" and "Male" are translated; every other
    value is handed back untouched.

    Args:
    value (str): Gender description ("Female" or "Male").

    Returns:
    str: "F" for "Female", "M" for "Male", otherwise the input value.
    """
    if value == "Female":
        return "F"
    return "M" if value == "Male" else value

def ethnicity(value):
    """
    Substitute a readable default for the "null" ethnicity placeholder.

    Args:
    value (str): Ethnicity description.

    Returns:
    str: "ethnicity not specified" when the value is exactly "null",
        otherwise the value unchanged.
    """
    return "ethnicity not specified" if value == "null" else value

In [8]:
# Main Function 1: Clean Data Types and Basic Processing
def clean_data_types(df):
    """
    Normalise the dtypes of the known columns, modifying `df` in place.

    - 'hba1c' is cast to float.
    - The encounter/review date columns are parsed with pd.to_datetime.
    - Text columns are stringified, stripped, have '\\r\\n' replaced by a
      space, empty strings turned into NaN and the string 'nan' turned into
      'null'.
    - Flag/ID columns are passed through to_01, cast to float and then to the
      nullable 'Int64' dtype.

    Args:
    df (pd.DataFrame): The DataFrame to clean.

    Returns:
    pd.DataFrame: The same DataFrame object, with converted columns.
    """
    # Float columns
    float_cols = ['hba1c']
    df[float_cols] = df[float_cols].astype(float)

    # Datetime columns
    for date_col in ('date_of_encounter', 'reviewed_date'):
        df[date_col] = pd.to_datetime(df[date_col])

    # Text columns
    text_cols = (
        'glaucoma_hx', 'provider', 'dr_level', 'assessment_and_recommendation',
        'additional_comments', 'return_time_frame', 'referral_status_time_frame',
        'gender', 'ethnicity', 'hypertension', 'image_quality', 'site_mrn',
        'consultant',
    )
    for text_col in text_cols:
        stripped = df[text_col].astype(str).apply(str.strip)
        single_line = stripped.str.replace('\r\n', ' ')
        # Order matters: '' becomes a real NaN first, then only the literal
        # string 'nan' (produced by astype(str) on missing cells) becomes 'null'.
        df[text_col] = single_line.replace('', np.nan).replace('nan', 'null')

    # Nullable-integer columns: go through float first so NaN survives, then
    # switch to Int64, which can hold missing values.
    nullable_int_cols = (
        'insulin_dependent', 'pregnant', 'cataract', 'maculopathy', 'other',
        'consult_id', 'visual_acuity_right', 'visual_acuity_left',
    )
    for int_col in nullable_int_cols:
        as_float = df[int_col].apply(to_01).astype(float)
        df[int_col] = as_float.astype('Int64')

    return df

# Main Function 2: Process Special Fields
def process_special_fields(df):
    """
    Apply the per-value helper functions to the time-frame, gender and
    ethnicity columns, modifying `df` in place.

    Args:
    df (pd.DataFrame): DataFrame with data to process.

    Returns:
    pd.DataFrame: The same DataFrame object, with the processed columns.
    """
    # (column, converter) pairs, applied in this order.
    field_converters = (
        ('return_time_frame', replace_time_frame),
        ('referral_status_time_frame', replace_time_frame),
        ('gender', to_FM),
        ('ethnicity', ethnicity),
    )
    for field, converter in field_converters:
        df[field] = df[field].apply(converter)
    return df

# Main Function 3: Process Data
def process_data(filepath, output_path):
    """
    Main function to process the CSV file. Reads, applies transformations, and saves the processed data.

    Args:
    filepath (str or Path): Path to the raw CSV data file (read as latin-1).
    output_path (str or Path): Destination of the cleaned CSV. When this is a
        filesystem path, missing parent directories are created first.

    Returns:
    None: Saves the processed data to a new CSV file.
    """
    df = pd.read_csv(filepath, encoding="latin-1")
    df = clean_data_types(df)
    df = process_special_fields(df)

    # DataFrame.to_csv does not create directories, so make sure the target
    # folder exists. Only do this for real paths; to_csv also accepts
    # file-like objects, which have no parent directory.
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_path, index=False)

In [10]:
# Define input and output path
# Cleaned CSV goes into the 'cleaned_data' folder under the execution assets directory.
output_path = EA.execution_assets_path/'cleaned_data/LACDHS_All_Data_to_2024-03-03_cleaned.csv'
# Raw CSV: the first asset path reported by execution_init.
input_path = configuration_records.assets_paths[0]

In [12]:
# Clean & Save
# Read the raw CSV, run both cleaning stages and write the result to output_path.
process_data(input_path,output_path)

In [13]:
# Upload the outputs of this execution, identified by its execution RID.
# NOTE(review): the meaning of the positional `False` is not visible in this
# notebook - confirm against EyeAI.execution_upload.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)

2024-06-26 00:50:16,172 - INFO - Initializing uploader: GenericUploader v1.7.1 [Python 3.10.13, Linux-5.10.210-201.852.amzn2.x86_64-x86_64-with-glibc2.26]
2024-06-26 00:50:16,173 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-06-26 00:50:16,173 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2024-06-26 00:50:16,211 - INFO - Checking for updated configuration...
2024-06-26 00:50:16,321 - INFO - Updated configuration found.
2024-06-26 00:50:16,323 - INFO - Scanning files in directory [/data/alyciaqiu/EyeAI_working/Execution_Assets/cleaned_data]...
2024-06-26 00:50:16,326 - INFO - Including file: [/data/alyciaqiu/EyeAI_working/Execution_Assets/cleaned_data/LACDHS_All_Data_to_2024-03-03_cleaned.csv].
2024-06-26 00:50:16,327 - INFO - Processing: [/data/alyciaqiu/EyeAI_working/Execution_As